class ClusteringTraining(Training):
    """Alternating clustering / classification training.

    Each epoch: (1) embed the whole dataset with ``self.net``, (2) cluster
    the embeddings with ``self.clustering`` (default ``KMeans(3)``) to get
    pseudo-labels, (3) train the network to predict those cluster
    assignments, sampling inversely proportional to cluster size.
    TensorBoard diagnostics (loss, cluster images, PCA scatter) are written
    via ``self.writer``.
    """

    def __init__(self, net, data, clustering=KMeans(3), order_less=True,
                 loss=nn.CrossEntropyLoss(), optimizer=torch.optim.Adam,
                 max_epochs=50, batch_size=128, device="cpu",
                 report_interval=10, checkpoint_interval=1000,
                 path_prefix=".", network_name="network"):
        # NOTE(review): `clustering=KMeans(3)` is a mutable default shared
        # across instances — confirm that is intended before reuse.
        super(ClusteringTraining, self).__init__()
        self.net = net.to(device)
        self.clustering = clustering
        self.data = data
        self.train_data = None  # built lazily in train() each epoch
        self.loss = loss
        self.max_epochs = max_epochs
        self.batch_size = batch_size
        self.device = device
        self.report_interval = report_interval
        self.checkpoint_interval = checkpoint_interval
        self.checkpoint_path = f"{path_prefix}/{network_name}"
        self.order_less = order_less
        if not order_less:
            # Plain classification head over flattened features.
            # NOTE(review): 256-dim input / 50 classes are hard-coded —
            # assumes the encoder output shape; confirm against `net`.
            self.classifier = nn.Linear(256, 50)
            self.classifier = self.classifier.to(self.device)
        else:
            # Order-less variant: embed cluster centers and score by
            # dot product with the per-sample attention vector.
            self.embedding = nn.Linear(256, 256)
            self.embedding = self.embedding.to(self.device)
        self.optimizer = optimizer(self.net.parameters())
        self.network_name = network_name
        self.writer = SummaryWriter(network_name)
        self.epoch_id = 0
        self.step_id = 0

    def save_path(self):
        """Path used for the final/rolling model save."""
        return f"{self.checkpoint_path}-save.torch"

    def checkpoint(self):
        """Write the (unwrapped) encoder to disk and run the checkpoint hook."""
        the_net = self.net
        if isinstance(the_net, torch.nn.DataParallel):
            # Unwrap DataParallel so the saved module is device-agnostic.
            the_net = the_net.module
        netwrite(
            the_net,
            f"{self.checkpoint_path}-encoder-epoch-{self.epoch_id}-step-{self.step_id}.torch"
        )
        self.each_checkpoint()

    def step(self, data, label, centers):
        """One optimizer step on a batch against the current pseudo-labels.

        `centers` are the cluster centers produced by `cluster()`.
        """
        self.optimizer.zero_grad()
        attention = self.net(data.to(self.device)).squeeze()
        centers = centers.to(self.device).unsqueeze(0)
        if self.order_less:
            # Logits = <embedded center, sample attention> for each center.
            center_embedding = self.embedding(centers.squeeze())
            logits = center_embedding.matmul(attention.unsqueeze(2)).squeeze()
        else:
            logits = self.classifier(attention.reshape(attention.size(0), -1))
        label = label.long().to(self.device)
        loss_val = self.loss(logits, label)
        loss_val.backward()
        self.writer.add_scalar("cluster assignment loss", float(loss_val),
                               self.step_id)
        self.optimizer.step()
        self.each_step()

    def embed_all(self):
        """Embed (up to ~5000 samples of) the dataset; returns [N, D] CPU tensor."""
        self.net.eval()
        with torch.no_grad():
            embedding = []
            batch_loader = DataLoader(self.data,
                                      batch_size=self.batch_size,
                                      shuffle=False)
            # Cap the embedding pass at roughly 5000 samples.
            for point, *_ in islice(batch_loader, 5000 // self.batch_size):
                latent_point = self.net(point.to(self.device))
                latent_point = latent_point.to("cpu")
                latent_point = latent_point.reshape(latent_point.size(0), -1)
                embedding.append(latent_point)
            embedding = torch.cat(embedding, dim=0)
        self.net.train()
        return embedding

    def cluster(self, embedding):
        """Cluster embeddings; returns (sample weights, labels, center tensor).

        Weights are 1/cluster_size so the sampler balances clusters.
        """
        fit = self.clustering.fit(embedding.squeeze())
        labels = list(fit.labels_)
        try:
            cluster_centers = fit.cluster_centers_
        # NOTE(review): bare `except:` also swallows KeyboardInterrupt;
        # an `except AttributeError:` would be safer here.
        except:
            # Fallback for clusterers without explicit centers: mean of
            # members per label.
            # NOTE(review): `labels` is a Python list, so `labels == label`
            # is a plain bool and `.astype(int)` looks like it would raise —
            # this branch appears untested; verify before relying on it.
            cluster_centers = [
                embedding[(labels == label).astype(int)].mean(
                    dim=0).squeeze().unsqueeze(0).numpy()
                for label in set(labels)
            ]
            cluster_centers = np.concatenate(cluster_centers, axis=0)
        if len(set(labels)) == 1:
            # Degenerate clustering: fabricate N random clusters by jittering
            # the single center so training can still proceed.
            N = random.randint(2, 10)
            labels = [random.choice(list(range(N))) for label in labels]
            offsets = [
                np.random.randn(*cluster_centers.shape) * 2.0
                for _ in range(N)
            ]
            cluster_centers = np.concatenate(
                [cluster_centers + offsets[idx] for idx in range(N)],
                axis=0).squeeze()
        counts = [labels.count(label) for label in range(len(set(labels)))]
        weights = [1 / counts[label] for label in labels]
        centers = torch.Tensor(cluster_centers)
        return weights, labels, centers

    def _cluster_image(self, labels):
        """Log up to `count` sample images per cluster to TensorBoard."""
        count = 10
        n_clusters = 50  #max(list(set(labels)))
        indices = list(range(len(labels)))
        random.shuffle(indices)
        cluster_done = [False for _ in range(n_clusters)]
        cluster_images = [[] for _ in range(n_clusters)]
        for index in indices:
            label = labels[index]
            if all(cluster_done):
                break
            if len(cluster_images[label]) < count:
                img, *_ = self.data[index]
                # Min-max normalize for display.
                img = img - img.min()
                img = img / img.max()
                cluster_images[label].append(img)
            else:
                cluster_done[label] = True
        rows = [
            torch.cat(image_list, dim=2) for image_list in cluster_images
            if image_list
        ]
        for idx, row in enumerate(rows):
            self.writer.add_image(f"cluster samples {idx}", row, self.step_id)

    def _cluster_plot(self, embedding, labels):
        """Log clustering quality metrics and a 2-D PCA scatter plot."""
        silhouette = silhouette_score(embedding.squeeze(), labels)
        chs = calinski_harabaz_score(embedding.squeeze(), labels)
        dbs = davies_bouldin_score(embedding.squeeze(), labels)
        n_labels = len(set(labels))
        self.writer.add_scalar(f"silhouette {n_labels}", silhouette,
                               self.step_id)
        self.writer.add_scalar(f"chs {n_labels}", chs, self.step_id)
        self.writer.add_scalar(f"dbs {n_labels}", dbs, self.step_id)
        # Subsample up to 1000 points for the scatter plot.
        indices = list(range(len(labels)))
        random.shuffle(indices)
        samples_to_plot = indices[:1000]
        sample_labels = [labels[idx] for idx in samples_to_plot]
        sample_embedding = embedding[samples_to_plot]
        pca = PCA(2).fit_transform(sample_embedding.squeeze())
        fig, ax = plt.subplots()
        ax.scatter(pca[:, 0], pca[:, 1], c=sample_labels, cmap="tab20")
        self.writer.add_figure(f"clustering {n_labels}", fig, self.step_id)

    def each_cluster(self, embedding, labels):
        """Hook run after every (re-)clustering: log images and plots."""
        self._cluster_image(labels)
        self._cluster_plot(embedding, labels)

    def train(self):
        """Run the full alternating embed/cluster/fit loop; returns the net."""
        for epoch_id in range(self.max_epochs):
            self.epoch_id = epoch_id
            embedding = self.embed_all()
            weights, labels, centers = self.cluster(embedding)
            self.each_cluster(embedding, labels)
            # Pseudo-labels are attached to the dataset so the loader
            # yields (data, label) pairs below.
            self.data.labels = labels
            self.train_data = None
            self.train_data = DataLoader(
                self.data,
                batch_size=self.batch_size,
                num_workers=8,
                sampler=WeightedRandomSampler(weights,
                                              len(self.data) * 4,
                                              replacement=True))
            for data, label in self.train_data:
                self.step(data, label, centers)
                if self.step_id % self.checkpoint_interval == 0:
                    self.checkpoint()
                self.step_id += 1
        return self.net
def main():
    """Train a SpatioTemporalGraphCVAE trajectory model.

    Reads module-level `args` and `hyperparams`, loads pickled train/eval
    data dicts, builds aggregate scene graphs, then runs the iteration loop
    with optional periodic evaluation and checkpointing.  All progress is
    logged to TensorBoard under a timestamped model directory.
    """
    # Create the log and model directiory if they're not present.
    model_dir = os.path.join(
        args.log_dir,
        'models_' + time.strftime('%d_%b_%Y_%H_%M_%S', time.localtime()))
    pathlib.Path(model_dir).mkdir(parents=True, exist_ok=True)
    log_writer = SummaryWriter(log_dir=model_dir)

    # Load the pickled training data (latin1 for py2-era pickles).
    train_data_path = os.path.join(args.data_dir, args.train_data_dict)
    with open(train_data_path, 'rb') as f:
        train_data_dict = pickle.load(f, encoding='latin1')
    train_dt = train_data_dict['dt']
    print('Loaded training data from %s, train_dt = %.2f' %
          (train_data_path, train_dt))

    # Optionally load evaluation data when periodic eval is enabled.
    if args.eval_every is not None:
        eval_data_path = os.path.join(args.data_dir, args.eval_data_dict)
        with open(eval_data_path, 'rb') as f:
            eval_data_dict = pickle.load(f, encoding='latin1')
        eval_dt = eval_data_dict['dt']
        print('Loaded evaluation data from %s, eval_dt = %.2f' %
              (eval_data_path, eval_dt))

    if args.incl_robot_node:
        robot_node = stg_node.STGNode('0', 'Pedestrian')
    else:
        robot_node = None

    # Grab an arbitrary STGNode key to read the state dimensionality from.
    for key in train_data_dict['input_dict'].keys():
        if isinstance(key, stg_node.STGNode):
            random_node = key
            break

    model_registrar = ModelRegistrar(model_dir, args.device)

    # Fill in data-dependent hyperparameters.
    hyperparams['state_dim'] = train_data_dict['input_dict'][
        random_node].shape[2]
    hyperparams['pred_dim'] = len(train_data_dict['pred_indices'])
    hyperparams['pred_indices'] = train_data_dict['pred_indices']
    hyperparams['dynamic_edges'] = args.dynamic_edges
    hyperparams['edge_state_combine_method'] = args.edge_state_combine_method
    hyperparams[
        'edge_influence_combine_method'] = args.edge_influence_combine_method
    hyperparams['nodes_standardization'] = train_data_dict[
        'nodes_standardization']
    hyperparams['labels_standardization'] = train_data_dict[
        'labels_standardization']
    hyperparams['edge_radius'] = args.edge_radius

    # Eval uses its own standardization statistics.
    if args.eval_every is not None:
        eval_hyperparams = copy.deepcopy(hyperparams)
        eval_hyperparams['nodes_standardization'] = eval_data_dict[
            "nodes_standardization"]
        eval_hyperparams['labels_standardization'] = eval_data_dict[
            "labels_standardization"]

    kwargs_dict = {
        'dynamic_edges': hyperparams['dynamic_edges'],
        'edge_state_combine_method': hyperparams['edge_state_combine_method'],
        'edge_influence_combine_method':
        hyperparams['edge_influence_combine_method']
    }

    stg = SpatioTemporalGraphCVAEModel(robot_node, model_registrar,
                                       hyperparams, kwargs_dict, None,
                                       args.device)
    print('Created training STG model.')

    if args.eval_every is not None:
        # It is important that eval_stg uses the same model_registrar as
        # the stg being trained, otherwise you're just repeatedly evaluating
        # randomly-initialized weights!
        eval_stg = SpatioTemporalGraphCVAEModel(robot_node, model_registrar,
                                                eval_hyperparams, kwargs_dict,
                                                None, args.eval_device)
        print('Created evaluation STG model.')

    # Create the aggregate scene_graph for all the data, allowing
    # for batching, just like the old one. Then, for speed tests
    # we'll show how much faster this method is than keeping the
    # full version. Can show graphs of forward inference time vs problem size
    # with two lines (using aggregate graph, using online-computed graph).
    agg_scene_graph = create_batch_scene_graph(
        train_data_dict['input_dict'],
        float(hyperparams['edge_radius']),
        use_old_method=(args.dynamic_edges == 'no'))
    print('Created aggregate training scene graph.')

    if args.dynamic_edges == 'yes':
        agg_scene_graph.compute_edge_scaling(args.edge_addition_filter,
                                             args.edge_removal_filter)
        train_data_dict['input_dict'][
            'edge_scaling_mask'] = agg_scene_graph.edge_scaling_mask
        print('Computed edge scaling for the training scene graph.')

    stg.set_scene_graph(agg_scene_graph)
    stg.set_annealing_params()

    if args.eval_every is not None:
        eval_agg_scene_graph = create_batch_scene_graph(
            eval_data_dict['input_dict'],
            float(hyperparams['edge_radius']),
            use_old_method=(args.dynamic_edges == 'no'))
        print('Created aggregate evaluation scene graph.')

        if args.dynamic_edges == 'yes':
            eval_agg_scene_graph.compute_edge_scaling(
                args.edge_addition_filter, args.edge_removal_filter)
            eval_data_dict['input_dict'][
                'edge_scaling_mask'] = eval_agg_scene_graph.edge_scaling_mask
            print('Computed edge scaling for the evaluation scene graph.')

        eval_stg.set_scene_graph(eval_agg_scene_graph)
        eval_stg.set_annealing_params()

    # model_registrar.print_model_names()
    optimizer = optim.Adam(model_registrar.parameters(),
                           lr=hyperparams['learning_rate'])
    lr_scheduler = optim.lr_scheduler.ExponentialLR(
        optimizer, gamma=hyperparams['learning_decay_rate'])

    # Keeping colors consistent throughout training.
    color_dict = defaultdict(dict)

    print_training_header(newline_start=True)
    for curr_iter in range(args.num_iters):
        # Necessary because we flip the weights contained between GPU and CPU sometimes.
        model_registrar.to(args.device)

        # Setting the current iterator value for internal logging.
        stg.set_curr_iter(curr_iter)

        # Stepping forward the learning rate scheduler and annealers.
        # NOTE(review): scheduler.step() before optimizer.step() is the
        # pre-1.1 PyTorch ordering; keep only if this repo pins old torch.
        lr_scheduler.step()
        log_writer.add_scalar('dynstg/learning_rate',
                              lr_scheduler.get_lr()[0], curr_iter)
        stg.step_annealers()

        # Zeroing gradients for the upcoming iteration.
        optimizer.zero_grad()

        # Gradient accumulation over `batch_multiplier` minibatches.
        train_losses = list()
        for mb_num in range(args.batch_multiplier):
            # Obtaining the batch's training loss.
            train_inputs, train_labels = sample_inputs_and_labels(
                train_data_dict, batch_size=hyperparams['batch_size'])

            # Compute the training loss.
            train_loss = stg.train_loss(
                train_inputs, train_labels,
                hyperparams['prediction_horizon']) / args.batch_multiplier
            train_losses.append(train_loss.item())

            # Calculating gradients.
            train_loss.backward()

        # Print training information. Also, no newline here. It's added in at a later line.
        iter_train_loss = sum(train_losses)
        print('{:9} | {:10} | '.format(curr_iter, '%.2f' % iter_train_loss),
              end='',
              flush=True)
        log_writer.add_histogram('dynstg/train_minibatch_losses',
                                 np.asarray(train_losses), curr_iter)
        log_writer.add_scalar('dynstg/train_loss', iter_train_loss, curr_iter)

        # Clipping gradients.
        if hyperparams['grad_clip'] is not None:
            nn.utils.clip_grad_value_(model_registrar.parameters(),
                                      hyperparams['grad_clip'])

        # # Logging gradient norms.
        # len_prefix = len('model_dict.')
        # for name, param in model_registrar.named_parameters():
        #     if param.grad is None:
        #         # print(name, 'grad is None')
        #         continue
        #     log_writer.add_scalar('gradient_norms/' + name[len_prefix:],
        #                           param.grad.norm(),
        #                           curr_iter)

        # Performing a gradient step.
        optimizer.step()

        # Freeing up memory.
        del train_loss

        if args.eval_every is not None and (curr_iter +
                                            1) % args.eval_every == 0:
            with torch.no_grad():
                # First plotting training predictions.
                pred_fig = plot_utils.plot_predictions_during_training(
                    stg,
                    train_inputs,
                    hyperparams['prediction_horizon'],
                    num_samples=100,
                    dt=train_dt,
                    max_speed=max_speed,
                    color_dict=color_dict,
                    most_likely=True)
                log_writer.add_figure('dynstg/train_prediction', pred_fig,
                                      curr_iter)

                train_mse_batch_errors, train_fse_batch_errors = eval_utils.compute_batch_statistics(
                    stg,
                    train_data_dict,
                    hyperparams['minimum_history_length'],
                    hyperparams['prediction_horizon'],
                    num_samples=100,
                    num_runs=100,
                    dt=train_dt,
                    max_speed=max_speed,
                    robot_node=robot_node)
                log_writer.add_histogram('dynstg/train_mse',
                                         train_mse_batch_errors, curr_iter)
                log_writer.add_histogram('dynstg/train_fse',
                                         train_fse_batch_errors, curr_iter)

                mse_boxplot_fig, fse_boxplot_fig = plot_utils.plot_boxplots_during_training(
                    train_mse_batch_errors, train_fse_batch_errors)
                log_writer.add_figure('dynstg/train_mse_boxplot',
                                      mse_boxplot_fig, curr_iter)
                log_writer.add_figure('dynstg/train_fse_boxplot',
                                      fse_boxplot_fig, curr_iter)

                log_writer.add_scalars(
                    'dynstg/train_sq_error', {
                        'mean_mse': torch.mean(train_mse_batch_errors),
                        'mean_fse': torch.mean(train_fse_batch_errors),
                        'median_mse': torch.median(train_mse_batch_errors),
                        'median_fse': torch.median(train_fse_batch_errors)
                    }, curr_iter)

                # Then computing evaluation values and predictions.
                model_registrar.to(args.eval_device)
                eval_stg.set_curr_iter(curr_iter)

                eval_inputs, eval_labels = sample_inputs_and_labels(
                    eval_data_dict,
                    device=args.eval_device,
                    batch_size=args.eval_batch_size)

                (eval_loss_q_is, eval_loss_p,
                 eval_loss_exact) = eval_stg.eval_loss(
                     eval_inputs, eval_labels,
                     hyperparams['prediction_horizon'])
                log_writer.add_scalars(
                    'dynstg/eval', {
                        'nll_q_is': eval_loss_q_is,
                        'nll_p': eval_loss_p,
                        'nll_exact': eval_loss_exact
                    }, curr_iter)

                pred_fig = plot_utils.plot_predictions_during_training(
                    eval_stg,
                    eval_inputs,
                    hyperparams['prediction_horizon'],
                    num_samples=100,
                    dt=eval_dt,
                    max_speed=max_speed,
                    color_dict=color_dict,
                    most_likely=True)
                log_writer.add_figure('dynstg/eval_prediction', pred_fig,
                                      curr_iter)

                eval_mse_batch_errors, eval_fse_batch_errors = eval_utils.compute_batch_statistics(
                    eval_stg,
                    eval_data_dict,
                    hyperparams['minimum_history_length'],
                    hyperparams['prediction_horizon'],
                    num_samples=100,
                    num_runs=100,
                    dt=eval_dt,
                    max_speed=max_speed,
                    robot_node=robot_node)
                log_writer.add_histogram('dynstg/eval_mse',
                                         eval_mse_batch_errors, curr_iter)
                log_writer.add_histogram('dynstg/eval_fse',
                                         eval_fse_batch_errors, curr_iter)

                mse_boxplot_fig, fse_boxplot_fig = plot_utils.plot_boxplots_during_training(
                    eval_mse_batch_errors, eval_fse_batch_errors)
                log_writer.add_figure('dynstg/eval_mse_boxplot',
                                      mse_boxplot_fig, curr_iter)
                log_writer.add_figure('dynstg/eval_fse_boxplot',
                                      fse_boxplot_fig, curr_iter)

                log_writer.add_scalars(
                    'dynstg/eval_sq_error', {
                        'mean_mse': torch.mean(eval_mse_batch_errors),
                        'mean_fse': torch.mean(eval_fse_batch_errors),
                        'median_mse': torch.median(eval_mse_batch_errors),
                        'median_fse': torch.median(eval_fse_batch_errors)
                    }, curr_iter)

                print('{:15} | {:10} | {:14}'.format(
                    '%.2f' % eval_loss_q_is.item(),
                    '%.2f' % eval_loss_p.item(),
                    '%.2f' % eval_loss_exact.item()),
                      end='',
                      flush=True)

                # Freeing up memory.
                del eval_loss_q_is
                del eval_loss_p
                del eval_loss_exact
        else:
            print('{:15} | {:10} | {:14}'.format('', '', ''),
                  end='',
                  flush=True)

        # Here's the newline that ends the current training information printing.
        print('')

        if args.save_every is not None and (curr_iter +
                                            1) % args.save_every == 0:
            model_registrar.save_models(curr_iter)
            print_training_header()
def main():
    """Train a monocular depth-estimation model (SENet backbone).

    Parses CLI args, builds a DataParallel model over all visible GPUs,
    trains for `args.epochs` epochs, and after each epoch runs the test
    split, writes sample visualizations and metrics to TensorBoard, and
    saves a checkpoint.
    """
    global args
    args = parser.parse_args()
    out_dir = os.path.join(args.out_root, args.name)
    writer = SummaryWriter(out_dir)
    model = define_model(is_resnet=False,
                         is_densenet=False,
                         is_senet=True,
                         model=args.model,
                         parallel=args.parallel,
                         semff=args.semff,
                         pcamff=args.pcamff)
    # One DataParallel replica per visible GPU, fixed 4 samples per GPU.
    gpu_num = torch.cuda.device_count()
    batch_size_per_gpu = 4
    device_ids = []
    for i in range(gpu_num):
        device_ids.append(i)
    model = torch.nn.DataParallel(model, device_ids=device_ids).cuda()
    batch_size = gpu_num * batch_size_per_gpu
    cudnn.benchmark = True
    """ Set Different Learning Rate """
    """
    params_with_lr = []
    for name, param in model.named_parameters():
        if "SEMFF.se" in name:
            params_with_lr.append({"params": param, "lr": args.lr/10})
        else:
            params_with_lr.append({"params": param})
    optimizer = torch.optim.Adam(params_with_lr,
                                 args.lr,
                                 weight_decay=args.weight_decay)
    """
    optimizer = torch.optim.Adam(model.parameters(),
                                 args.lr,
                                 weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=args.step_size,
                                                gamma=0.1)
    print("step_size is set to %d" % scheduler.step_size)

    if args.dataset == 'nyud':
        train_loader = loaddata.getTrainingData(nyud_root_path, batch_size)
        test_loader = loaddata.getTestingData(nyud_root_path, batch_size)
    elif args.dataset == 'sun':
        train_loader = loaddata_sun.getTrainingData(sun_root_path, batch_size)
        test_loader = loaddata_sun.getTestingData(sun_root_path, batch_size)
    else:
        raise NotImplementedError('Specify dataset in [\'nyud\', \'sun\']')

    vis_out_dir = osp.join(out_dir, "outputs")
    for epoch in range(args.epochs):
        # NOTE(review): scheduler.step() at epoch start is the pre-1.1
        # PyTorch ordering; `lr` below mirrors StepLR's decay for logging.
        scheduler.step()
        e = epoch + args.start_epoch
        lr = args.lr * (0.1**(epoch // args.step_size))
        train(train_loader, model, optimizer, e, writer)

        results = None
        images = None
        pred_depths = None
        # TBDPNet yields two prediction branches (MFF and SEMFF); Hu yields one.
        if isinstance(model.module, net.TBDPNet):
            mff_results, semff_results, images, mff_depths, semff_depths =\
                test.test_tbdp(test_loader, model, dataset=args.dataset,
                               returnValue=True, returnSamples=True,
                               sample_idx=[0, 1, 2, 3, 4, 5])
            results = [mff_results, semff_results]
            pred_depths = [mff_depths, semff_depths]
        elif isinstance(model.module, net.Hu):
            results, images, pred_depths = test.test_hu(
                test_loader, model, dataset=args.dataset,
                returnValue=True, returnSamples=True,
                sample_idx=[0, 1, 2, 3, 4, 5])

        for i in range(len(images)):
            # Input images only need to be written to disk once (epoch 0);
            # afterwards they are only re-logged to TensorBoard.
            if epoch == 0:
                os.makedirs(osp.join(vis_out_dir, "images"), exist_ok=True)
                image = np.clip(
                    denormalize_image(images[i].data.cpu().numpy(),
                                      mode='nyud').astype(np.uint8), 0, 254)
                image = visualize_array(image,
                                        f_name=osp.join(
                                            vis_out_dir, "images",
                                            str(i + 1) + ".png"))
            else:
                image = np.clip(
                    denormalize_image(images[i].data.cpu().numpy(),
                                      mode='nyud').astype(np.uint8), 0, 254)
                image = visualize_array(image)
            writer.add_figure("image/%d" % (i + 1), image, e)

            if isinstance(model.module, net.TBDPNet):
                mff_depths, semff_depths = pred_depths
                os.makedirs(osp.join(vis_out_dir, "mff_depths", str(i + 1)),
                            exist_ok=True)
                os.makedirs(osp.join(vis_out_dir, "semff_depths", str(i + 1)),
                            exist_ok=True)
                mff_depth = visualize_array(
                    mff_depths[i].data.cpu().numpy(),
                    f_name=osp.join(vis_out_dir, "mff_depths", str(i + 1),
                                    "%d_%d.png" % (i + 1, e)))
                semff_depth = visualize_array(
                    semff_depths[i].data.cpu().numpy(),
                    f_name=osp.join(vis_out_dir, "semff_depths", str(i + 1),
                                    "%d_%d.png" % (i + 1, e)))
                writer.add_figure("mff_prediction/%d" % (i + 1), mff_depth, e)
                writer.add_figure("semff_prediction/%d" % (i + 1),
                                  semff_depth, e)
            elif isinstance(model.module, net.Hu):
                os.makedirs(osp.join(vis_out_dir, "pred", str(i + 1)),
                            exist_ok=True)
                depth = visualize_array(
                    pred_depths[i].data.cpu().numpy(),
                    f_name=osp.join(vis_out_dir, "pred", str(i + 1),
                                    "%d_%d.png" % (i + 1, e)))
                writer.add_figure("prediction/%d" % (i + 1), depth, e)

        # Scalar metrics per branch.
        if isinstance(model.module, net.TBDPNet):
            mff_results, semff_results = results
            writer.add_scalar("mff/RMSE", mff_results["RMSE"], e)
            writer.add_scalar("mff/ABS_REL", mff_results["ABS_REL"], e)
            writer.add_scalar("mff/LG10", mff_results["LG10"], e)
            writer.add_scalar("mff/DELTA1", mff_results["DELTA1"], e)
            writer.add_scalar("mff/DELTA2", mff_results["DELTA2"], e)
            writer.add_scalar("mff/DELTA3", mff_results["DELTA3"], e)
            writer.add_scalar("mff/lr", lr, e)
            writer.add_scalar("semff/RMSE", semff_results["RMSE"], e)
            writer.add_scalar("semff/ABS_REL", semff_results["ABS_REL"], e)
            writer.add_scalar("semff/LG10", semff_results["LG10"], e)
            writer.add_scalar("semff/DELTA1", semff_results["DELTA1"], e)
            writer.add_scalar("semff/DELTA2", semff_results["DELTA2"], e)
            writer.add_scalar("semff/DELTA3", semff_results["DELTA3"], e)
            writer.add_scalar("semff/lr", lr, e)
        elif isinstance(model.module, net.Hu):
            writer.add_scalar("data/RMSE", results["RMSE"], e)
            writer.add_scalar("data/ABS_REL", results["ABS_REL"], e)
            writer.add_scalar("data/LG10", results["LG10"], e)
            writer.add_scalar("data/DELTA1", results["DELTA1"], e)
            writer.add_scalar("data/DELTA2", results["DELTA2"], e)
            writer.add_scalar("data/DELTA3", results["DELTA3"], e)
            writer.add_scalar("data/lr", lr, e)

        save_checkpoint(model.state_dict(), e, out_dir)
def main(dataset_train, dataset_validation, mmscaler, modelo='B',
         batch_size=100, num_epoch=50, p_dropout=0.5):
    """Train and validate a temperature-forecasting MLP.

    Args:
        dataset_train: CSV path with training rows (no header).
        dataset_validation: CSV path with validation rows (no header).
        mmscaler: pickle path of the fitted min-max scaler, used to
            inverse-transform predictions for plotting.
        modelo: model tag used only in the TensorBoard run name.
        batch_size / num_epoch / p_dropout: training hyperparameters.
    """
    writer = SummaryWriter('selected/' + modelo + '_' + str(batch_size) +
                           '_' + str(num_epoch) + '_' + str(p_dropout))
    # 168 hourly inputs (one week) -> 24 hourly outputs (one day);
    # NOTE(review): sizes are hard-coded — confirm they match the CSV layout.
    red = Red_ArquitecturaB(input_size=168, output_size=24, p_drop=p_dropout)
    funcion_perdida = nn.MSELoss()
    optimizer = torch.optim.Adam(params=red.parameters(), lr=0.001)

    train = np.array(
        pd.read_csv(dataset_train, decimal=".", sep=",", header=None).values)
    train_ds = TemperaturaDataSet(train)
    train_dataloader = DataLoader(dataset=train_ds,
                                  shuffle=True,
                                  batch_size=batch_size)

    # Train the network.
    red.train()
    for epoch in range(num_epoch):
        batch = 0
        for x_train, y_train in train_dataloader:
            optimizer.zero_grad()
            x_train = x_train.type(torch.float)
            y_train = y_train.type(torch.float)
            y_pred_train = red(x_train)  # same as red.forward(x_train)
            loss = funcion_perdida(y_pred_train, y_train)
            loss.backward()
            optimizer.step()
            print("Epoch: %2d Batch: %6d Loss: %2.8f ErrorMean: %2.8f" %
                  (epoch, batch, loss.item(), (y_pred_train - y_train).mean()))
            batch = batch + 1
        # Per-epoch logging uses the last batch's loss/error only.
        writer.add_scalar('data/train/loss', loss.item(), epoch)
        writer.add_scalar('data/train/ErrorMean',
                          (y_pred_train - y_train).mean(), epoch)
        for name, param in red.named_parameters():
            writer.add_histogram(name, param.clone().data.numpy(), epoch)

    # Evaluate on the validation set (batch size 1, no shuffling).
    test = np.array(
        pd.read_csv(dataset_validation, decimal=".", sep=",",
                    header=None).values)
    test_ds = TemperaturaDataSet(test)
    test_dataloader = DataLoader(dataset=test_ds, shuffle=False, batch_size=1)
    temp_scaler = pickle.load(open(mmscaler, 'rb'))
    print(temp_scaler)
    red.eval()
    batch = 0
    for x_test, y_test in test_dataloader:
        x_test = x_test.type(torch.float)
        y_test = y_test.type(torch.float)
        y_pred_test = red(x_test)
        loss = funcion_perdida(y_pred_test, y_test)
        print("Batch: %6d Loss: %2.8f ErrorMean: %2.8f" %
              (batch, loss.item(), (y_pred_test - y_test).mean()))
        writer.add_scalar('data/test/loss', loss.item(), batch)
        writer.add_scalar('data/test/ErrorMean',
                          (y_pred_test - y_test).mean(), batch)
        # Plot ground truth (blue) vs prediction (red), de-normalized,
        # for every 100th validation sample.
        if batch % 100 == 0:
            fig = plt.figure(figsize=(13, 6))
            plt.plot(
                temp_scaler.inverse_transform(y_test.data.numpy().reshape(
                    -1, 1)), 'b',
                temp_scaler.inverse_transform(y_pred_test.data.numpy().reshape(
                    -1, 1)), 'r')
            writer.add_figure('data/test/resultados', fig, batch)
        batch = batch + 1
    writer.close()
class TBLogger(object):
    """Rank-0-only TensorBoard logger with interval-based scalar aggregation.

    Scalar values are buffered per key and written as one aggregated point
    (mean/min/max/... via numpy) every `interval` calls.  On non-zero ranks
    every method is a no-op.

    xyz_dummies: stretch the screen with empty plots so the legend would
    always fit for other plots
    """

    def __init__(self, local_rank, log_dir, name, interval=1, dummies=False):
        # Only the rank-0 process writes events; others stay silent.
        self.enabled = (local_rank == 0)
        self.interval = interval
        self.cache = {}  # key -> list of values awaiting aggregation
        if local_rank == 0:
            self.summary_writer = SummaryWriter(log_dir=os.path.join(
                log_dir, name),
                                                flush_secs=120,
                                                max_queue=200)
            # Ensure pending events are flushed when the process exits.
            atexit.register(self.summary_writer.close)
            if dummies:
                for key in ('aaa', 'zzz'):
                    self.summary_writer.add_scalar(key, 0.0, 1)

    def log_value(self, step, key, val, stat='mean'):
        """Buffer `val` under `key`; flush `stat` over the buffer every
        `interval` calls, tagged with the most recent `step`."""
        if self.enabled:
            self.cache.setdefault(key, []).append(val)
            if len(self.cache[key]) == self.interval:
                # `stat` names a numpy reduction, e.g. 'mean', 'max', 'min'.
                agg_val = getattr(np, stat)(self.cache[key])
                self.summary_writer.add_scalar(key, agg_val, step)
                del self.cache[key]

    def log_meta(self, step, meta):
        """Log a dict of 0-dim tensors (values must support `.item()`)."""
        for k, v in meta.items():
            self.log_value(step, k, v.item())

    def log_grads(self, step: int, model):
        """Log max, min, mean gradients of the `model` at `step`.

        Args:
            step (int): Iteration number.
            model (nn.Model):
        """
        if self.enabled:
            norms = [
                p.grad.norm().item() for p in model.parameters()
                if p.grad is not None
            ]
            for stat in ('max', 'min', 'mean'):
                self.log_value(step,
                               f'grad_{stat}',
                               getattr(np, stat)(norms),
                               stat=stat)

    def log_image(self,
                  tag: str,
                  img_tensor,
                  global_step=None,
                  walltime: float = None,
                  dataformats: str = 'CHW'):
        """Add image to log via tensorboardX.

        Args:
            tag (str): Data identifier
            img_tensor (Union[torch.Tensor, numpy.array]): An uint8 or float
                Tensor of shape [channel, height, width] where channel is
                1, 3, or 4. The elements in img_tensor can either have
                values in [0, 1] (float32) or [0, 255] (uint8). Users are
                responsible to scale the data in the correct range/type.
            global_step: Global step value to record. Defaults to None.
            walltime (float, optional): Optional override default walltime
                (time.time()) of event. Defaults to None.
            dataformats (str, optional): Specifies the meaning of each
                dimension of the input tensor. Supported: CHW, HWC, HW.
                Defaults to 'CHW'.
        """
        if self.enabled:
            self.summary_writer.add_image(tag,
                                          img_tensor,
                                          global_step=global_step,
                                          walltime=walltime,
                                          dataformats=dataformats)

    def log_figure(self,
                   tag: str,
                   img_tensor,
                   global_step=None,
                   walltime: float = None):
        """Add a matplotlib figure to the log via tensorboardX.

        (Docstring fixed: it previously was a copy-paste of `log_image`'s,
        describing an image tensor and a nonexistent `dataformats` param.)

        Args:
            tag (str): Data identifier
            img_tensor: A matplotlib figure (or list of figures) to render.
            global_step: Global step value to record. Defaults to None.
            walltime (float, optional): Optional override default walltime
                (time.time()) of event. Defaults to None.
        """
        if self.enabled:
            self.summary_writer.add_figure(tag,
                                           img_tensor,
                                           global_step=global_step,
                                           walltime=walltime)
class Logger:
    """TensorBoard(X) logging helper for scalars, images, videos, figures.

    Batched variants (`log_images`, `log_videos`, `log_figures`) log at most
    `n_logged_samples` elements, each under `name/_<idx>`.
    """

    def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
        self._log_dir = log_dir
        self._n_logged_samples = n_logged_samples
        if summary_writer is not None:
            self._summ_writer = summary_writer
        else:
            self._summ_writer = SummaryWriter(log_dir)

    def _loop_batch(self, fn, name, val, *argv, **kwargs):
        """Loops the logging function n times."""
        for log_idx in range(min(self._n_logged_samples, len(val))):
            name_i = os.path.join(name, "_%d" % log_idx)
            fn(name_i, val[log_idx], *argv, **kwargs)

    @staticmethod
    def _check_size(val, size):
        """Assert a tensor/array has `size` dims (or list elements have
        `size - 1`), and reject absurdly large spatial extents."""
        if isinstance(val, torch.Tensor) or isinstance(val, np.ndarray):
            assert len(
                val.shape
            ) == size, "Size of tensor does not fit required size, {} vs {}".format(
                len(val.shape), size)
        elif isinstance(val, list):
            assert len(
                val[0].shape
            ) == size - 1, "Size of list element does not fit required size, {} vs {}".format(
                len(val[0].shape), size - 1)
        else:
            raise NotImplementedError(
                "Input type {} not supported for dimensionality check!".format(
                    type(val)))
        # Guard against accidentally logging giant tensors.
        if (val[0].shape[1] > 10000) or (val[0].shape[2] > 10000):
            raise ValueError("This might be a bit too much")

    def log_scalar(self, scalar, name, step, phase):
        self._summ_writer.add_scalar('{}_{}'.format(name, phase), scalar, step)

    def log_scalars(self, scalar_dict, group_name, step, phase):
        """Will log all scalars in the same plot."""
        self._summ_writer.add_scalars('{}_{}'.format(group_name, phase),
                                      scalar_dict, step)

    def log_images(self, image, name, step, phase):
        """Log up to n individual images from a [N, C, H, W] batch."""
        self._check_size(image, 4)  # [N, C, H, W]
        self._loop_batch(self._summ_writer.add_image,
                         '{}_{}'.format(name, phase), image, step)

    def log_video(self, video_frames, name, step, phase):
        """Log a single video given as [T, C, H, W]."""
        assert len(
            video_frames.shape
        ) == 4, "Need [T, C, H, W] input tensor for single video logging!"
        if not isinstance(video_frames, torch.Tensor):
            video_frames = torch.tensor(video_frames)
        video_frames = torch.transpose(video_frames, 0,
                                       1)  # tbX requires [C, T, H, W]
        video_frames = video_frames.unsqueeze(
            0)  # add an extra dimension to get grid of size 1
        self._summ_writer.add_video('{}_{}'.format(name, phase), video_frames,
                                    step)

    def log_videos(self, video_frames, name, step, phase, fps=3):
        """Log up to n individual videos from a [N, T, C, H, W] batch."""
        assert len(
            video_frames.shape
        ) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
        video_frames = video_frames.unsqueeze(
            1)  # add an extra dimension after batch to get grid of size 1
        self._loop_batch(self._summ_writer.add_video,
                         '{}_{}'.format(name, phase),
                         video_frames,
                         step,
                         fps=fps)

    def log_image(self, images, name, step, phase):
        self._summ_writer.add_image('{}_{}'.format(name, phase), images, step)

    def log_image_grid(self, images, name, step, phase, nrow=8):
        """Tile a [N, C, H, W] batch into one grid image and log it.

        Fixed: previously this called `self.log_images(img_grid, ..., step)`
        with a missing `phase` argument (TypeError), and the 3-D [C, H, W]
        grid produced by `make_grid` would have failed `log_images`' 4-D
        size check anyway.  The grid is a single image, so log it directly.
        """
        assert len(
            images.shape
        ) == 4, "Image grid logging requires input shape [batch, C, H, W]!"
        img_grid = torchvision.utils.make_grid(images, nrow=nrow)
        self._summ_writer.add_image('{}_{}'.format(name, phase), img_grid,
                                    step)

    def log_video_grid(self, video_frames, name, step, phase, fps=3):
        """Log a whole [N, T, C, H, W] batch as one video grid."""
        assert len(
            video_frames.shape
        ) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
        self._summ_writer.add_video('{}_{}'.format(name, phase), video_frames,
                                    step, fps=fps)

    def log_figures(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        assert figure.shape[
            0] > 0, "Figure logging requires input shape [batch x figures]!"
        self._loop_batch(self._summ_writer.add_figure,
                         '{}_{}'.format(name, phase), figure, step)

    def log_figure(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        self._summ_writer.add_figure('{}_{}'.format(name, phase), figure,
                                     step)

    def log_graph(self, array, name, step, phase):
        """Render `array` with `plot_graph` and log the result as an image."""
        im = plot_graph(array)
        self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)

    def dump_scalars(self, log_path=None):
        """Export all logged scalars to JSON (tensorboardX feature)."""
        log_path = os.path.join(
            self._log_dir, "scalar_data.json") if log_path is None else log_path
        self._summ_writer.export_scalars_to_json(log_path)
def infer(model, path, detections_file, resize, max_size, batch_size,
          mixed_precision=True, is_master=True, world=0, annotations=None,
          use_dali=True, is_validation=False, verbose=True, logdir=None,
          iteration=100):
    """Run inference on images from path.

    Detects the backend (PyTorch vs TensorRT) from the model type, runs
    batched inference, gathers results across `world` devices, writes COCO
    detections to `detections_file`, and — when ground-truth annotations
    exist — evaluates with COCOeval and logs AP/AR scalars plus a few
    sample visualizations to TensorBoard under `logdir`.

    Fixes applied: string comparisons now use `==` instead of `is`
    (identity on str literals relies on CPython interning and emits
    SyntaxWarning on 3.8+), and the ndarray-vs-list `results != []`
    comparison is replaced with an explicit length check.
    """
    backend = 'pytorch' if isinstance(model, Model) or isinstance(
        model, DDP) else 'tensorrt'

    stride = model.module.stride if isinstance(model, DDP) else model.stride

    # Create annotations if none was provided
    if not annotations:
        annotations = tempfile.mktemp('.json')
        images = [{
            'id': i,
            'file_name': f
        } for i, f in enumerate(os.listdir(path))]
        json.dump({'images': images}, open(annotations, 'w'))

    # TensorRT only supports fixed input sizes, so override input size accordingly
    if backend == 'tensorrt':
        max_size = max(model.input_size)

    # Prepare dataset
    if verbose:
        print('Preparing dataset...')
    data_iterator = (DaliDataIterator if use_dali else DataIterator)(
        path,
        resize,
        max_size,
        batch_size,
        stride,
        world,
        annotations,
        training=False)
    if verbose:
        print(data_iterator)

    # Prepare model
    if backend == 'pytorch':  # fixed: was `backend is 'pytorch'`
        # If we are doing validation during training,
        # no need to register model with AMP again
        if not is_validation:
            if torch.cuda.is_available():
                model = model.cuda()
            model = amp.initialize(model,
                                   None,
                                   opt_level='O2' if mixed_precision else 'O0',
                                   keep_batchnorm_fp32=True,
                                   verbosity=0)
        model.eval()

    if verbose:
        print(' backend: {}'.format(backend))
        print(' device: {} {}'.format(
            world, 'cpu' if not torch.cuda.is_available() else
            'gpu' if world == 1 else 'gpus'))
        print(' batch: {}, precision: {}'.format(
            batch_size, 'unknown' if backend == 'tensorrt' else
            'mixed' if mixed_precision else 'full'))
        print('Running inference...')

    results = []
    profiler = Profiler(['infer', 'fw'])
    with torch.no_grad():
        for i, (data, ids, ratios) in enumerate(data_iterator):
            # Forward pass
            profiler.start('fw')
            scores, boxes, classes = model(data)
            profiler.stop('fw')

            results.append([scores, boxes, classes, ids, ratios])

            profiler.bump('infer')
            # Report roughly once a minute, and always on the last batch.
            if verbose and (profiler.totals['infer'] > 60
                            or i == len(data_iterator) - 1):
                size = len(data_iterator.ids)
                msg = '[{:{len}}/{}]'.format(min((i + 1) * batch_size, size),
                                             size,
                                             len=len(str(size)))
                msg += ' {:.3f}s/{}-batch'.format(profiler.means['infer'],
                                                  batch_size)
                msg += ' (fw: {:.3f}s)'.format(profiler.means['fw'])
                msg += ', {:.1f} im/s'.format(batch_size /
                                              profiler.means['infer'])
                print(msg, flush=True)
                profiler.reset()

    # Gather results from all devices
    if verbose:
        print('Gathering results...')
    results = [torch.cat(r, dim=0) for r in zip(*results)]
    if world > 1:
        for r, result in enumerate(results):
            all_result = [
                torch.ones_like(result, device=result.device)
                for _ in range(world)
            ]
            torch.distributed.all_gather(list(all_result), result)
            results[r] = torch.cat(all_result, dim=0)

    if is_master:
        # Copy buffers back to host
        results = [r.cpu() for r in results]

        # Collect detections, deduplicating by image id (distributed
        # gathering can deliver the same image from multiple ranks).
        detections = []
        processed_ids = set()
        for scores, boxes, classes, image_id, ratios in zip(*results):
            image_id = image_id.item()
            if image_id in processed_ids:
                continue
            processed_ids.add(image_id)

            # Keep only positive-score detections; undo the resize ratio.
            keep = (scores > 0).nonzero()
            scores = scores[keep].view(-1)
            boxes = boxes[keep, :].view(-1, 4) / ratios
            classes = classes[keep].view(-1).int()

            for score, box, cat in zip(scores, boxes, classes):
                x1, y1, x2, y2 = box.data.tolist()
                cat = cat.item()
                if 'annotations' in data_iterator.coco.dataset:
                    cat = data_iterator.coco.getCatIds()[cat]
                detections.append({
                    'image_id': image_id,
                    'score': score.item(),
                    'bbox': [x1, y1, x2 - x1 + 1, y2 - y1 + 1],
                    'category_id': cat
                })

        if detections:
            # Save detections
            if detections_file and verbose:
                print('Writing {}...'.format(detections_file))
            detections = {'annotations': detections}
            detections['images'] = data_iterator.coco.dataset['images']
            if 'categories' in data_iterator.coco.dataset:
                detections['categories'] = [
                    data_iterator.coco.dataset['categories']
                ]
            if detections_file:
                json.dump(detections, open(detections_file, 'w'), indent=4)

            # Evaluate model on dataset
            if 'annotations' in data_iterator.coco.dataset:
                if verbose:
                    print('Evaluating model...')
                with redirect_stdout(None):
                    coco_pred = data_iterator.coco.loadRes(
                        detections['annotations'])
                    coco_eval = COCOeval(data_iterator.coco, coco_pred,
                                         'bbox')
                    coco_eval.evaluate()
                    coco_eval.accumulate()
                coco_eval.summarize()
                results = coco_eval.stats

                # Create TensorBoard writer
                if logdir is not None:
                    from tensorboardX import SummaryWriter
                    if is_master and verbose:
                        print('Infer writer: Writing TensorBoard logs to: {}'.
                              format(logdir))
                    writer = SummaryWriter(logdir=logdir)
                    # fixed: was `results != []`, an elementwise
                    # ndarray-vs-list comparison.
                    if len(results) != 0:
                        writer.add_scalar(
                            'Average Precision/IoU=0.50:0.95/area=all/maxDets=100',
                            results[0], iteration)
                        writer.add_scalar(
                            'Average Precision/IoU=0.50/area=all/maxDets=100',
                            results[1], iteration)
                        writer.add_scalar(
                            'Average Precision/IoU=0.75/area=all/maxDets=100',
                            results[2], iteration)
                        writer.add_scalar(
                            'Average Precision/IoU=0.50:0.95/area=small/maxDets=100',
                            results[3], iteration)
                        writer.add_scalar(
                            'Average Precision/IoU=0.50:0.95/area=medium/maxDets=100',
                            results[4], iteration)
                        writer.add_scalar(
                            'Average Precision/IoU=0.50:0.95/area=large/maxDets=100',
                            results[5], iteration)
                        writer.add_scalar(
                            'Average Recall/IoU=0.50:0.95/area=all/maxDets=1',
                            results[6], iteration)
                        writer.add_scalar(
                            'Average Recall/IoU=0.50:0.95/area=all/maxDets=10',
                            results[7], iteration)
                        writer.add_scalar(
                            'Average Recall/IoU=0.50:0.95/area=all/maxDets=100',
                            results[8], iteration)
                        writer.add_scalar(
                            'Average Recall/IoU=0.50:0.95/area= small/maxDets=100',
                            results[9], iteration)
                        writer.add_scalar(
                            'Average Recall/IoU=0.50:0.95/area=medium/maxDets=100',
                            results[10], iteration)
                        writer.add_scalar(
                            'Average Recall/IoU=0.50:0.95/area= large/maxDets=100',
                            results[11], iteration)
                    writer.close()
        else:
            print('No detections!')

        # Visualize a few predicted vs ground-truth boxes in TensorBoard.
        if logdir is not None and detections_file is not None:
            from tensorboardX import SummaryWriter
            if is_master and verbose:
                print('Writing TensorBoard logs to: {}'.format(logdir))
            writer = SummaryWriter(logdir=logdir)

            def get_bounding_boxes(annotations: List, image_id: int) -> List:
                # All annotations belonging to one image.
                return [a for a in annotations if a["image_id"] == image_id]

            with open(detections_file, "r") as file:
                all_detections = json.load(file)
            with open(annotations, "r") as file:
                all_ground_truths = json.load(file)

            i = 0
            for image_json in all_detections["images"][:3]:
                image_id = image_json["id"]
                image_path = path + '/' + image_json["file_name"]
                image = io.imread(image_path)
                # Sanity check: detection and ground-truth files must agree
                # on which file this image id refers to.
                assert (image_json["file_name"] == [
                    x["file_name"] for x in all_ground_truths["images"]
                    if x["id"] == image_id
                ][0])
                fig, ax = plt.subplots(figsize=(16, 16))
                ax.imshow(image)
                detections = get_bounding_boxes(all_detections["annotations"],
                                                image_id)
                detections = [d for d in detections if d["score"] > 0.5]
                ground_truths = get_bounding_boxes(
                    all_ground_truths["annotations"], image_id)
                # Predictions in red with class/score labels...
                for d in detections:
                    x, y, width, height = d["bbox"]
                    score = d["score"]
                    category_id = d["category_id"]
                    rectangle = patches.Rectangle(
                        (x, y),
                        width,
                        height,
                        linewidth=2,
                        edgecolor="r",
                        facecolor="none",
                    )
                    ax.add_patch(rectangle)
                    ax.text(
                        x,
                        y - 4,
                        f"{category_id}: {score:0.2f}",
                        color="r",
                        fontsize=20,
                        fontweight="bold",
                    )
                # ...ground truth in blue.
                for gt in ground_truths:
                    x, y, width, height = gt["bbox"]
                    rectangle = patches.Rectangle(
                        (x, y),
                        width,
                        height,
                        linewidth=2,
                        edgecolor="b",
                        facecolor="none",
                    )
                    ax.add_patch(rectangle)
                ax.axis("off")
                writer.add_figure('images', fig, i)
                i += 1
            writer.close()
with torch.no_grad(): # Everything is in one batch, so this loop will only happen once for i, data in enumerate(vizloader): maze_loc_goal_ssps, directions, locs, goals = data outputs = model(maze_loc_goal_ssps) loss = criterion(outputs, directions) # print(loss.data.item()) if args.logdir != '': fig_pred = plot_path_predictions(directions=outputs, coords=locs, type='colour') writer.add_figure('viz set predictions', fig_pred) fig_truth = plot_path_predictions(directions=directions, coords=locs, type='colour') writer.add_figure('ground truth', fig_truth) fig_pred_quiver = plot_path_predictions(directions=outputs, coords=locs, dcell=xs[1] - xs[0]) writer.add_figure('viz set predictions quiver', fig_pred_quiver) fig_truth_quiver = plot_path_predictions(directions=directions, coords=locs, dcell=xs[1] - xs[0]) writer.add_figure('ground truth quiver', fig_truth_quiver) writer.add_scalar('viz_loss', loss.data.item())
def main(args):
    """Train a PointLSTM radar classifier end-to-end.

    Builds train/test datasets, restores a checkpoint when one exists, then
    runs the training loop with periodic evaluation, TensorBoard logging and
    checkpointing.

    Args:
        args: parsed CLI namespace; fields used include cuda, gpudevice,
            exp_name, datapath, head_network, sequence_length, batchsize,
            num_workers, numclasses, optimizer, lr, decay_rate, lr_epoch_half,
            epoch, feature_transform, train_metric.
    """
    ''' --- SELECT DEVICES --- '''
    # Select either gpu or cpu
    device = torch.device("cuda" if args.cuda else "cpu")
    # Select among available GPUs
    if args.cuda:
        os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(
            str(x) for x in args.gpudevice)
    ''' --- CREATE EXPERIMENTS DIRECTORY AND LOGGERS IN TENSORBOARD --- '''
    projdir = sys.path[0]
    # Path for saving and loading the network.
    # NOTE(review): backslash separators are Windows-specific — confirm this
    # script only runs on Windows.
    saveloadpath = os.path.join(projdir, 'experiment\\checkpoints',
                                args.exp_name + '.pth')
    Path(os.path.dirname(saveloadpath)).mkdir(exist_ok=True, parents=True)
    # timestamp = str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M'))
    tblogdir = os.path.join(projdir, 'experiment\\tensorboardX',
                            args.exp_name)  # + '_' + timestamp )
    Path(tblogdir).mkdir(exist_ok=True, parents=True)
    # Create tb_writer (used to log to TensorBoard); flush_secs controls how
    # many seconds to wait before flushing pending events to disk.
    tb_writer = SummaryWriter(logdir=tblogdir, flush_secs=3,
                              write_to_disk=True)
    ''' --- INIT DATASETS AND DATALOADER (FOR SINGLE EPOCH) --- '''
    # Read data from file and create training/testing splits of multi-frame
    # sequences.  Ts is the per-frame recording period: the radar records one
    # frame every 82 ms (needed for the LSTM timing).
    train_dataset, test_dataset, class_names = read_dataset(
        args.datapath, Ts=0.082, train_test_split=0.8)
    # Test transforms: NO data augmentation on the test set.  Resampling to a
    # fixed number of points per frame is required when a PointNet/PointNet++
    # head is the feature extractor; the 'manual' head uses engineered
    # features instead.
    test_dataTransformations = transforms.Compose([
        NormalizeTime(),
        FeatureEngineering() if args.head_network == 'manual' else Resampling(
            maxPointsPerFrame=10)
    ])
    testDataset = RadarClassDataset(dataset=test_dataset,
                                    transforms=test_dataTransformations,
                                    sequence_length=args.sequence_length)
    # Train transforms: same head-dependent choice, plus augmentation.
    train_dataTransformations = transforms.Compose([
        NormalizeTime(),
        DataAugmentation(),
        FeatureEngineering() if args.head_network == 'manual' else Resampling(
            maxPointsPerFrame=10)
    ])
    trainDataset = RadarClassDataset(dataset=train_dataset,
                                     transforms=train_dataTransformations,
                                     sequence_length=args.sequence_length)
    # Create dataloader for training: batch_size frames per batch.
    trainDataLoader = DataLoader(trainDataset,
                                 batch_size=args.batchsize,
                                 shuffle=True,
                                 num_workers=args.num_workers)
    ''' --- INIT NETWORK MODEL --- '''
    # Load PointLSTM network model and move it to the chosen device.
    classifier = PointLSTM(
        head_name=args.head_network,
        num_class=args.numclasses,
        pointCoordDim=6,
        # For the 'manual' head num_features must match the number of features
        # produced by FeatureEngineering (34, not 15, because 'hist_v' and
        # 'hist_RCS' are [10, 1] vectors and 'eig_cov_xy' is a [2, 1] vector).
        num_features=34 if args.head_network == 'manual' else 128
    ).to(device)
    ''' --- LOAD NETWORK IF EXISTS --- '''
    if os.path.exists(saveloadpath):
        print('Using pretrained model found...')
        checkpoint = torch.load(saveloadpath)
        # +1 so reporting counts epochs from 1, 2, ... rather than 0, 1, ...
        start_epoch = checkpoint['epoch'] + 1
        iteration = checkpoint['iteration']
        best_test_acc = checkpoint['test_accuracy']
        classifier.load_state_dict(checkpoint['model_state_dict'])
    else:
        print('No existing model, starting training from scratch...')
        # Counting starts from 1 for both epoch and iteration reporting.
        start_epoch = 1
        iteration = 1
        best_test_acc = 0
    ''' --- CREATE OPTIMIZER ---'''
    # NOTE(review): an unrecognized args.optimizer leaves `optimizer` unbound
    # and raises NameError below — presumably argparse restricts the choices.
    if args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(classifier.parameters(),
                                    lr=args.lr,
                                    momentum=0.9)
    elif args.optimizer == 'ADAM':
        optimizer = torch.optim.Adam(classifier.parameters(),
                                     lr=args.lr,
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=args.decay_rate)
    # Halve (gamma=0.5) the learning rate every 'step_size' epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=args.lr_epoch_half, gamma=0.5)
    # log info
    printparams = 'Model parameters:' + json.dumps(
        vars(args), indent=4, sort_keys=True)
    print(printparams)
    tb_writer.add_text('hyper-parameters', printparams, iteration)
    # tb_writer.add_hparam(args)
    tb_writer.add_text(
        'dataset',
        'dataset sample size: training: {}, test: {}'.format(
            train_dataset.shape[0], test_dataset.shape[0]), iteration)
    ''' --- START TRANING ---'''
    for epoch in range(start_epoch, args.epoch + 1):
        print('Epoch %d/%s:' % (epoch, args.epoch))
        # Log the current learning rate to TensorBoard.
        tb_writer.add_scalar('learning_rate',
                             optimizer.param_groups[0]['lr'], iteration)
        for batch_id, data in tqdm(enumerate(trainDataLoader, 0),
                                   total=len(trainDataLoader),
                                   smoothing=0.9):
            # data: (B:batch x S:seq x C:features x N:points) , (B x S:seq)
            points, target = data
            # Convert to float (avoids dtype problems inside the model) and
            # move to the device.
            points, target = points.float().to(device), target.float().to(
                device)
            # points, target = points.float().to(device), target.float().to(device)
            # Reset gradients
            optimizer.zero_grad()
            # Sets the module in training mode
            classifier = classifier.train()
            # Forward propagation
            pred = classifier(points)
            # The network outputs log_softmax, and log_softmax -> nll_loss is
            # equivalent to CrossEntropyLoss, so F.nll_loss suffices here.
            loss = F.nll_loss(pred, target.long())
            if args.head_network == 'pointnet':
                loss += feature_transform_regularizer(
                    classifier.head.trans) * 0.001
                if args.feature_transform:
                    loss += feature_transform_regularizer(
                        classifier.head.trans_feat) * 0.001
            # Back propagate
            loss.backward()
            # Update weights
            optimizer.step()
            # Log the training loss once every 5 batches.
            if not batch_id % 5:
                tb_writer.add_scalar('train_loss/cross_entropy', loss.item(),
                                     iteration)
            iteration += 1
            # if batch_id> 2: break
        scheduler.step()
        ''' --- TEST AND SAVE NETWORK --- '''
        if not epoch % 10:
            # Evaluate on both splits every 10 epochs.
            train_targ, train_pred = test(classifier,
                                          trainDataset,
                                          device,
                                          num_workers=args.num_workers,
                                          batch_size=1800)
            test_targ, test_pred = test(classifier,
                                        testDataset,
                                        device,
                                        num_workers=args.num_workers,
                                        batch_size=1800)
            train_acc = metrics_accuracy(train_targ, train_pred)
            test_acc = metrics_accuracy(test_targ, test_pred)
            print('\r Training loss: {}'.format(loss.item()))
            print('Train Accuracy: {}\nTest Accuracy: {}'.format(
                train_acc, test_acc))
            tb_writer.add_scalars('metrics/accuracy', {
                'train': train_acc,
                'test': test_acc
            }, iteration)
            # Confusion matrices (absolute and normalized) to TensorBoard.
            confmatrix_test = metrics_confusion_matrix(test_targ, test_pred)
            print('Test confusion matrix: \n', confmatrix_test)
            fig, ax = plot_confusion_matrix(confmatrix_test,
                                            class_names,
                                            normalize=False,
                                            title='Test Confusion Matrix')
            fig_n, ax_n = plot_confusion_matrix(
                confmatrix_test,
                class_names,
                normalize=True,
                title='Test Confusion Matrix - Normalized')
            tb_writer.add_figure('test_confusion_matrix/abs',
                                 fig, global_step=iteration, close=True)
            tb_writer.add_figure('test_confusion_matrix/norm',
                                 fig_n, global_step=iteration, close=True)
            # Log precision recall curves
            for idx, clsname in enumerate(class_names):
                # Convert log_softmax back to a probability and select the
                # column for this class.
                test_pred_binary = torch.exp(test_pred[:, idx])
                test_targ_binary = test_targ.eq(idx)
                tb_writer.add_pr_curve(tag='pr_curves/' + clsname,
                                       labels=test_targ_binary,
                                       predictions=test_pred_binary,
                                       global_step=iteration)
            ''' --- SAVE NETWORK --- '''
            # if (test_acc >= best_test_acc): # for now lets save every time,
            # since we are only testing on a subset of the test dataset
            best_test_acc = test_acc  # if test_acc > best_test_acc else best_test_acc
            state = {
                'epoch': epoch,
                'iteration': iteration,
                'train_accuracy': train_acc if args.train_metric else 0.0,
                'test_accuracy': best_test_acc,
                'model_state_dict': classifier.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }
            torch.save(state, saveloadpath)
            print('Model saved!!!')
            print('Best Accuracy: %f' % best_test_acc)
    tb_writer.close()
def train(batch_size, num_train_steps, generator, discriminator, model_dir,
          beat_type, device):
    """Train a WGAN (weight-clipping variant) on ECG heartbeats.

    The discriminator is updated ``num_d_iters`` times per generator update,
    with parameters clamped to [-c, c] after each step (classic WGAN).
    Checkpoints are written to ``model_dir`` once per epoch, and losses,
    gradient norms and sample figures go to TensorBoard.

    Args:
        batch_size: mini-batch size; short final batches are skipped.
        num_train_steps: total number of generator iterations to run.
        generator / discriminator: the two networks (moved to float here).
        model_dir: directory for TensorBoard logs and checkpoints.
        beat_type: heartbeat class to train on (one-vs-all dataset config).
        device: torch device for tensors.
    """
    #
    # Support for tensorboard:
    #
    writer = SummaryWriter(model_dir)
    #
    # 1. create the ECG dataset:
    #
    positive_configs = dataset_configs.DatasetConfigs(
        'train',
        beat_type,
        one_vs_all=True,
        lstm_setting=False,
        over_sample_minority_class=False,
        under_sample_majority_class=False,
        only_take_heartbeat_of_type=beat_type,
        add_data_from_gan=False,
        gan_configs=None)
    dataset = ecg_dataset_pytorch.EcgHearBeatsDatasetPytorch(
        positive_configs, transform=ecg_dataset_pytorch.ToTensor())
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=1)
    print("Size of real dataset is {}".format(len(dataset)))
    #
    # 2. Create the Networks:
    #
    netG = generator.float()
    netD = discriminator.float()
    num_d_iters = 5
    weight_cliping_limit = 0.01
    #
    # Loss functions for WGAN:
    #
    # Optimizers:
    #
    # WGAN values from paper
    lr = 0.00005
    writer.add_scalar('Learning_Rate', lr)
    # WGAN with gradient clipping uses RMSprop instead of ADAM
    optimizer_d = torch.optim.RMSprop(netD.parameters(), lr=lr)
    optimizer_g = torch.optim.RMSprop(netG.parameters(), lr=lr)
    # Fixed noise for validation figures:
    val_noise = torch.from_numpy(np.random.uniform(
        0, 1, (4, 100))).float().to(device)
    # History buffers (collected but only appended to here).
    loss_d_real_hist = []
    loss_d_fake_hist = []
    loss_g_fake_hist = []
    norma_grad_g = []
    norm_grad_d = []
    d_real_pred_hist = []
    d_fake_pred_hist = []
    epoch = 0
    iters = 0
    while True:
        num_of_beats_seen = 0
        if iters == num_train_steps:
            break
        for i, data in enumerate(dataloader):
            if iters == num_train_steps:
                break
            # Train Discriminator forward-loss-backward-update num_d_iters
            # times, then 1 Generator forward-loss-backward-update.
            for p in netD.parameters():
                p.requires_grad = True
            for d_iter in range(num_d_iters):
                netD.zero_grad()
                # Clamp parameters to [-c, c], c = weight_cliping_limit
                for p in netD.parameters():
                    p.data.clamp_(-weight_cliping_limit, weight_cliping_limit)
                ecg_batch = data['cardiac_cycle'].float().to(device)
                b_size = ecg_batch.shape[0]
                # Skip partial batches so every update sees full batch_size.
                if (b_size != batch_size):
                    continue
                num_of_beats_seen += ecg_batch.shape[0]
                output = netD(ecg_batch)
                # Adversarial loss (WGAN critic: maximize D on real beats).
                loss_d_real = -torch.mean(output)
                writer.add_scalar('Discriminator/cross_entropy_on_real_batch',
                                  loss_d_real.item(),
                                  global_step=iters)
                writer.add_scalars(
                    'Merged/losses',
                    {'d_cross_entropy_on_real_batch': loss_d_real.item()},
                    global_step=iters)
                loss_d_real.backward()
                loss_d_real_hist.append(loss_d_real.item())
                mean_d_real_output = output.mean().item()
                d_real_pred_hist.append(mean_d_real_output)
                #
                # D loss from fake:
                #
                noise_input = torch.from_numpy(
                    np.random.uniform(0, 1, (b_size, 100))).float().to(device)
                output_g_fake = netG(noise_input)
                # detach() so generator gradients are not computed here.
                output = netD(output_g_fake.detach())
                loss_d_fake = torch.mean(output)
                # ce_loss_d_fake = cross_entropy_loss(output, labels)
                writer.add_scalar('Discriminator/cross_entropy_on_fake_batch',
                                  loss_d_fake.item(), iters)
                writer.add_scalars(
                    'Merged/losses',
                    {'d_cross_entropy_on_fake_batch': loss_d_fake.item()},
                    global_step=iters)
                loss_d_fake.backward()
                loss_d_fake_hist.append(loss_d_fake.item())
                mean_d_fake_output = output.mean().item()
                d_fake_pred_hist.append(mean_d_fake_output)
                total_loss_d = loss_d_fake + loss_d_real
                writer.add_scalar(tag='Discriminator/total_loss',
                                  scalar_value=total_loss_d.item(),
                                  global_step=iters)
                optimizer_d.step()
            #
            # Generator updates:
            #
            for p in netD.parameters():
                p.requires_grad = False  # to avoid computation
            netG.zero_grad()
            noise_input = torch.from_numpy(
                np.random.uniform(0, 1, (batch_size, 100))).float().to(device)
            output_g_fake = netG(noise_input)
            output = netD(output_g_fake)
            # Adversarial loss (generator tries to maximize D on fakes).
            loss_g_fake = -torch.mean(output)
            loss_g_fake.backward()
            loss_g_fake_hist.append(loss_g_fake.item())
            writer.add_scalar(tag='Generator/cross_entropy_on_fake_batch',
                              scalar_value=loss_g_fake.item(),
                              global_step=iters)
            writer.add_scalars(
                'Merged/losses',
                {'g_cross_entropy_on_fake_batch': loss_g_fake.item()},
                global_step=iters)
            mean_d_fake_output_2 = output.mean().item()
            optimizer_g.step()
            print(
                "{}/{}: Epoch #{}: Iteration #{}: Mean D(real_hb_batch) = {}, mean D(G(z)) = {}."
                .format(num_of_beats_seen, len(dataset), epoch, iters,
                        mean_d_real_output, mean_d_fake_output),
                end=" ")
            print("mean D(G(z)) = {} After backprop of D".format(
                mean_d_fake_output_2))
            print(
                "Loss D from real beats = {}. Loss D from Fake beats = {}. Total Loss D = {}"
                .format(loss_d_real, loss_d_fake, total_loss_d),
                end=" ")
            print("Loss G = {}".format(loss_g_fake))
            # Norms of gradients:
            gNormGrad = get_gradient_norm_l2(netG)
            dNormGrad = get_gradient_norm_l2(netD)
            writer.add_scalar('Generator/gradients_norm', gNormGrad, iters)
            writer.add_scalar('Discriminator/gradients_norm', dNormGrad,
                              iters)
            norm_grad_d.append(dNormGrad)
            norma_grad_g.append(gNormGrad)
            print(
                "Generator Norm of gradients = {}. Discriminator Norm of gradients = {}."
                .format(gNormGrad, dNormGrad))
            # Every 25 iterations, plot fake vs real beats for inspection.
            if iters % 25 == 0:
                with torch.no_grad():
                    output_g = netG(val_noise)
                    fig = plt.figure()
                    plt.title(
                        "Fake beats from Generator. iteration {}".format(i))
                    for p in range(4):
                        plt.subplot(2, 2, p + 1)
                        plt.plot(output_g[p].cpu().detach().numpy(),
                                 label="fake beat")
                        plt.plot(ecg_batch[p].cpu().detach().numpy(),
                                 label="real beat")
                        plt.legend()
                    writer.add_figure('Generator/output_example', fig, iters)
                    plt.close()
            iters += 1
        epoch += 1
        # Per-epoch checkpoint of both networks and optimizers.
        torch.save(
            {
                'epoch': epoch,
                'generator_state_dict': netG.state_dict(),
                'discriminator_state_dict': netD.state_dict(),
                'optimizer_g_state_dict': optimizer_g.state_dict(),
                'optimizer_d_state_dict': optimizer_d.state_dict(),
            },
            model_dir + '/checkpoint_epoch_{}_iters_{}'.format(epoch, iters))
    writer.close()
def test(test_loader, model):
    """Evaluate ``model`` on ``test_loader`` and write metrics/artifacts.

    Computes top-1/top-5 accuracy and a confusion matrix, logs per-image
    predictions to ``upload.csv``, writes the (normalized) confusion-matrix
    figure to TensorBoard, and appends precision/recall per class plus timing
    to a text report.  Relies on module-level ``config`` and ``obj`` (class
    names) and requires CUDA.
    """
    top1 = AverageMeter(config)
    top5 = AverageMeter(config)
    matrix = runningScore(config=config)
    matrix.reset()
    times = 0.0
    timeall = 0.0
    precision1 = 0
    precision5 = 0
    # 3.1 confirm the model converted to cuda
    # progress bar
    test_progressor = ProgressBar(mode="test",
                                  model_name=config.model_name,
                                  total=len(test_loader),
                                  weights=config.weights,
                                  Status=config.Status,
                                  current_time=config.time)
    # 2.2 switch to evaluate mode and confirm model has been transfered to cuda
    model.cuda()
    model.eval()
    with torch.no_grad():
        for i, sample in enumerate(test_loader):
            image = sample['image']
            target = sample['label']
            test_progressor.current = i
            # Second, all-zeros input of the same shape as the image —
            # presumably an auxiliary branch the model expects; confirm
            # against the model definition.
            input2_size = image.size()
            input2 = np.zeros(input2_size).astype(np.float32)
            input2 = torch.from_numpy(input2).cuda()
            input = image.cuda()
            target = target.cuda()
            # target = Variable(target).cuda()
            # 2.2.1 compute output (synchronize around the call so the timing
            # measures the full GPU forward pass).
            torch.cuda.synchronize()
            start = time.time()
            _, output = model(input2, input)
            torch.cuda.synchronize()
            end = time.time()
            times = end - start
            timeall = timeall + times
            # 2.2.2 measure accuracy and record loss
            precision1, precision5 = accuracy(output, target, topk=(1, 5))
            matrix.update(output, target)
            top1.update(precision1[0], input.size(0))
            top5.update(precision5[0], input.size(0))
            test_progressor.current_top1 = top1.avg
            test_progressor.current_top5 = top5.avg
            test_progressor()
            # Per-sample predicted vs. true label written to upload.csv.
            # NOTE(review): target.item()/predicted.item() imply batch size 1
            # — confirm the loader; also `f` is opened per iteration and
            # never closed (relies on GC to flush).
            _, predicted = torch.max(output, 1)
            tag = obj[predicted.item()]
            right_label = obj[target.item()]
            resultdir = os.path.join(config.weights, config.model_name,
                                     config.Status, config.time)
            if os.path.exists(resultdir):
                pass
            else:
                os.makedirs(resultdir)
            f = open(resultdir + '/upload.csv', 'a')
            csv_writer = csv.writer(f)
            csv_writer.writerow([right_label, tag])
    test_progressor.done()
    logdir = os.path.join(config.weights, config.model_name, config.Status,
                          config.time)
    writer = SummaryWriter(logdir)
    confusion_matrix = matrix.get_value()
    np.save(logdir + '/confusion.npy', confusion_matrix)
    writer.add_figure('confusion matrix',
                      figure=plot_confusion_matrix(
                          confusion_matrix,
                          object_names=obj,
                          title='Normalized confusion matrix',
                          config=config,
                          normalize=True),
                      global_step=1)
    writer.close()
    # Append the per-class precision/recall report and timing summary.
    precision, recall = matrix.get_scores()
    with open(
            os.path.join(config.weights, config.model_name, config.Status,
                         config.time) + "/%s_test.txt" % config.model_name,
            "a") as f:
        for i in range(config.num_classes):
            print('Precision of %5s : %f %%' % (obj[i], 100 * precision[i]),
                  file=f)
            print('Recall of %5s: %f%%' % (obj[i], 100 * recall[i]), file=f)
        print("Top1:%f,Top5:%f" % (top1.avg, top5.avg), file=f)
        print("avg Time:", timeall * 1000 / len(test_loader), "ms", file=f)
writer.add_scalar('loss-gen', loss_gen, n_gen_update) writer.add_scalar('n_clip_params', float(n_clip_params)/n_params, n_gen_update) writer.add_scalar('gen-grad-norm', gen_grad_norm, n_gen_update) writer.add_scalar('dis-grad-norm', dis_grad_norm, n_gen_update) writer.add_scalar('avg_loss-dis', loss_dis_avg, n_gen_update) writer.add_scalar('avg_loss-gen', loss_gen_avg, n_gen_update) writer.add_scalar('avg_n_clip_params', float(n_clip_params_avg)/n_params, n_gen_update) writer.add_scalar('avg_grad-norm-gen', gen_grad_norm_avg, n_gen_update) writer.add_scalar('avg_grad-norm-dis', dis_grad_norm_avg, n_gen_update) x_gen = gen(z) fig = plt.figure() plt.hist(x_gen.cpu().squeeze().data, bins=100) writer.add_figure('hist', fig, n_gen_update) plt.clf() fig = plt.figure() plt.hist(x_gen_avg.cpu().squeeze().data, bins=100) writer.add_figure('hist_avg', fig, n_gen_update) plt.clf() if args.save_stats: if n_gen_update == 1: checkpoint_1 = torch.load(os.path.join(OUTPUT_PATH, 'checkpoints/%i.state'%(n_gen_update)), map_location=device) if n_gen_update > 1: checkpoint_2 = torch.load(os.path.join(OUTPUT_PATH, 'checkpoints/%i.state'%(n_gen_update)), map_location=device) hist = compute_path_stats(gen, dis, checkpoint_1, checkpoint_2, dataloader, args, model_loss_gen, model_loss_dis, device, verbose=True)
def eval_epoch(model, data_loader, fold, epoch):
    """Run one evaluation pass for ``fold`` and return the metrics dict.

    Collects logits/labels over the whole loader, assigns classes per
    experiment group, computes the competition metric, and logs scalars plus
    a sample image grid to a per-fold TensorBoard writer.
    NOTE(review): the writer is created here on every call and never closed —
    relies on SummaryWriter's background flushing.
    """
    writer = SummaryWriter(
        os.path.join(args.experiment_path, 'fold{}'.format(fold), 'eval'))
    metrics = {
        'loss': utils.Mean(),
    }
    model.eval()
    with torch.no_grad():
        fold_labels = []
        fold_logits = []
        fold_exps = []
        for images, feats, exps, labels, _ in tqdm(
                data_loader, desc='epoch {} evaluation'.format(epoch)):
            images, feats, labels = images.to(DEVICE), feats.to(
                DEVICE), labels.to(DEVICE)
            logits = model(images, feats)
            loss = compute_loss(input=logits, target=labels)
            metrics['loss'].update(loss.data.cpu().numpy())
            fold_labels.append(labels)
            fold_logits.append(logits)
            fold_exps.extend(exps)
        fold_labels = torch.cat(fold_labels, 0)
        fold_logits = torch.cat(fold_logits, 0)
        # Every 10 epochs search for the best softmax temperature and log it;
        # the search result is only logged — predictions below always use the
        # default temperature of 1.
        if epoch % 10 == 0:
            temp, metric, fig = find_temp_global(probs=fold_logits,
                                                 target=fold_labels,
                                                 exps=fold_exps)
            writer.add_scalar('temp', temp, global_step=epoch)
            writer.add_scalar('metric_final', metric, global_step=epoch)
            writer.add_figure('temps', fig, global_step=epoch)
        temp = 1.  # use default temp
        fold_preds = assign_classes(probs=to_prob(fold_logits,
                                                  temp).data.cpu().numpy(),
                                    exps=fold_exps)
        fold_preds = torch.tensor(fold_preds).to(fold_logits.device)
        metric = compute_metric(input=fold_preds,
                                target=fold_labels,
                                exps=fold_exps)
        # Fold running means into plain numpy scalars for printing/logging.
        metrics = {k: metrics[k].compute_and_reset() for k in metrics}
        for k in metric:
            metrics[k] = metric[k].mean().data.cpu().numpy()
        # `images` here is the last batch from the loop above; keep 16 for
        # the sample grid.
        images = images_to_rgb(images)[:16]
    print('[FOLD {}][EPOCH {}][EVAL] {}'.format(
        fold, epoch,
        ', '.join('{}: {:.4f}'.format(k, metrics[k]) for k in metrics)))
    for k in metrics:
        writer.add_scalar(k, metrics[k], global_step=epoch)
    writer.add_image('images',
                     torchvision.utils.make_grid(
                         images,
                         nrow=math.ceil(math.sqrt(images.size(0))),
                         normalize=True),
                     global_step=epoch)
    return metrics
def main(args):
    """Train the FastSpeech model with periodic logging and checkpointing.

    Restores the latest checkpoint if ``checkpoint.txt`` exists, then runs
    the epoch loop: per-step cross-entropy loss over (mel_target + norm_f0)/2,
    gradient clipping, scheduled LR updates, TensorBoard/figure logging and
    checkpoint saving every ``hp.save_step`` steps.  Requires CUDA.
    """
    # Get device
    device = 'cuda'
    # Define model
    model = FastSpeech().to(device)
    print("Model Has Been Defined")
    num_param = utils.get_param_num(model)
    print('Number of FastSpeech Parameters:', num_param)
    current_time = time.strftime("%Y-%m-%dT%H:%M", time.localtime())
    writer = SummaryWriter(log_dir='log/' + current_time)
    optimizer = torch.optim.Adam(model.parameters(),
                                 betas=(0.9, 0.98),
                                 eps=1e-9)
    # Load checkpoint if exists.
    # NOTE(review): the bare except treats ANY failure (corrupt checkpoint,
    # key error, ...) as "no checkpoint" and silently starts fresh.
    try:
        checkpoint_in = open(
            os.path.join(hp.checkpoint_path, 'checkpoint.txt'), 'r')
        args.restore_step = int(checkpoint_in.readline().strip())
        checkpoint_in.close()
        checkpoint = torch.load(
            os.path.join(hp.checkpoint_path,
                         'checkpoint_%08d.pth' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n---Model Restored at Step %d---\n" % args.restore_step)
    except:
        print("\n---Start New Training---\n")
        if not os.path.exists(hp.checkpoint_path):
            os.mkdir(hp.checkpoint_path)
    # Get dataset
    dataset = FastSpeechDataset()
    # Optimizer and loss
    scheduled_optim = ScheduledOptim(optimizer, hp.d_model, hp.n_warm_up_step,
                                     args.restore_step)
    fastspeech_loss = FastSpeechLoss().to(device)
    print("Defined Optimizer and Loss Function.")
    # Init logger
    if not os.path.exists(hp.logger_path):
        os.mkdir(hp.logger_path)
    # Per-step wall-clock samples used for the ETR estimate.
    Time = np.array([])
    Start = time.perf_counter()
    # Training
    model = model.train()
    t_l = 0.0
    for epoch in range(hp.epochs):
        # Get Training Loader.  Each loader element is itself a list of
        # sub-batches (hence batch_size**2 here and the inner j-loop).
        training_loader = DataLoader(dataset,
                                     batch_size=hp.batch_size**2,
                                     shuffle=True,
                                     collate_fn=collate_fn,
                                     drop_last=True,
                                     num_workers=0)
        total_step = hp.epochs * len(training_loader) * hp.batch_size
        for i, batchs in enumerate(training_loader):
            for j, data_of_batch in enumerate(batchs):
                start_time = time.perf_counter()
                # Global step counter; +1 so counting starts at 1.
                current_step = i * hp.batch_size + j + args.restore_step + \
                    epoch * len(training_loader) * hp.batch_size + 1
                # Init
                scheduled_optim.zero_grad()
                # Get Data
                condition1 = torch.from_numpy(
                    data_of_batch["condition1"]).long().to(device)
                condition2 = torch.from_numpy(
                    data_of_batch["condition2"]).long().to(device)
                mel_target = torch.from_numpy(
                    data_of_batch["mel_target"]).long().to(device)
                norm_f0 = torch.from_numpy(
                    data_of_batch["norm_f0"]).long().to(device)
                mel_in = torch.from_numpy(
                    data_of_batch["mel_in"]).float().to(device)
                D = torch.from_numpy(data_of_batch["D"]).int().to(device)
                mel_pos = torch.from_numpy(
                    data_of_batch["mel_pos"]).long().to(device)
                src_pos = torch.from_numpy(
                    data_of_batch["src_pos"]).long().to(device)
                lens = data_of_batch["lens"]
                max_mel_len = data_of_batch["mel_max_len"]
                # Forward
                mel_output = model(src_seq1=condition1,
                                   src_seq2=condition2,
                                   mel_in=mel_in,
                                   src_pos=src_pos,
                                   mel_pos=mel_pos,
                                   mel_max_length=max_mel_len,
                                   length_target=D)
                # Cal Loss: per-sequence cross entropy up to each true length,
                # against the average of mel_target and norm_f0 (integer mean).
                Loss = torch.nn.CrossEntropyLoss()
                predict = mel_output.transpose(1, 2)
                target1 = mel_target.long().squeeze()
                target2 = norm_f0.long().squeeze()
                target = ((target1 + target2) / 2).long().squeeze()
                losses = []
                for index in range(predict.shape[0]):
                    losses.append(
                        Loss(predict[index, :, :lens[index]].transpose(0, 1),
                             target[index, :lens[index]]).unsqueeze(0))
                total_loss = torch.cat(losses).mean()
                t_l += total_loss.item()
                with open(os.path.join("logger", "total_loss.txt"),
                          "a") as f_total_loss:
                    f_total_loss.write(str(t_l) + "\n")
                # Backward — skipped (with a dump of the inputs) if the
                # accumulated loss has gone NaN.
                if not np.isnan(t_l):
                    total_loss.backward()
                else:
                    print(condition1, condition2, D)
                # Clipping gradients to avoid gradient explosion
                nn.utils.clip_grad_norm_(model.parameters(),
                                         hp.grad_clip_thresh)
                # Update weights
                if args.frozen_learning_rate:
                    scheduled_optim.step_and_update_lr_frozen(
                        args.learning_rate_frozen)
                else:
                    scheduled_optim.step_and_update_lr()
                # Print / log every hp.log_step steps; t_l is the running sum
                # since the last log, so t_l / hp.log_step is the mean loss.
                if current_step % hp.log_step == 0:
                    Now = time.perf_counter()
                    str1 = "Epoch[{}/{}] Step[{}/{}]:".format(
                        epoch + 1, hp.epochs, current_step, total_step)
                    str2 = "Loss:{:.4f} ".format(t_l / hp.log_step)
                    str3 = "LR:{:.6f}".format(
                        scheduled_optim.get_learning_rate())
                    str4 = "T: {:.1f}s ETR:{:.1f}s.".format(
                        (Now - Start),
                        (total_step - current_step) * np.mean(Time))
                    print('\r' + str1 + ' ' + str2 + ' ' + str3 + ' ' + str4,
                          end='')
                    writer.add_scalar('loss', t_l / hp.log_step, current_step)
                    # (tag typo 'lreaning rate' kept: renaming would break
                    # continuity of existing TensorBoard runs)
                    writer.add_scalar('lreaning rate',
                                      scheduled_optim.get_learning_rate(),
                                      current_step)
                    if hp.gpu_log_step != -1 and current_step % hp.gpu_log_step == 0:
                        os.system('nvidia-smi')
                    with open(os.path.join("logger", "logger.txt"),
                              "a") as f_logger:
                        f_logger.write(str1 + "\n")
                        f_logger.write(str2 + "\n")
                        f_logger.write(str3 + "\n")
                        f_logger.write(str4 + "\n")
                        f_logger.write("\n")
                    t_l = 0.0
                # Periodic diagnostic figures: predicted mel, softmax map and
                # a sampled trajectory against the target contour.
                if current_step % hp.fig_step == 0 or current_step == 20:
                    f = plt.figure()
                    plt.matshow(mel_output[0].cpu().detach().numpy())
                    plt.savefig('out_predicted.png')
                    plt.matshow(
                        F.softmax(predict, dim=1).transpose(
                            1, 2)[0].cpu().detach().numpy())
                    plt.savefig('out_predicted_softmax.png')
                    writer.add_figure('predict', f, current_step)
                    plt.cla()
                    f = plt.figure(figsize=(8, 6))
                    # Draw one sample per frame from the softmax distribution.
                    sample = []
                    p = F.softmax(predict, dim=1).transpose(
                        1, 2)[0].detach().cpu().numpy()
                    for index in range(p.shape[0]):
                        sample.append(np.random.choice(200, 1, p=p[index]))
                    sample = np.array(sample)
                    plt.plot(np.arange(sample.shape[0]), sample,
                             color='grey', linewidth='1')
                    # Overlay per-duration condition segments in blue.
                    for index in range(D.shape[1]):
                        x = np.arange(D[0][index].cpu().numpy()
                                      ) + D[0][:index].cpu().numpy().sum()
                        y = np.arange(D[0][index].detach().cpu().numpy())
                        if condition2[0][index].cpu().numpy() != 0:
                            y.fill(
                                (condition2[0][index].cpu().numpy() - 40.0) *
                                5)
                        plt.plot(x, y, color='blue')
                    plt.plot(np.arange(target.shape[1]),
                             target[0].squeeze().detach().cpu().numpy(),
                             color='red',
                             linewidth='1')
                    plt.savefig('out_target.png', dpi=300)
                    writer.add_figure('target', f, current_step)
                    plt.cla()
                    plt.close("all")
                if current_step % (hp.save_step) == 0:
                    print("save model at step %d ..." % current_step, end='')
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict()
                        },
                        os.path.join(hp.checkpoint_path,
                                     'checkpoint_%08d.pth' % current_step))
                    # Record the latest step so the restore logic above can
                    # find this checkpoint.
                    checkpoint_out = open(
                        os.path.join(hp.checkpoint_path, 'checkpoint.txt'),
                        'w')
                    checkpoint_out.write(str(current_step))
                    checkpoint_out.close()
                    print('save completed')
                end_time = time.perf_counter()
                Time = np.append(Time, end_time - start_time)
                # Keep the timing buffer bounded: collapse it to its mean
                # once it reaches hp.clear_Time samples.
                if len(Time) == hp.clear_Time:
                    temp_value = np.mean(Time)
                    Time = np.delete(Time, [i for i in range(len(Time))],
                                     axis=None)
                    Time = np.append(Time, temp_value)
class Watcher(logging.getLoggerClass()):
    """Rank-aware training reporter.

    On rank 0 it logs to console (and optionally a file), drives a tqdm
    progress bar, a tensorboard writer and a best-checkpoint tracker; on any
    other rank every reporting operation is a no-op so only one process talks.
    """

    def __init__(self, log_path=None, rank=0):
        # local-rank is the most important term: rank 0 is the only reporter.
        super().__init__(name="watcher-transformer")
        self.rank = rank
        self.progress_bar = None
        self.best_tracker = None
        self.tb_writer = None
        self.info_logger = None
        if self.rank == 0:
            formatter = logging.Formatter(
                '%(asctime)s %(levelname)s: - %(message)s',
                datefmt='%Y-%m-%d %H:%M:%S')
            if log_path is not None:
                fh = logging.FileHandler(log_path)
                fh.setLevel(logging.DEBUG)
                fh.setFormatter(formatter)
                self.addHandler(fh)
            ch = logging.StreamHandler()
            ch.setLevel(logging.DEBUG)
            ch.setFormatter(formatter)
            self.addHandler(ch)
            self.setLevel(logging.DEBUG)
        else:
            # Non-zero ranks stay silent.
            self.setLevel(logging.CRITICAL)

    def info(self, msg, *args, **kwargs):
        """Log at INFO level, but only on rank 0."""
        if self.rank == 0:
            super().info(msg, *args, **kwargs)

    # ----- progress bar ---- #
    def close_progress_bar(self):
        if self.rank == 0:
            if self.progress_bar is not None:
                # FIX: was `self.progressbar.close()` — an attribute that
                # __init__ never creates, so this always raised AttributeError.
                # The canonical attribute is `self.progress_bar`.
                self.progress_bar.close()

    def set_progress_bar(self, steps=0):
        if self.rank == 0:
            # FIX: store under self.progress_bar (was self.progressbar), the
            # name __init__ initialises and close_progress_bar checks.
            self.progress_bar = tqdm(
                total=steps, desc="start a new progress-bar", position=0)

    def step_progress_bar(self, info_str=None, step=1):
        if self.rank == 0:
            self.progress_bar.update(step)
            if info_str is not None:
                self.progress_bar.set_description(info_str)

    def set_languages(self, langs):
        """Initialise the langid identifier restricted to `langs`."""
        try:
            from langid.langid import LanguageIdentifier, model
        except ImportError:
            print('Please install package of langid')
            # FIX: re-raise instead of falling through to a confusing
            # NameError on the undefined LanguageIdentifier below.
            raise
        self.langid = LanguageIdentifier.from_modelstring(
            model, norm_probs=True)
        try:
            self.langid.set_languages(langs)
        except ValueError:
            # Unknown language codes: fall back to English only.
            self.langid.set_languages(['en'])

    # ----- tensorboard ---- #
    def set_tensorboard(self, path):
        if self.rank == 0:
            self.tb_writer = SummaryWriter(path)

    def add_tensorboard(self, name, value, iters, dtype='scalar'):
        """Write one scalar/figure/text entry to tensorboard (rank 0 only)."""
        if self.rank == 0:
            if dtype == 'scalar':
                self.tb_writer.add_scalar(name, value, iters)
            elif dtype == 'figure':
                self.tb_writer.add_figure(name, value, iters)
            elif dtype == 'text':
                self.tb_writer.add_text(name, value, iters)
            else:
                raise NotImplementedError

    # ----- best performance tracker ---- #
    def set_best_tracker(self, model, opt, save_path, device, *names):
        self.best_tracker = Best(
            max, *names, 'i', model=model, opt=opt, path=save_path, gpu=device)

    def acc_best_tracker(self, iters, *values):
        if self.rank == 0:
            self.best_tracker.accumulate(*values, iters)

    def detect_lang(self, line):
        """Return the most likely language code for `line`."""
        return self.langid.classify(line)[0]

    def match_lang(self, line, lang):
        """Return langid's probability that `line` is in `lang`."""
        scores = {l: v for l, v in self.langid.rank(line)}
        if lang not in scores:
            raise KeyError
        return scores[lang]
class Mytensorboard(NetManager):
    """Tensorboard helper for image/residual experiments.

    Keeps one SummaryWriter plus a scalar layout (Loss/PSNR/MSE groups) that is
    flushed together by plotScalars(). Usually used as a process-wide singleton
    via get_instance().
    """

    # Shared singleton returned by get_instance().
    INSTANCE = None

    def __init__(self, comment=''):
        # NOTE(review): NetManager.__init__ is never invoked here — confirm the
        # base class tolerates being skipped.
        self.writer = SummaryWriter(comment='_' + comment)
        # Scalar groups written together by plotScalars().
        self.writerLayout = {'Loss': {}, 'PSNR': {}, 'MSE': {}}
        self.step = 0

    @classmethod
    def get_instance(cls, comment=''):
        """Return the shared instance, creating it on first use."""
        if cls.INSTANCE is None:
            # FIX: instantiate via cls so subclasses get an instance of their
            # own type (was hard-coded Mytensorboard(...)).
            cls.INSTANCE = cls(comment=comment)
        return cls.INSTANCE

    def plotToTensorboard(self, fig, name):
        """Log a matplotlib figure and close it afterwards."""
        self.writer.add_figure(name, fig, global_step=self.step, close=True)

    def imgToTensorboard(self, img, name):
        # img = np.swapaxes(img, 0, 2)
        # if your TensorFlow + TensorBoard version are >= 1.8 or use tensorflow
        self.writer.add_image(name, img, global_step=self.step)

    def batchImageToTensorBoard(self, recon, resi, name):
        """Log reconstruction+residual (or residual alone) as an 8-bit image."""
        if recon is not None:
            img = (recon.cpu().detach().numpy() +
                   resi.cpu().detach().numpy()) * 255.0
        else:
            img = (resi.cpu().detach().numpy()) * 255.0
        img = np.clip(img, 0, 255).astype(int)
        self.writer.add_image(name, img, global_step=self.step)

    def SaveImageToTensorBoard(self, name, image):
        image = np.clip(image * 255.0, 0, 255).astype(int)
        self.writer.add_image(name, image, global_step=self.step)

    def saveImageFromTest(self, recon, resi, name):
        """Save the first reconstructed image of a batch to disk as PNG."""
        img = (recon.cpu().detach().numpy() +
               resi.cpu().detach().numpy()) * 255.0
        img = np.clip(img, 0, 255).astype(np.uint8)
        img = Image.fromarray(img[0, 0])
        # FIX: was `name + 'png'`, which produced filenames like "foopng".
        img.save(name + '.png')

    @staticmethod
    def Makegrid(imgs, nrow=None):
        """Arrange a batch into a grid, roughly square by default."""
        if nrow is None:
            nrow = math.ceil(math.sqrt(imgs.shape[0]))
        return vutils.make_grid(imgs, nrow)

    def setObjectStep(self, num_set):
        # NOTE(review): OBJECT_EPOCH is not defined in this class — presumably
        # inherited from NetManager; confirm.
        self.object_step = num_set * self.OBJECT_EPOCH

    def plotScalars(self):
        """Flush every scalar group in writerLayout at the current step."""
        for key, values in self.writerLayout.items():
            self.writer.add_scalars(key, values, self.step)

    def plotDifferent(self, img, name, percentile=90):
        """Plot a difference image (scaled to the 10-bit range), clipped
        symmetrically at the given percentile, on a diverging colormap centred
        on zero when the data spans both signs."""
        if isinstance(img, torch.Tensor):
            img = (img.cpu().detach().numpy()) * 1023.0
        else:
            img = img * 1023.0
        # Convert e.g. 90 into a symmetric 95th/5th percentile window.
        percentile = percentile + (100 - percentile) // 2
        # NOTE(review): np.percentile's `interpolation=` kwarg is deprecated in
        # favour of `method=` on NumPy >= 1.22 — confirm the pinned version.
        img = np.clip(img,
                      np.percentile(img, 100 - percentile,
                                    interpolation='higher'),
                      np.percentile(img, percentile,
                                    interpolation='lower'))
        img = np.clip(img, -1023.0, 1023.0)
        fig, ax = plt.subplots()
        if img.min() < 0 and img.max() > 0:
            # Symmetric limits so zero maps to the colormap centre.
            mymax = max(abs(img.min()), img.max())
            mymin = -mymax
        else:
            mymin = img.min()
            mymax = img.max()
        imgs = ax.imshow((img[0]).astype(int), vmin=mymin, vmax=mymax,
                         interpolation='nearest',
                         cmap=plt.cm.get_cmap('seismic'))
        v1 = np.linspace(mymin, mymax, 10, endpoint=True)
        cb = fig.colorbar(imgs, ticks=v1)
        cb.ax.set_yticklabels(["{:4.2f}".format(i) for i in v1])
        self.plotToTensorboard(fig, name + '_percentile' + str(percentile))
        return

    def plotMSEImage(self, resi, name):
        """Plot the squared residual scaled to the squared 10-bit range."""
        img = ((resi.cpu().detach().numpy()) ** 2) * 1023.0 * 1023.0
        fig, ax = plt.subplots()
        if img.min() < 0 and img.max() > 0:
            mymax = max(abs(img.min()), img.max())
            mymin = -mymax
        else:
            mymin = img.min()
            mymax = img.max()
        imgs = ax.imshow((img[0]).astype(int), vmin=mymin, vmax=mymax,
                         interpolation='nearest',
                         cmap=plt.cm.get_cmap('seismic'))
        v1 = np.linspace(mymin, mymax, 10, endpoint=True)
        cb = fig.colorbar(imgs, ticks=v1)
        cb.ax.set_yticklabels(["{:4.2f}".format(i) for i in v1])
        self.plotToTensorboard(fig, name)
        return

    def plotMAEImage(self, resi, name, percentile=90):
        """Plot the absolute residual, optionally clipped at a percentile."""
        img = (resi.cpu().detach().numpy()) * 1023.0
        np.abs(img, out=img)
        fig, ax = plt.subplots()
        if percentile < 100:
            np.clip(img, 0,
                    np.percentile(img, percentile, interpolation='lower'),
                    out=img)
        mymin = img.min()
        mymax = img.max()
        imgs = ax.imshow((img[0]).astype(int), vmin=mymin, vmax=mymax,
                         interpolation='nearest',
                         cmap=plt.cm.get_cmap('seismic'))
        v1 = np.linspace(mymin, mymax, 10, endpoint=True)
        cb = fig.colorbar(imgs, ticks=v1)
        cb.ax.set_yticklabels(["{:4.2f}".format(i) for i in v1])
        self.plotToTensorboard(fig, name + '_percentile' + str(percentile))
        return

    def plotMap(self, img, name, vminmax=None, color_num=None):
        """Use to plot input data 2d

        vminmax is range as list or tuple, example [22, 37]
        for example is qp is 22, 27, 32, 37
        plotMap(qpmap, 'QPMap', vminmax = [22, 37], color_num = 4)
        """
        if vminmax is None:
            vminmax = (img.min().cpu(), img.max().cpu())
        img = self.Makegrid(img)
        fig, ax = plt.subplots()
        img = img.cpu()
        if color_num is None:
            # One discrete colour per distinct value in the map.
            color_num = len(img.unique())
        imgs = ax.imshow((img.numpy()[0]).astype(int),
                         vmin=vminmax[0], vmax=vminmax[1],
                         interpolation='nearest',
                         cmap=plt.cm.get_cmap('viridis', color_num))
        v1 = np.round(np.linspace(vminmax[0], vminmax[1], 10, endpoint=True))
        cb = fig.colorbar(imgs, ticks=v1)
        cb.ax.set_yticklabels(["{:4.2f}".format(i) for i in v1])
        self.plotToTensorboard(fig, name)
        return

    def SetLoss(self, name, value):
        self.writerLayout['Loss'][name] = value

    def SetPSNR(self, name, value):
        self.writerLayout['PSNR'][name] = value

    def SetMSE(self, name, value):
        self.writerLayout['MSE'][name] = value

    def SetLearningRate(self, value):
        self.writer.add_scalars('LearningRate', {'lr': value}, self.step)
def main(config_file):
    """Run one full training job described by a config module.

    `config_file` is the import name of a module exposing an INPUTS dict with
    MODEL, BATCHSIZE, OPTIMIZER, EPOCHS, CLASSES and LR entries.  Builds the
    datasets, model and optimizer, trains for the configured number of epochs,
    and logs checkpoints, statistics and confusion matrices to tensorboard.
    """
    # read from config
    local_config = __import__(config_file)
    model_name = local_config.INPUTS['MODEL']
    # Resolve the model class by name from the birdsong.models package.
    model = getattr(__import__('birdsong.models', fromlist=[model_name]),
                    model_name)
    batch_size = local_config.INPUTS['BATCHSIZE']
    optimizer_name = local_config.INPUTS['OPTIMIZER']
    # Resolve the optimizer class (e.g. 'Adam') from torch.optim.
    optimizer = getattr(__import__('torch.optim', fromlist=[optimizer_name]),
                        optimizer_name)
    num_epochs = local_config.INPUTS['EPOCHS']
    no_classes = local_config.INPUTS['CLASSES']
    learning_rate = local_config.INPUTS['LR']
    # logging
    start_time = time.time()
    date = time.strftime('%d-%m-%Y-%H-%M-%S', time.localtime())
    log_path = f'./birdsong/run_log/{model_name}_{date}'
    state_fname, log_fname, summ_tensor_board = logger.create_log(log_path)
    writer = SummaryWriter(str(summ_tensor_board))
    # Enhancement
    enh = None  #Exponent(0.17)
    # Augmentation
    aug = SoundscapeNoise('storage/noise_slices', scaling=0.4)
    # Datasets and Dataloaders
    ds_train = SpectralDataset(TRAIN, INPUT_DIR, enhancement_func=enh,
                               augmentation_func=aug)
    ds_test = SpectralDataset(TEST, INPUT_DIR, enhancement_func=enh)
    dl_train = DataLoader(ds_train, batch_size, num_workers=4, pin_memory=PIN,
                          shuffle=True)
    # NOTE(review): the validation loader is shuffled too — harmless for the
    # aggregate metrics computed here, but confirm it is intentional.
    dl_test = DataLoader(ds_test, batch_size, num_workers=4, pin_memory=PIN,
                         shuffle=True)
    print('Dataloaders initialized')
    # Model
    # NOTE(review): assumes SpectralDataset.shape is (freq, time) — confirm
    # shape[1] really is the time axis and shape[0] the frequency axis.
    time_axis = ds_test.shape[1]
    freq_axis = ds_test.shape[0]
    net = model(time_axis=time_axis, freq_axis=freq_axis,
                no_classes=no_classes)
    criterion = nn.CrossEntropyLoss()
    # Rebinds `optimizer` from the optimizer class to its instance.
    optimizer = optimizer(net.parameters(), lr=learning_rate)
    # Logging general run information:
    info = f""" INFO: \n File type: {FILE_TYPE} \n Optimizer: {optimizer_name} \n Batch Size: {batch_size} \n Classes': {no_classes} \n Enhancement: {ds_train.enhancement_func.__repr__()} \n Augmentation: {ds_train.augmentation_func.__repr__()} \n Supposed to run for: {num_epochs} \n Date: {date}"""
    writer.add_text('Info: ', info)
    # local vars
    best_acc = 0
    for epoch in range(num_epochs):
        train(net, dl_train, epoch, optimizer, criterion, DEVICE)
        train_stats, train_preds = evaluate(net, dl_train, criterion,
                                            no_classes, DEVICE)
        print(
            f'Training: Loss: {train_stats[0]:.5f}, Acc: {train_stats[1]:.5f}, Top 5: {train_stats[2]:.5f}'
        )
        test_stats, test_preds = evaluate(net, dl_test, criterion, no_classes,
                                          DEVICE)
        print(
            f'Validation: Loss: {test_stats[0]:.5f}, Acc: {test_stats[1]:.5f}, Top 5: {test_stats[2]:.5f}'
        )
        is_best = test_stats[1] > best_acc
        best_acc = max(test_stats[1], best_acc)
        print('Best Accuracy: {:.5f}'.format(best_acc))
        # Checkpoint every epoch; `is_best` marks the best-so-far copy.
        logger.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_accuracy': best_acc,
            },
            is_best,
            filename=state_fname)
        # Store confusion matrix every 5 epochs or at the end of training
        if epoch % 5 == 0 or epoch == num_epochs - 1:
            cm_train = plot_confusion_matrix(train_preds[0], train_preds[1],
                                             np.arange(no_classes),
                                             normalize=True)
            cm_val = plot_confusion_matrix(test_preds[0], test_preds[1],
                                           np.arange(no_classes),
                                           normalize=True)
            writer.add_figure('Training', cm_train, epoch)
            writer.add_figure('Validation', cm_val, epoch)
        logger.write_summary(writer, epoch, train_stats, test_stats)
        logger.dump_log_txt(date, start_time, local_config, train_stats,
                            test_stats, best_acc, epoch + 1, log_fname)
        # LR schedule
        update_lr(optimizer, epoch, learning_rate, 0.05)
    writer.close()
    print('Finished Training')
class Logger:
    """Thin tensorboard facade for scalars, images, videos and figures."""

    def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
        self._log_dir = log_dir
        print('########################')
        print('logging outputs to ', log_dir)
        print('########################')
        self._n_logged_samples = n_logged_samples
        self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)

    def log_scalar(self, scalar, name, step_):
        self._summ_writer.add_scalar(f'{name}', scalar, step_)

    def log_scalars(self, scalar_dict, group_name, step, phase):
        """Will log all scalars in the same plot."""
        self._summ_writer.add_scalars(f'{group_name}_{phase}', scalar_dict, step)

    def log_image(self, image, name, step):
        # A single [C, H, W] image is expected.
        assert len(image.shape) == 3
        self._summ_writer.add_image(f'{name}', image, step)

    def log_video(self, video_frames, name, step, fps=10):
        assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
        self._summ_writer.add_video(f'{name}', video_frames, step, fps=fps)

    def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'):
        # Rollout observations arrive as [T, H, W, C]; bring channels forward.
        videos = [np.transpose(path['image_obs'], [0, 3, 1, 2]) for path in paths]
        #0312
        videos = np.flip(videos, 3)
        # Never try to save more clips than we actually have.
        max_videos_to_save = np.min([max_videos_to_save, len(videos)])
        # Longest rollout among the clips we keep.
        max_length = videos[0].shape[0]
        for idx in range(max_videos_to_save):
            max_length = max(max_length, videos[idx].shape[0])
        # Pad shorter rollouts by repeating their final frame.
        for idx in range(max_videos_to_save):
            shortfall = max_length - videos[idx].shape[0]
            if shortfall > 0:
                tail = np.tile([videos[idx][-1]], (shortfall, 1, 1, 1))
                videos[idx] = np.concatenate([videos[idx], tail], 0)
        # log videos to tensorboard event file
        stacked = np.stack(videos[:max_videos_to_save], 0)
        self.log_video(stacked, video_title, step, fps=fps)

    def log_figures(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
        self._summ_writer.add_figure(f'{name}_{phase}', figure, step)

    def log_figure(self, figure, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        self._summ_writer.add_figure(f'{name}_{phase}', figure, step)

    def log_graph(self, array, name, step, phase):
        """figure: matplotlib.pyplot figure handle"""
        im = plot_graph(array)
        self._summ_writer.add_image(f'{name}_{phase}', im, step)

    def dump_scalars(self, log_path=None):
        if log_path is None:
            log_path = os.path.join(self._log_dir, "scalar_data.json")
        self._summ_writer.export_scalars_to_json(log_path)

    def flush(self):
        self._summ_writer.flush()
# print statistics running_loss += loss.item() if i % 2000 == 1999: # print every 2000 mini-batches print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000)) # 把数据写入tensorflow # ...log the running loss writer.add_scalar('image training loss', running_loss / 2000, epoch * len(trainloader) + i) # ...log a Matplotlib Figure showing the models's predictions on a # random mini-batch writer.add_figure('predictions vs. actuals', plot_classes_preds(net, inputs, labels), global_step=epoch * len(trainloader) + i) running_loss = 0.0 torch.save(net.state_dict(), PATH) print('Finished Training') print("Time taken:", datetime.now() - startTime) print("***************************") # 获取一些随机测试数据 print("获取一些随机测试数据") dataiter = iter(testloader) images, labels = dataiter.next()
def test(config,logger):
    """Evaluate a trained model on the RAF-DB test set.

    Note: when 'config.mode' is 'test' you do not need to set
    'config.forward_only' or 'config.resume' to true.
    Loads the checkpoint named in config.ckpt_path (if any) and logs loss,
    accuracy, mean diagonal value and a confusion matrix to tensorboard.
    """
    #init logger and seed
    start_time = time.strftime('%Y-%m-%d,%H:%M:%S', time.localtime(time.time()))
    logger.info('[START TESTING]\n{}\n{}'.format(start_time, '=' * 90))
    logger.info(config)
    logger.info('load data...')
    #load data
    test_dataset=RafDB(mode="test")
    test_loader=RafDBLoader(dataset=test_dataset,batch_size=config.bsz,shuffle=True)
    logger.info('create net...')
    #define net
    net_class=getattr(networks,config.net)
    net=net_class()
    logger.info('check and set GPU...')
    #check gpu
    # NOTE(review): if config.use_gpu is True but CUDA is unavailable, `device`
    # is never bound yet is used below whenever config.use_gpu is truthy —
    # confirm this cannot happen in practice.
    if config.use_gpu==True and torch.cuda.is_available():
        # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        # os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu_ids
        device=torch.device('cuda')
        net.to(device)
        if torch.cuda.device_count()>1:
            device_ids=[idx for idx in range(torch.cuda.device_count())]
            # Single-process DDP over all visible GPUs.
            torch.distributed.init_process_group(backend='nccl',
                                                 init_method=f'tcp://localhost:{config.localhost}',
                                                 rank=0,
                                                 world_size=1)
            net=torch.nn.parallel.DistributedDataParallel(net,
                                                          device_ids=device_ids,
                                                          find_unused_parameters=True)
    logger.info('create loss instance...')
    #define loss
    net_criterion=getattr(losses,config.loss)
    if config.loss_is_weighted:
        # Per-class weights, normalised to sum to 1.
        weights=torch.tensor([float(weight) for weight in config.loss_weights.split(',')])
        if config.use_gpu:
            weights=weights.to(device)
        criterion=net_criterion(weight=weights/torch.sum(weights,dim=0))
    else:
        criterion=net_criterion()
    #load checkpoint if needed
    start_n_iter=0
    start_epoch=0
    if len(config.ckpt_path)>0:
        logger.info(f'load checkpoint from {config.ckpt_path}...')
        ckpt=load_checkpoint(config.ckpt_path)
        start_n_iter=ckpt['n_iter']
        start_epoch=ckpt['epoch']
        net.load_state_dict(ckpt['net'])
        logger.info(f"Epoch={start_epoch}, N_iter={start_n_iter}")
    #tensorboardX
    logger.info("set tensorboardX...")
    writer_dir=os.path.join(config.output_dir,'boardX')
    if not os.path.exists(writer_dir):
        os.makedirs(writer_dir)
    writer=SummaryWriter(writer_dir)
    logger.info(f"test on test set...")
    # 7x7 confusion-matrix accumulator (RAF-DB has 7 emotion classes).
    result=np.zeros((7,7))
    net.eval()
    with torch.no_grad():
        pbar=tqdm(enumerate(test_loader),total=len(test_loader))
        start_time=time.time()
        tot_loss=0
        # TOT: samples seen; TP: correctly classified samples.
        TOT,TP=0,0
        for i,data in pbar:
            #prepare
            x_data,y_data=data
            if len(x_data.size())!=4:
                logger.error(f"x_data size wrong! Now the size is {x_data.size()}")
                # NOTE(review): bare `raise` outside an except block raises
                # RuntimeError, not a validation error — confirm intended.
                raise
            if len(y_data.size())>1:
                y_data=y_data.squeeze(-1)
            if config.use_gpu==True:
                x_data=x_data.to(device)
                y_data=y_data.to(device)
            prepare_time=time.time()-start_time
            #forward and predict
            pred_data=net(x_data)
            loss=criterion(pred_data,y_data)
            loss=torch.mean(loss,dim=0)
            # Some nets return [logits, embedding]; split them if so.
            embedding_addable=False
            if isinstance(pred_data,list):
                embedding=pred_data[1]
                pred_data=pred_data[0]
                embedding_addable=True
            pred_data=torch.argmax(pred_data,dim=1)
            result,tp=log_result(result,pred_data,y_data)
            #log
            tot_loss+=loss.item()
            TOT+=y_data.size(0)
            TP+=tp
            process_time=time.time()-start_time-prepare_time
            pbar.set_description("Compute efficiency: {:.2f}, iter: {}/{}:".format(
                process_time/(process_time+prepare_time),
                i, len(test_loader)))
            writer.add_scalars('loss',{'Test':loss.item()},i)
            if config.add_embedding and embedding_addable:
                add_embedding(writer,mat=embedding,metadata=y_data,label_img=x_data,global_step=i,tag="test set")
    #write and log
    writer.add_figure('confusion_matrix_on_test_set',figure=plot_confusion_matrix(result, classes=test_dataset.CLASSNAMES, normalize=True,title='confusion matrix on test set'),global_step=start_n_iter)
    # Row-normalise the confusion matrix; 0.0001 guards against empty rows.
    normalized_result = result.astype('float') / (0.0001+result.sum(axis=1)[:, np.newaxis])
    # Mean diagonal value == average per-class recall.
    mdv=sum([normalized_result[i][i] for i in range(7)])/7
    writer.add_scalar('mean_diagonal_value',mdv,start_n_iter)
    writer.add_scalar('accuracy',TP/TOT,start_n_iter)
    logger.info(f"loss of checkpoint in {config.ckpt_path}: {tot_loss/len(test_loader)}")
    logger.info(f"mean diagonal value of checkpoint in {config.ckpt_path}: {mdv}")
    logger.info(f"accuracy of checkpoint in {config.ckpt_path}: {TP/TOT}")
    logger.info(f"confusion matrix on test set: {normalized_result}")
    logger.info("exit 0.")
class Logger(object):
    """Tensorboard logging facade for training/eval stats, figures and audio."""

    def __init__(self, log_dir):
        self.writer = SummaryWriter(log_dir)
        # Scratch dicts callers may use to accumulate running statistics.
        self.train_stats = {}
        self.eval_stats = {}

    def tb_model_weights(self, model, step):
        """Log per-parameter summary statistics and histograms.

        Note: `layer_num` counts named parameters (weight and bias separately),
        not actual layers.
        """
        layer_num = 1
        for name, param in model.named_parameters():
            if param.numel() == 1:
                # Scalar parameter: a single value is enough.
                self.writer.add_scalar(
                    "layer{}-{}/value".format(layer_num, name),
                    param.max(), step)
            else:
                self.writer.add_scalar(
                    "layer{}-{}/max".format(layer_num, name),
                    param.max(), step)
                self.writer.add_scalar(
                    "layer{}-{}/min".format(layer_num, name),
                    param.min(), step)
                self.writer.add_scalar(
                    "layer{}-{}/mean".format(layer_num, name),
                    param.mean(), step)
                self.writer.add_scalar(
                    "layer{}-{}/std".format(layer_num, name),
                    param.std(), step)
                self.writer.add_histogram(
                    "layer{}-{}/param".format(layer_num, name), param, step)
                # FIX: before the first backward() param.grad is None and
                # add_histogram would raise; skip gradients that don't exist.
                if param.grad is not None:
                    self.writer.add_histogram(
                        "layer{}-{}/grad".format(layer_num, name),
                        param.grad, step)
            layer_num += 1

    def dict_to_tb_scalar(self, scope_name, stats, step):
        """Log every entry of `stats` as `scope_name/key`."""
        for key, value in stats.items():
            self.writer.add_scalar('{}/{}'.format(scope_name, key), value,
                                   step)

    def dict_to_tb_figure(self, scope_name, figures, step):
        """Log every figure of `figures` as `scope_name/key`."""
        for key, value in figures.items():
            self.writer.add_figure('{}/{}'.format(scope_name, key), value,
                                   step)

    def dict_to_tb_audios(self, scope_name, audios, step, sample_rate):
        """Best-effort audio logging: a failed clip is reported, not fatal."""
        for key, value in audios.items():
            try:
                self.writer.add_audio('{}/{}'.format(scope_name, key), value,
                                      step, sample_rate=sample_rate)
            except Exception:
                # FIX: narrowed from a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit; best-effort semantics kept.
                traceback.print_exc()

    def tb_train_iter_stats(self, step, stats):
        self.dict_to_tb_scalar("TrainIterStats", stats, step)

    def tb_train_epoch_stats(self, step, stats):
        self.dict_to_tb_scalar("TrainEpochStats", stats, step)

    def tb_train_figures(self, step, figures):
        self.dict_to_tb_figure("TrainFigures", figures, step)

    def tb_train_audios(self, step, audios, sample_rate):
        self.dict_to_tb_audios("TrainAudios", audios, step, sample_rate)

    def tb_eval_stats(self, step, stats):
        self.dict_to_tb_scalar("EvalStats", stats, step)

    def tb_eval_figures(self, step, figures):
        self.dict_to_tb_figure("EvalFigures", figures, step)

    def tb_eval_audios(self, step, audios, sample_rate):
        self.dict_to_tb_audios("EvalAudios", audios, step, sample_rate)

    def tb_test_audios(self, step, audios, sample_rate):
        self.dict_to_tb_audios("TestAudios", audios, step, sample_rate)

    def tb_test_figures(self, step, figures):
        self.dict_to_tb_figure("TestFigures", figures, step)
def train(config,logger):
    """Train `config.net` on RAF-DB with optional gradient accumulation,
    LR scheduling and periodic validation/checkpointing.

    When config.forward_only is true, a single evaluation pass over the train
    set is performed instead of training, then the epoch loop stops.
    """
    #init logger and seed
    start_time = time.strftime('%Y-%m-%d,%H:%M:%S', time.localtime(time.time()))
    logger.info('[START TRAINING]\n{}\n{}'.format(start_time, '=' * 90))
    logger.info(config)
    logger.info('load data...')
    #load data
    train_dataset=RafDB(mode="train")
    val_dataset=RafDB(mode="test")
    train_loader=RafDBLoader(dataset=train_dataset,batch_size=config.bsz,shuffle=True)
    val_loader=RafDBLoader(dataset=val_dataset,batch_size=config.bsz,shuffle=True)
    logger.info('create net instance...')
    #define net
    net_class=getattr(networks,config.net)
    net=net_class()
    logger.info('check and set GPU...')
    #check gpu
    # NOTE(review): if config.use_gpu is True but CUDA is unavailable, `device`
    # is never bound yet is used below whenever config.use_gpu is truthy —
    # confirm this cannot happen in practice.
    if config.use_gpu==True and torch.cuda.is_available():
        # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        # os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu_ids
        device=torch.device('cuda')
        net.to(device)
        if torch.cuda.device_count()>1:
            device_ids=[idx for idx in range(torch.cuda.device_count())]
            # Single-process DDP over all visible GPUs.
            torch.distributed.init_process_group(backend='nccl',
                                                 init_method=f'tcp://localhost:{config.localhost}',
                                                 rank=0,
                                                 world_size=1)
            net=torch.nn.parallel.DistributedDataParallel(net,
                                                          device_ids=device_ids,
                                                          find_unused_parameters=True)
    logger.info('create loss instance...')
    #define loss
    net_criterion=getattr(losses,config.loss)
    if config.loss_is_weighted:
        # Per-class weights, normalised to sum to 1.
        weights=torch.tensor([float(weight) for weight in config.loss_weights.split(',')])
        if config.use_gpu:
            weights=weights.to(device)
        criterion=net_criterion(weight=weights/torch.sum(weights,dim=0))
    else:
        criterion=net_criterion()
    logger.info('create optimizer...')
    #define optimizer
    optimizer=config_optimizer(net.parameters(),config)
    logger.info("check LR scheduler...")
    #define lr_scheduler
    schedule_on_iter=config.schedule_on_iter
    schedule_on_epoch=config.schedule_on_epoch
    if schedule_on_iter:
        iter_scheduler=config_scheduler(optimizer,config,mode='iter')
    if schedule_on_epoch:
        epoch_scheduler=config_scheduler(optimizer,config,mode='epoch')
    #load checkpoint if needed
    start_n_iter=0
    start_epoch=0
    if config.resume==True:
        logger.info(f'load checkpoint from {config.ckpt_path}...')
        ckpt=load_checkpoint(config.ckpt_path)
        start_n_iter=ckpt['n_iter']
        start_epoch=ckpt['epoch']
        net.load_state_dict(ckpt['net'])
        optimizer.load_state_dict(ckpt['optim'])
    #tensorboardX
    logger.info("set tensorboardX...")
    writer_dir=os.path.join(config.output_dir,'boardX')
    if not os.path.exists(writer_dir):
        os.makedirs(writer_dir)
    writer=SummaryWriter(writer_dir)
    #ckpt dir
    ckpt_dir=os.path.join(config.output_dir,'ckpt')
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
    #start
    logger.info(f'start with epoch range of [{start_epoch}, {config.epoch})...')
    n_iter=start_n_iter
    # mmdv: best mean-diagonal-value so far; m_n_iter: iteration it was saved at.
    mmdv=0
    m_n_iter=0
    if n_iter==0:
        optimizer.zero_grad()
    for epoch in range(start_epoch,config.epoch):
        if config.forward_only==False:
            net.train()
            pbar=tqdm(enumerate(train_loader),total=len(train_loader))
            start_time=time.time()
            tot_loss=0
            for i,data in pbar:
                #prepare
                x_data,y_data=data
                if len(x_data.size())!=4:
                    logger.error(f"x_data size wrong! Now the size is {x_data.size()}")
                    # NOTE(review): bare `raise` outside an except block raises
                    # RuntimeError, not a validation error — confirm intended.
                    raise
                if len(y_data.size())>1:
                    y_data=y_data.squeeze(-1)
                if config.use_gpu==True:
                    x_data=x_data.to(device)
                    y_data=y_data.to(device)
                prepare_time=time.time()-start_time
                #forward and backward
                pred_data=net(x_data)
                loss=criterion(pred_data,y_data)
                loss=torch.mean(loss,dim=0)
                # Scale for gradient accumulation over config.gd_acc steps.
                loss/=config.gd_acc
                loss.backward()
                if n_iter%config.gd_acc==config.gd_acc-1:
                    optimizer.step()
                    optimizer.zero_grad()
                    if schedule_on_iter:
                        iter_scheduler.step()
                #log
                # Undo the accumulation scaling for reporting.
                tot_loss+=loss.item()*config.gd_acc
                writer.add_scalars('loss',{'Train':loss.item()*config.gd_acc},n_iter)
                process_time=time.time()-start_time-prepare_time
                pbar.set_description("Compute efficiency: {:.2f}, epoch: {}/{}:".format(
                    process_time/(process_time+prepare_time),
                    epoch, config.epoch))
                if config.add_embedding and isinstance(pred_data,list):
                    add_embedding(writer,mat=pred_data[1],metadata=y_data,label_img=x_data,global_step=n_iter,tag="train set")
                n_iter+=1
            logger.info(f"[Epoch: {epoch}]TrainLoss:{tot_loss/len(train_loader)}")
            if schedule_on_epoch:
                epoch_scheduler.step()
            #val and save
            if epoch%config.save_per_epoch==config.save_per_epoch-1:
                logger.info(f"suspend to save ckpt and validate on val set when epoch[{epoch}] is complete...")
                # 7x7 confusion-matrix accumulator (RAF-DB has 7 classes).
                result=np.zeros((7,7))
                net.eval()
                with torch.no_grad():
                    pbar=tqdm(enumerate(val_loader),total=len(val_loader))
                    start_time=time.time()
                    tot_loss=0
                    # TOT: samples seen; TP: correctly classified samples.
                    TOT,TP=0,0
                    for i,data in pbar:
                        #prepare
                        x_data,y_data=data
                        if len(x_data.size())!=4:
                            logger.error(f"x_data size wrong! Now the size is {x_data.size()}")
                            raise
                        if len(y_data.size())>1:
                            y_data=y_data.squeeze(-1)
                        if config.use_gpu==True:
                            x_data=x_data.to(device)
                            y_data=y_data.to(device)
                        prepare_time=time.time()-start_time
                        #forward and predict
                        pred_data=net(x_data)
                        loss=criterion(pred_data,y_data)
                        loss=torch.mean(loss,dim=0)
                        # Some nets return [logits, embedding]; keep logits.
                        if isinstance(pred_data,list):
                            pred_data=pred_data[0]
                        pred_data=torch.argmax(pred_data,dim=1)
                        result,tp=log_result(result,pred_data,y_data)
                        #log
                        tot_loss+=loss.item()
                        TP+=tp
                        TOT+=y_data.size(0)
                        process_time=time.time()-start_time-prepare_time
                        pbar.set_description("Compute efficiency: {:.2f}, epoch: {}/{}:".format(
                            process_time/(process_time+prepare_time),
                            epoch, config.epoch))
                writer.add_scalars('loss',{'Val':tot_loss/len(val_loader)},n_iter)
                # Row-normalise; 0.0001 guards against empty rows.
                normalized_result = result.astype('float') / (0.0001+result.sum(axis=1)[:, np.newaxis])
                # Mean diagonal value == average per-class recall.
                tmdv=sum([normalized_result[i][i] for i in range(7)])/7
                writer.add_scalar('mean_diagonal_value',tmdv,n_iter)
                writer.add_figure('confusion_matrix_on_val_set',figure=plot_confusion_matrix(result, classes=val_dataset.CLASSNAMES, normalize=True,title='confusion matrix on val set'),global_step=n_iter)
                writer.add_scalar('accuracy',TP/TOT,n_iter)
                if check_save(tmdv,mmdv):
                    # New best: keep only the latest best checkpoint on disk.
                    mmdv=tmdv
                    if m_n_iter:
                        os.remove(os.path.join(ckpt_dir,f'ckpt-{m_n_iter}.pickle'))
                    m_n_iter=n_iter
                    save_checkpoint(os.path.join(ckpt_dir,f'ckpt-{n_iter}.pickle'),net,optimizer,epoch,n_iter)
                logger.info(f"[Epoch: {epoch}]ValLoss:{tot_loss/len(val_loader)}")
        else:
            # forward_only: one evaluation pass over the train set, then stop.
            logger.info(f"forward only! So validate on train set when epoch[{epoch}] is complete...")
            result=np.zeros((7,7))
            net.eval()
            with torch.no_grad():
                pbar=tqdm(enumerate(train_loader),total=len(train_loader))
                start_time=time.time()
                tot_loss=0
                TOT,TP=0,0
                for i,data in pbar:
                    #prepare
                    x_data,y_data=data
                    if len(x_data.size())!=4:
                        logger.error(f"x_data size wrong! Now the size is {x_data.size()}")
                        raise
                    if len(y_data.size())>1:
                        y_data=y_data.squeeze(-1)
                    if config.use_gpu==True:
                        x_data=x_data.to(device)
                        y_data=y_data.to(device)
                    prepare_time=time.time()-start_time
                    #forward and predict
                    pred_data=net(x_data)
                    loss=criterion(pred_data,y_data)
                    loss=torch.mean(loss,dim=0)
                    embedding_addable=False
                    if isinstance(pred_data,list):
                        embedding=pred_data[1]
                        pred_data=pred_data[0]
                        embedding_addable=True
                    pred_label=torch.argmax(pred_data,dim=1)
                    result,tp=log_result(result,pred_label,y_data)
                    #log
                    tot_loss+=loss.item()
                    TP+=tp
                    TOT+=y_data.size(0)
                    process_time=time.time()-start_time-prepare_time
                    pbar.set_description("Compute efficiency: {:.2f}, epoch: {}/{}:".format(
                        process_time/(process_time+prepare_time),
                        epoch, config.epoch))
                    writer.add_scalars('loss',{'Train':loss.item()},i)
                    if config.add_embedding and embedding_addable:
                        add_embedding(writer,mat=pred_data,metadata=y_data,label_img=x_data,global_step=i,tag="7-dim vectors on train set")
            writer.add_figure('confusion_matrix_on_train_set',figure=plot_confusion_matrix(result, classes=train_dataset.CLASSNAMES, normalize=True,title='confusion matrix on train set'),global_step=n_iter)
            normalized_result = result.astype('float') / (0.0001+result.sum(axis=1)[:, np.newaxis])
            writer.add_scalar('mean_diagonal_value',sum([normalized_result[i][i] for i in range(7)])/7,n_iter)
            writer.add_scalar('accuracy',TP/TOT,n_iter)
            logger.info(f"[Epoch: {epoch}]TrainLoss:{tot_loss/len(train_loader)}")
            break
    logger.info("exit 0.")
def main(model_restart, lr, hidden_layers, epochs):
    """Train the e231 aerofoil surrogate model.

    Args:
        model_restart: name (without extension) of a saved model under
            "models/" to resume from, or falsy to start fresh.
        lr: optional learning-rate override.
        hidden_layers: optional layer-size override, e.g. "[4,8]" or "4,8".
        epochs: optional epoch-count override.
    """
    # Load params
    if model_restart:
        with open(op.join("models", model_restart + '.json'), "r") as f:
            params = json.load(f)
    else:
        params = dict(
            hidden_layers=[4, 4],
            epochs=100000,
            batch_size=500,
            lr=6e-4,
        )
    if lr:
        params['lr'] = lr
    if epochs:
        params['epochs'] = epochs
    if hidden_layers:
        # Accept either "[4,8]" or "4,8".
        hidden_layers = hidden_layers[1:-1] if hidden_layers[0] == '[' else hidden_layers
        # FIX: the parsed override was previously computed and then discarded;
        # store it in params so build_model actually receives it.
        params['hidden_layers'] = [int(l) for l in hidden_layers.split(',')]

    # Load data
    loader, dataset, inp, out, msk, inp_mean, out_mean, inp_std, out_std, e231 = load_data("e231.csv", params)
    writer = SummaryWriter()

    # Build model
    model = build_model(params["hidden_layers"])
    if model_restart:
        model.load_state_dict(torch.load(op.join("models", model_restart + ".mdl")))
    # model = nn.Linear(2,3)
    opt = torch.optim.Adam(model.parameters(),
                           lr=params['lr'],
                           weight_decay=0.001,
                           betas=(0.9, 0.999),
                           eps=1e-08)
    # momentum=0.9,
    # dampening=0.,
    # nesterov=True)

    # Train
    model.train()
    # plt.plot(e231.alpha, e231.Cl, '+-')
    f, a = plt.subplots()
    a.plot(inp.data[:, 0], model(inp)[:, 0].data, '+')
    a.plot(inp.data[:, 0], out[:, 0].data, '+')
    f.savefig("tmp.png")
    f_, a_ = plt.subplots()
    LL = []
    # FIX: make `e` well-defined even when the epoch loop never runs
    # (params["epochs"] == 0), so the json dump below cannot NameError.
    e = 0
    try:
        for e in range(params["epochs"]):
            L = 0
            ite = 0
            for i, o, m in loader:
                # Masked squared error summed over the batch.
                loss = (((model(i) - o) * m) ** 2).sum()
                opt.zero_grad()
                loss.backward()
                opt.step()
                L += loss.data
                ite += i.shape[0]
            LL.append(L / ite)
            if e % 10 == 0:
                writer.add_scalar('loss', L / ite, e)
            if e % 100 == 0:
                # Refresh the data-fit scatter plot in tensorboard.
                a.clear()
                a.plot(inp.data[:, 0], model(inp)[:, 0].data, '+')
                a.plot(inp.data[:, 0], out[:, 0].data, '+')
                writer.add_figure("data_fit", f)
                # f.savefig("tmp.png")
                # a_.clear()
                # a_.loglog(LL)
                # f_.savefig("learning.png")
                # writer.add_figure(f_)
    except KeyboardInterrupt:
        # Allow manual interruption; the model trained so far is still saved.
        pass

    # Save model
    # NOTE(review): `writer.logdir` is the tensorboardX attribute name; the
    # torch SummaryWriter exposes `log_dir` — confirm which package is used.
    model_file_name = os.path.join("models", os.path.basename(writer.logdir) + ".mdl")
    torch.save(model.state_dict(), model_file_name)
    with open(model_file_name[:-4] + '.json', "w") as f:
        # Record the last completed epoch so a restart resumes consistently.
        params["epochs"] = e
        json.dump(params, f)
def svm_fit_and_test(config, logger):
    """Extract deep features with a pretrained net, fit an SVM on them, and
    evaluate on the RAF-DB train and test sets (7 emotion classes).

    Note: only when config.mode=='test' and config.classifier=='svm'

    Pipeline: load data -> build net (optionally multi-GPU DDP) -> restore
    checkpoint -> extract embeddings on the train set -> fit sklearn SVM ->
    report confusion matrix / accuracy on train set -> repeat prediction on
    the test set. Results go to the logger and a tensorboardX SummaryWriter.
    """
    # init logger and seed
    start_time = time.strftime('%Y-%m-%d,%H:%M:%S', time.localtime(time.time()))
    logger.info('[START TESTING]\n{}\n{}'.format(start_time, '=' * 90))
    logger.info(config)
    logger.info('load data...')
    # load data
    train_dataset = RafDB(mode="train")
    train_loader = RafDBLoader(dataset=train_dataset, batch_size=config.bsz, shuffle=True)
    test_dataset = RafDB(mode="test")
    test_loader = RafDBLoader(dataset=test_dataset, batch_size=config.bsz, shuffle=True)
    logger.info('create net...')
    # define net
    net_class = getattr(networks, config.net)
    net = net_class()
    logger.info('check and set GPU...')
    # check gpu
    if config.use_gpu == True and torch.cuda.is_available():
        # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        # os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu_ids
        device = torch.device('cuda')
        net.to(device)
        if torch.cuda.device_count() > 1:
            # Single-process DDP over all visible GPUs.
            device_ids = [idx for idx in range(torch.cuda.device_count())]
            torch.distributed.init_process_group(backend='nccl',
                                                 init_method=f'tcp://localhost:{config.localhost}',
                                                 rank=0,
                                                 world_size=1)
            net = torch.nn.parallel.DistributedDataParallel(net,
                                                            device_ids=device_ids,
                                                            find_unused_parameters=True)
    logger.info('create loss instance...')
    # load checkpoint if needed
    start_n_iter = 0
    start_epoch = 0
    if len(config.ckpt_path) > 0:
        logger.info(f'load checkpoint from {config.ckpt_path}...')
        ckpt = load_checkpoint(config.ckpt_path)
        start_n_iter = ckpt['n_iter']
        start_epoch = ckpt['epoch']
        net.load_state_dict(ckpt['net'])
        logger.info(f"Epoch={start_epoch}, N_iter={start_n_iter}")
    # tensorboardX
    logger.info("set tensorboardX...")
    writer_dir = os.path.join(config.output_dir, 'boardX')
    if not os.path.exists(writer_dir):
        os.makedirs(writer_dir)
    writer = SummaryWriter(writer_dir)
    # train svm
    logger.info(f"extract features on train set...")
    net.eval()
    with torch.no_grad():
        pbar = tqdm(enumerate(train_loader), total=len(train_loader))
        start_time = time.time()
        feats, labels = [], []
        for i, data in pbar:
            # prepare
            x_data, y_data = data
            if len(x_data.size()) != 4:
                logger.error(f"x_data size wrong! Now the size is {x_data.size()}")
                raise
            if len(y_data.size()) > 1:
                y_data = y_data.squeeze(-1)
            if config.use_gpu == True:
                x_data = x_data.to(device)
            prepare_time = time.time() - start_time
            # forward and predict
            # NOTE(review): `embedding` is only bound when the net returns a
            # list ([logits, embedding]); nets returning a bare tensor would
            # raise NameError at `feats += [embedding]` — confirm all
            # configured nets return the list form.
            pred_data = net(x_data)
            embedding_addable = False
            if isinstance(pred_data, list):
                embedding = pred_data[1].detach().cpu()
                pred_data = pred_data[0].detach().cpu()
                embedding_addable = True
            feats += [embedding]
            labels += [y_data]
            process_time = time.time() - start_time - prepare_time
            # BUGFIX: this is the train-set loop, so the denominator must be
            # len(train_loader); it previously reported len(test_loader).
            pbar.set_description("Compute efficiency: {:.2f}, iter: {}/{}:".format(
                process_time / (process_time + prepare_time), i, len(train_loader)))
            if config.add_embedding and embedding_addable:
                add_embedding(writer, mat=embedding, metadata=y_data, label_img=x_data,
                              global_step=i, tag="train set")
        logger.info("create svm...")
        svmX = torch.cat(feats, dim=0).numpy()
        svmY = torch.cat(labels, dim=0).numpy()
        sample_weight = class_weight = None
        if config.loss_is_weighted:
            # Per-class weights (normalised) and per-sample weights for the SVM fit.
            weights = np.array([float(weight) for weight in config.loss_weights.split(',')])
            weights = weights / sum(weights)
            class_weight = dict(enumerate(weights))
            sample_weight = np.array([class_weight[c] for c in svmY])
        msvm_class = getattr(sklearn.svm, config.classifier)
        if config.classifier != 'LinearSVC':
            # SVC/NuSVC train one-vs-one; LinearSVC has no
            # decision_function_shape parameter.
            msvm = msvm_class(class_weight=class_weight, decision_function_shape='ovo')
        else:
            msvm = msvm_class(class_weight=class_weight)
        logger.info(f"fit svm on train set...")
        msvm.fit(svmX, svmY, sample_weight=sample_weight)
        logger.info("predict on train set...")
        result = np.zeros((7, 7))
        TOT = svmY.shape[0]
        if config.classifier != 'LinearSVC':
            # Switch to one-vs-rest scores so argmax over columns yields a class id.
            msvm.decision_function_shape = "ovr"
        pred_data = msvm.decision_function(svmX)
        pred_data = torch.argmax(torch.from_numpy(pred_data), dim=1)
        result, TP = log_result(result, pred_data, torch.from_numpy(svmY))
        # write and log
        writer.add_figure('confusion_matrix_on_train_set',
                          figure=plot_confusion_matrix(result,
                                                       classes=train_dataset.CLASSNAMES,
                                                       normalize=True,
                                                       title='confusion matrix on train set'),
                          global_step=start_n_iter)
        # Row-normalise; the 0.0001 guards against empty classes (divide-by-zero).
        normalized_result = result.astype('float') / (0.0001 + result.sum(axis=1)[:, np.newaxis])
        mdv = sum([normalized_result[i][i] for i in range(7)]) / 7
        writer.add_scalar('mean_diagonal_value_on_train_set', mdv, start_n_iter)
        writer.add_scalar('accuracy_on_train_set', TP / TOT, start_n_iter)
        logger.info(f"train mean diagonal value of checkpoint in {config.ckpt_path}: {mdv}")
        logger.info(f"train accuracy of checkpoint in {config.ckpt_path}: {TP/TOT}")
        logger.info(f"confusion matrix on train set: {normalized_result}")
        # predict on test set
        logger.info("extract features on test set...")
        pbar = tqdm(enumerate(test_loader), total=len(test_loader))
        start_time = time.time()
        feats, labels = [], []
        for i, data in pbar:
            # prepare
            x_data, y_data = data
            if len(x_data.size()) != 4:
                logger.error(f"x_data size wrong! Now the size is {x_data.size()}")
                raise
            if len(y_data.size()) > 1:
                y_data = y_data.squeeze(-1)
            if config.use_gpu == True:
                x_data = x_data.to(device)
            prepare_time = time.time() - start_time
            # forward and predict
            pred_data = net(x_data)
            embedding_addable = False
            if isinstance(pred_data, list):
                embedding = pred_data[1].detach().cpu()
                pred_data = pred_data[0].detach().cpu()
                embedding_addable = True
            feats += [embedding]
            labels += [y_data]
            process_time = time.time() - start_time - prepare_time
            pbar.set_description("Compute efficiency: {:.2f}, iter: {}/{}:".format(
                process_time / (process_time + prepare_time), i, len(test_loader)))
            if config.add_embedding and embedding_addable:
                add_embedding(writer, mat=embedding, metadata=y_data, label_img=x_data,
                              global_step=i, tag="test set")
        logger.info("predict on test set...")
        svmX = torch.cat(feats, dim=0).numpy()
        svmY = torch.cat(labels, dim=0).numpy()
        result = np.zeros((7, 7))
        TOT = svmY.shape[0]
        pred_data = msvm.decision_function(svmX)
        pred_data = torch.argmax(torch.from_numpy(pred_data), dim=1)
        result, TP = log_result(result, pred_data, torch.from_numpy(svmY))
        # write and log
        writer.add_figure('confusion_matrix_on_test_set',
                          figure=plot_confusion_matrix(result,
                                                       classes=test_dataset.CLASSNAMES,
                                                       normalize=True,
                                                       title='confusion matrix on test set'),
                          global_step=start_n_iter)
        normalized_result = result.astype('float') / (0.0001 + result.sum(axis=1)[:, np.newaxis])
        mdv = sum([normalized_result[i][i] for i in range(7)]) / 7
        writer.add_scalar('mean_diagonal_value_on_test_set', mdv, start_n_iter)
        writer.add_scalar('accuracy_on_test_set', mdv if False else TP / TOT, start_n_iter) if False else writer.add_scalar('accuracy_on_test_set', TP / TOT, start_n_iter)
        logger.info(f"test mean diagonal value of checkpoint in {config.ckpt_path}: {mdv}")
        logger.info(f"test accuracy of checkpoint in {config.ckpt_path}: {TP/TOT}")
        logger.info(f"confusion matrix on test set: {normalized_result}")
        logger.info("exit 0.")
num_iters += 1 epoch_loss = running_loss / len(dataloader[stage].dataset) epoch_acc = running_corrects.double() / len(dataloader[stage].dataset) writer.add_scalar('{}/epoch_loss'.format(stage), epoch_loss, ep + 1) writer.add_scalar('{}/epoch_acc'.format(stage), epoch_acc, ep + 1) print('{} Loss: {:.4f}, acc: {:.4f}'.format(stage, epoch_loss, epoch_acc)) if stage == 'test' and epoch_acc > best_acc: best_acc = epoch_acc best_model_wts = copy.deepcopy(model.state_dict()) if args.lr_range_test: fig = plt.figure(figsize=(12, 10)) plt.plot(lr_history, step_acc_history) writer.add_figure('train/clr', fig) time_elapsed = time.time() - since print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format( time_elapsed // 3600, time_elapsed // 60, time_elapsed % 60)) print('Best val Acc: {:4f}'.format(best_acc)) with open( os.path.join(args.summary_dir, 'model_bestValACC_{:.3f}.pkl'.format(best_acc)), 'wb') as f: pkl.dump(best_model_wts, f)
def train(resume=False):
    """Train the Discriminator classifier on AudioData spectrograms.

    Runs a standard supervised loop (despite the GAN-style naming): cross-
    entropy loss, Adam, ReduceLROnPlateau on validation loss. Per-epoch
    validation metrics, confusion matrices, and the best-by-val-accuracy
    checkpoint (saved to `hparams.model + '.best'`) are logged via
    tensorboard under `../runs/<exp_name>`.

    NOTE(review): the `resume` parameter is accepted but never used in this
    function body — confirm whether resume support was intended.
    """
    writer = SummaryWriter('../runs/' + hparams.exp_name)
    # Record every hyperparameter as text so the run is self-describing.
    for k in hparams.__dict__.keys():
        writer.add_text(str(k), str(hparams.__dict__[k]))
    train_dataset = AudioData(
        data_csv=hparams.train_csv,
        data_file=hparams.dev_file,
        ds_type='train',
        # augment=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
        ]))
    validation_dataset = AudioData(data_csv=hparams.valid_csv,
                                   data_file=hparams.dev_file,
                                   ds_type='valid',
                                   augment=False,
                                   transform=transforms.Compose([
                                       transforms.ToTensor(),
                                   ]))
    # train_sampler = WeightedRandomSampler()
    train_loader = DataLoader(train_dataset,
                              batch_size=hparams.batch_size,
                              shuffle=True,
                              num_workers=2)
    validation_loader = DataLoader(validation_dataset,
                                   batch_size=hparams.batch_size,
                                   shuffle=True,
                                   num_workers=2)
    print('loaded train data of length : {}'.format(len(train_dataset)))
    adversarial_loss = torch.nn.CrossEntropyLoss().to(hparams.gpu_device)
    discriminator = Discriminator().to(hparams.gpu_device)
    if hparams.cuda:
        discriminator = nn.DataParallel(discriminator,
                                        device_ids=hparams.device_ids)
    # Count trainable parameters for the startup log line.
    params_count = 0
    for param in discriminator.parameters():
        params_count += np.prod(param.size())
    print('Model has {0} trainable parameters'.format(params_count))
    if not hparams.pretrained:
        discriminator.apply(weights_init_normal)
    optimizer_D = torch.optim.Adam(discriminator.parameters(),
                                   lr=hparams.learning_rate)
    scheduler_D = ReduceLROnPlateau(optimizer_D,
                                    mode='min',
                                    factor=0.3,
                                    patience=4,
                                    verbose=True,
                                    cooldown=0)
    Tensor = torch.cuda.FloatTensor if hparams.cuda else torch.FloatTensor

    def validation(discriminator, send_stats=False, epoch=0):
        """Run the model over the whole validation set; return
        (accuracy metrics tuple, validation loss)."""
        print('Validating model on {0} examples. '.format(
            len(validation_dataset)))
        discriminator_ = discriminator.eval()
        with torch.no_grad():
            pred_logits_list = []
            labels_list = []
            for (inp, labels, imgs_names) in tqdm(validation_loader):
                inp = Variable(inp.float(), requires_grad=False)
                labels = Variable(labels.long(), requires_grad=False)
                if hparams.dim3:
                    # Reshape to 2-D "image" and repeat to 3 channels
                    # (for backbones expecting RGB input).
                    inp = inp.view(-1, 1, 640, 64)
                    inp = torch.cat([inp] * 3, dim=1)
                inp = inp.to(hparams.gpu_device)
                labels = labels.to(hparams.gpu_device)
                pred_logits = discriminator_(inp)
                pred_logits_list.append(pred_logits)
                labels_list.append(labels)
            pred_logits = torch.cat(pred_logits_list, dim=0)
            labels = torch.cat(labels_list, dim=0)
            val_loss = adversarial_loss(pred_logits, labels)
            return accuracy_metrics(
                labels.long(), pred_logits
            ), val_loss  #, plot_auc='train_val_'+str(epoch+1), plot_path=hparams.result_dir+'train_val_{}_'.format(epoch)), val_loss

    print('Starting training.. (log saved in:{})'.format(hparams.exp_name))
    start_time = time.time()
    best_valid_acc = 0
    # print(model)
    for epoch in range(hparams.num_epochs):
        train_logits = []
        train_labels = []
        for batch, (inp, labels, imgs_name) in enumerate(tqdm(train_loader)):
            inp = Variable(inp.float(), requires_grad=False)
            labels = Variable(labels.long(), requires_grad=False)
            inp = inp.to(hparams.gpu_device)
            labels = labels.to(hparams.gpu_device)
            if hparams.dim3:
                # Same 3-channel reshape as in validation().
                inp = inp.view(-1, 1, 640, 64)
                inp = torch.cat([inp] * 3, dim=1)
            # ---------------------
            #  Train Discriminator
            # ---------------------
            optimizer_D.zero_grad()
            pred_logits = discriminator(inp)
            # Keep per-batch outputs to compute epoch-level train metrics below.
            train_logits.append(pred_logits)
            train_labels.append(labels)
            d_loss = adversarial_loss(pred_logits, labels)
            d_loss.backward()
            optimizer_D.step()
            writer.add_scalar('d_loss',
                              d_loss.item(),
                              global_step=batch + epoch * len(train_loader))
            # if batch % hparams.print_interval == 0:
            #     pred_labels = (pred_logits >= hparams.thresh)
            #     pred_labels = pred_labels.float()
            #     auc, f1, acc, _, _ = accuracy_metrics(pred_labels, labels.long(), pred_logits)
            #     print('[Epoch - {0:.1f}, batch - {1:.3f}, d_loss - {2:.6f}, acc - {3:.4f}, f1 - {4:.5f}, auc - {5:.4f}]'.\
            #         format(1.0*epoch, 100.0*batch/len(train_loader), d_loss.item(), acc['avg'], f1[hparams.avg_mode], auc[hparams.avg_mode]))
        # End-of-epoch evaluation and logging.
        (val_auc, val_f1, val_acc,
         val_conf_mat), val_loss = validation(discriminator, epoch=epoch)
        train_logits = torch.cat(train_logits, dim=0)
        train_labels = torch.cat(train_labels, dim=0)
        train_auc, train_f1, train_acc, train_conf_mat = accuracy_metrics(
            train_labels.long(), train_logits)
        fig = plot_cf(val_conf_mat)
        writer.add_figure('val_conf', fig, global_step=epoch)
        plt.close(fig)
        # Per-class validation metrics, then micro/macro aggregates.
        for lbl in range(hparams.num_classes):
            writer.add_scalar('val_f1_{}'.format(hparams.id_to_class[lbl]),
                              val_f1[lbl], global_step=epoch)
            writer.add_scalar('val_auc_{}'.format(hparams.id_to_class[lbl]),
                              val_auc[lbl], global_step=epoch)
            writer.add_scalar('val_acc_{}'.format(hparams.id_to_class[lbl]),
                              val_acc[lbl], global_step=epoch)
        writer.add_scalar('val_f1_{}'.format('micro'), val_f1['micro'],
                          global_step=epoch)
        writer.add_scalar('val_auc_{}'.format('micro'), val_auc['micro'],
                          global_step=epoch)
        writer.add_scalar('val_f1_{}'.format('macro'), val_f1['macro'],
                          global_step=epoch)
        writer.add_scalar('val_auc_{}'.format('macro'), val_auc['macro'],
                          global_step=epoch)
        writer.add_scalar('val_loss', val_loss, global_step=epoch)
        writer.add_scalar('val_f1', val_f1[hparams.avg_mode], global_step=epoch)
        writer.add_scalar('val_auc', val_auc[hparams.avg_mode], global_step=epoch)
        writer.add_scalar('val_acc', val_acc['avg'], global_step=epoch)
        # LR schedule is driven by validation loss.
        scheduler_D.step(val_loss)
        writer.add_scalar('learning_rate',
                          optimizer_D.param_groups[0]['lr'],
                          global_step=epoch)
        # torch.save({
        #     'epoch': epoch,
        #     'discriminator_state_dict': discriminator.state_dict(),
        #     'optimizer_D_state_dict': optimizer_D.state_dict(),
        # }, hparams.model+'.'+str(epoch))
        # Checkpoint whenever validation accuracy ties or improves the best.
        if best_valid_acc <= val_acc['avg']:
            best_valid_acc = val_acc['avg']
            fig = plot_cf(val_conf_mat)
            writer.add_figure('best_val_conf', fig, global_step=epoch)
            plt.close(fig)
            torch.save(
                {
                    'epoch': epoch,
                    'discriminator_state_dict': discriminator.state_dict(),
                    'optimizer_D_state_dict': optimizer_D.state_dict(),
                }, hparams.model + '.best')
            print('best model on validation set saved.')
        print('[Epoch - {0:.1f} ---> train_acc - {1:.4f}, current_lr - {2:.6f}, val_loss - {3:.4f}, best_val_acc - {4:.4f}, val_acc - {5:.4f}, val_f1 - {6:.4f}] - time - {7:.1f}'\
            .format(1.0*epoch, train_acc['avg'], optimizer_D.param_groups[0]['lr'], val_loss, best_valid_acc, val_acc['avg'], val_f1[hparams.avg_mode], time.time()-start_time))
        start_time = time.time()
def lr_search(train_eval_data):
    """LR range test: sweep the learning rate exponentially from 1e-7 to 10
    over one pass of the data, record the loss at each step, and return the
    learning rate at the minimum of the smoothed loss curve.

    The sweep aborts early once the loss exceeds 110% of the initial loss.
    The raw and smoothed loss curves plus a summary figure are written to
    tensorboard under `<experiment_path>/lr_search`.
    """
    train_eval_dataset = TrainEvalDataset(train_eval_data, transform=train_transform)
    train_eval_data_loader = torch.utils.data.DataLoader(
        train_eval_dataset,
        batch_size=config.batch_size,
        drop_last=True,
        shuffle=True,
        num_workers=args.workers,
        worker_init_fn=worker_init_fn)

    min_lr = 1e-7
    max_lr = 10.
    # Per-step multiplier chosen so one epoch spans exactly [min_lr, max_lr].
    gamma = (max_lr / min_lr)**(1 / len(train_eval_data_loader))

    lrs = []
    losses = []
    lim = None  # early-stop threshold, set from the first batch's loss

    model = Model(config.model, NUM_CLASSES)
    model = model.to(DEVICE)
    optimizer = build_optimizer(config.opt, model.parameters())
    for param_group in optimizer.param_groups:
        param_group['lr'] = min_lr
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma)

    # NOTE(review): `optimizer.train()` is not a torch.optim API — presumably
    # build_optimizer returns a project wrapper (e.g. Lookahead-style) that
    # defines train(); confirm, otherwise this line raises AttributeError.
    optimizer.train()
    update_transforms(1.)
    model.train()
    optimizer.zero_grad()
    for i, (images, feats, _, labels, real, _) in enumerate(
            tqdm(train_eval_data_loader, desc='lr search'), 1):
        images, feats, labels, real = images.to(DEVICE), feats.to(DEVICE), labels.to(DEVICE), real.to(DEVICE)
        logits = model(images, feats, labels)
        loss = compute_loss(input=logits, target=labels, real=real)

        lrs.append(np.squeeze(scheduler.get_lr()))
        losses.append(loss.data.cpu().numpy().mean())
        if lim is None:
            lim = losses[0] * 1.1
        if lim < losses[-1]:
            # Loss diverged past the threshold — stop the sweep.
            break

        # Gradient accumulation: step/zero/schedule every acc_steps batches.
        (loss.mean() / config.opt.acc_steps).backward()
        if i % config.opt.acc_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

    writer = SummaryWriter(os.path.join(args.experiment_path, 'lr_search'))

    with torch.no_grad():
        losses = np.clip(losses, 0, lim)
        # Pick the LR at the minimum of the smoothed loss curve.
        minima_loss = losses[np.argmin(utils.smooth(losses))]
        minima_lr = lrs[np.argmin(utils.smooth(losses))]

        step = 0
        for loss, loss_sm in zip(losses, utils.smooth(losses)):
            writer.add_scalar('search_loss', loss, global_step=step)
            writer.add_scalar('search_loss_sm', loss_sm, global_step=step)
            step += config.batch_size

        fig = plt.figure()
        plt.plot(lrs, losses)
        plt.plot(lrs, utils.smooth(losses))
        plt.axvline(minima_lr)
        plt.xscale('log')
        plt.title('loss: {:.8f}, lr: {:.8f}'.format(minima_loss, minima_lr))
        writer.add_figure('search', fig, global_step=0)

        return minima_lr
class Writer:
    """Experiment logger wrapping a tensorboard SummaryWriter.

    Prefixes every tag with a tag group, mirrors stdout/stderr into files in
    the log directory, writes JSON/text artifacts alongside the tensorboard
    events, and saves/loads checkpoints atomically.

    WARNING: constructing a Writer replaces the process-global `sys.stdout`
    and `sys.stderr` with Tee objects; the originals are kept in the class
    attributes below.
    """

    # Originals captured at class-definition time so a later instance can
    # still tee to the real streams.
    _STDOUT = sys.stdout
    _STDERR = sys.stderr

    def __init__(self, logdir, make_subdir, tag_group):
        """Create the log directory (optionally a timestamped subdirectory),
        open the SummaryWriter, and redirect stdout/stderr into it.

        Args:
            logdir: base log directory.
            make_subdir: if True, log into a timestamped child of `logdir`.
            tag_group: prefix prepended to every tag as "<group>/<tag>".
        """
        if make_subdir:
            os.makedirs(logdir, exist_ok=True)

            timestamp = f"{datetime.datetime.now().strftime('%b%d_%H-%M-%S')}"
            logdir = os.path.join(logdir, timestamp)

        self._writer = SummaryWriter(logdir=logdir)
        # Guard against SummaryWriter silently choosing a different directory.
        assert logdir == self._writer.logdir
        self._logdir = logdir

        self._tag_group = tag_group

        # Tee console output into files so every run keeps its own transcript.
        LINE_BUFFERING = 1

        sys.stdout = Tee(primary_file=self._STDOUT,
                         secondary_file=open(os.path.join(logdir, "stdout"),
                                             "a",
                                             buffering=LINE_BUFFERING))

        sys.stderr = Tee(primary_file=self._STDERR,
                         secondary_file=open(os.path.join(logdir, "stderr"),
                                             "a",
                                             buffering=LINE_BUFFERING))

    def write_scalar(self, tag, scalar_value, global_step=None):
        """Log a scalar under the group-prefixed tag."""
        self._writer.add_scalar(self._tag(tag), scalar_value, global_step=global_step)

    def write_image(self, tag, img_tensor, global_step=None):
        """Log an image under the group-prefixed tag."""
        self._writer.add_image(self._tag(tag), img_tensor, global_step=global_step)

    def write_figure(self, tag, figure, global_step=None):
        """Log a matplotlib figure under the group-prefixed tag."""
        self._writer.add_figure(self._tag(tag), figure, global_step=global_step)

    def write_hparams(self, hparam_dict=None, metric_dict=None):
        """Log hyperparameters and final metrics (tags are NOT group-prefixed)."""
        self._writer.add_hparams(hparam_dict=hparam_dict, metric_dict=metric_dict)

    def write_json(self, tag, data):
        """Log `data` as pretty-printed JSON to tensorboard (as text) and
        also save it to `<logdir>/<tag>.json`."""
        text = json.dumps(data, indent=4)

        self._writer.add_text(
            self._tag(tag),
            4 * " " + text.replace("\n", "\n" + 4 * " ")  # Indent by 4 to ensure codeblock formatting
        )

        json_path = os.path.join(self._logdir, f"{tag}.json")
        with open(json_path, "w") as f:
            f.write(text)

    def write_textfile(self, tag, text):
        """Save raw text to `<logdir>/<tag>.txt` (file only, no tensorboard)."""
        path = os.path.join(self._logdir, f"{tag}.txt")
        with open(path, "w") as f:
            f.write(text)

    def write_checkpoint(self, tag, data):
        """Atomically save `data` (via torch.save) to the checkpoints dir:
        write to a .tmp file first, then rename over the target."""
        os.makedirs(self._checkpoints_dir, exist_ok=True)
        checkpoint_path = self._checkpoint_path(tag)

        tmp_checkpoint_path = os.path.join(
            os.path.dirname(checkpoint_path),
            f"{os.path.basename(checkpoint_path)}.tmp")

        torch.save(data, tmp_checkpoint_path)
        # replace is atomic, so we guarantee our checkpoints are always good
        os.replace(tmp_checkpoint_path, checkpoint_path)

    def load_checkpoint(self, tag, device):
        """Load a checkpoint previously written with write_checkpoint."""
        return torch.load(self._checkpoint_path(tag), map_location=device)

    def _checkpoint_path(self, tag):
        # Checkpoint files live under <logdir>/checkpoints/<tag>.pt.
        return os.path.join(self._checkpoints_dir, f"{tag}.pt")

    @property
    def _checkpoints_dir(self):
        return os.path.join(self._logdir, "checkpoints")

    def _tag(self, tag):
        # All tensorboard tags share one namespace per writer.
        return f"{self._tag_group}/{tag}"
def run(seed):
    """Train and evaluate a VAE with a flow-based prior/posterior on a binarized
    image dataset (MNIST / Fashion-MNIST / Omniglot / EMNIST, per `args`).

    Trains for `args.num_training_steps` steps on GPU, checkpoints the model
    with the best validation ELBO, then reloads it and reports test-set ELBO
    and an importance-weighted log-likelihood lower bound, saving results to
    the run's log directory.
    """
    assert torch.cuda.is_available()
    device = torch.device('cuda')
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

    np.random.seed(seed)
    torch.manual_seed(seed)

    # Create training data.
    # Images are dynamically binarized: each pixel is sampled as a Bernoulli.
    data_transform = tvtransforms.Compose(
        [tvtransforms.ToTensor(),
         tvtransforms.Lambda(torch.bernoulli)])

    if args.dataset_name == 'mnist':
        dataset = datasets.MNIST(root=os.path.join(utils.get_data_root(), 'mnist'),
                                 train=True,
                                 download=True,
                                 transform=data_transform)
        test_dataset = datasets.MNIST(root=os.path.join(
            utils.get_data_root(), 'mnist'),
                                      train=False,
                                      download=True,
                                      transform=data_transform)
    elif args.dataset_name == 'fashion-mnist':
        dataset = datasets.FashionMNIST(root=os.path.join(
            utils.get_data_root(), 'fashion-mnist'),
                                        train=True,
                                        download=True,
                                        transform=data_transform)
        test_dataset = datasets.FashionMNIST(root=os.path.join(
            utils.get_data_root(), 'fashion-mnist'),
                                             train=False,
                                             download=True,
                                             transform=data_transform)
    elif args.dataset_name == 'omniglot':
        dataset = data_.OmniglotDataset(split='train', transform=data_transform)
        test_dataset = data_.OmniglotDataset(split='test', transform=data_transform)
    elif args.dataset_name == 'emnist':
        # EMNIST images come rotated/flipped; undo that before binarizing.
        rotate = partial(tvF.rotate, angle=-90)
        hflip = tvF.hflip
        data_transform = tvtransforms.Compose([
            tvtransforms.Lambda(rotate),
            tvtransforms.Lambda(hflip),
            tvtransforms.ToTensor(),
            tvtransforms.Lambda(torch.bernoulli)
        ])
        dataset = datasets.EMNIST(root=os.path.join(utils.get_data_root(), 'emnist'),
                                  split='letters',
                                  train=True,
                                  transform=data_transform,
                                  download=True)
        test_dataset = datasets.EMNIST(root=os.path.join(
            utils.get_data_root(), 'emnist'),
                                       split='letters',
                                       train=False,
                                       transform=data_transform,
                                       download=True)
    else:
        raise ValueError

    # Carve a validation split off the end of the shuffled train indices;
    # split sizes are dataset-specific.
    if args.dataset_name == 'omniglot':
        split = -1345
    elif args.dataset_name == 'emnist':
        split = -20000
    else:
        split = -10000
    indices = np.arange(len(dataset))
    np.random.shuffle(indices)
    train_indices, val_indices = indices[:split], indices[split:]
    train_sampler = SubsetRandomSampler(train_indices)
    val_sampler = SubsetRandomSampler(val_indices)
    train_loader = data.DataLoader(
        dataset=dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=4 if args.dataset_name == 'emnist' else 0)
    train_generator = data_.batch_generator(train_loader)
    val_loader = data.DataLoader(dataset=dataset,
                                 batch_size=1024,
                                 sampler=val_sampler,
                                 shuffle=False,
                                 drop_last=False)
    # A single fixed validation batch is reused for monitoring.
    val_batch = next(iter(val_loader))[0]
    test_loader = data.DataLoader(
        test_dataset,
        batch_size=16,
        shuffle=False,
        drop_last=False,
    )

    def create_linear_transform():
        """Invertible linear layer used between flow steps, per args.linear_type."""
        if args.linear_type == 'lu':
            return transforms.CompositeTransform([
                transforms.RandomPermutation(args.latent_features),
                transforms.LULinear(args.latent_features, identity_init=True)
            ])
        elif args.linear_type == 'svd':
            return transforms.SVDLinear(args.latent_features,
                                        num_householder=4,
                                        identity_init=True)
        elif args.linear_type == 'perm':
            return transforms.RandomPermutation(args.latent_features)
        else:
            raise ValueError

    def create_base_transform(i, context_features=None):
        """One flow step (coupling or autoregressive, affine or rational-quadratic).

        `i` alternates the coupling mask parity between steps; `context_features`
        enables conditioning (used by the approximate posterior).
        """
        if args.prior_type == 'affine-coupling':
            return transforms.AffineCouplingTransform(
                mask=utils.create_alternating_binary_mask(
                    features=args.latent_features, even=(i % 2 == 0)),
                transform_net_create_fn=lambda in_features, out_features: nn_.
                ResidualNet(in_features=in_features,
                            out_features=out_features,
                            hidden_features=args.hidden_features,
                            context_features=context_features,
                            num_blocks=args.num_transform_blocks,
                            activation=F.relu,
                            dropout_probability=args.dropout_probability,
                            use_batch_norm=args.use_batch_norm))
        elif args.prior_type == 'rq-coupling':
            return transforms.PiecewiseRationalQuadraticCouplingTransform(
                mask=utils.create_alternating_binary_mask(
                    features=args.latent_features, even=(i % 2 == 0)),
                transform_net_create_fn=lambda in_features, out_features: nn_.
                ResidualNet(in_features=in_features,
                            out_features=out_features,
                            hidden_features=args.hidden_features,
                            context_features=context_features,
                            num_blocks=args.num_transform_blocks,
                            activation=F.relu,
                            dropout_probability=args.dropout_probability,
                            use_batch_norm=args.use_batch_norm),
                num_bins=args.num_bins,
                tails='linear',
                tail_bound=args.tail_bound,
                apply_unconditional_transform=args.
                apply_unconditional_transform,
            )
        elif args.prior_type == 'affine-autoregressive':
            return transforms.MaskedAffineAutoregressiveTransform(
                features=args.latent_features,
                hidden_features=args.hidden_features,
                context_features=context_features,
                num_blocks=args.num_transform_blocks,
                use_residual_blocks=True,
                random_mask=False,
                activation=F.relu,
                dropout_probability=args.dropout_probability,
                use_batch_norm=args.use_batch_norm)
        elif args.prior_type == 'rq-autoregressive':
            return transforms.MaskedPiecewiseRationalQuadraticAutoregressiveTransform(
                features=args.latent_features,
                hidden_features=args.hidden_features,
                context_features=context_features,
                num_bins=args.num_bins,
                tails='linear',
                tail_bound=args.tail_bound,
                num_blocks=args.num_transform_blocks,
                use_residual_blocks=True,
                random_mask=False,
                activation=F.relu,
                dropout_probability=args.dropout_probability,
                use_batch_norm=args.use_batch_norm)
        else:
            raise ValueError

    # ---------------
    # prior
    # ---------------
    def create_prior():
        """Latent prior: standard normal, or a normalizing flow over it."""
        if args.prior_type == 'standard-normal':
            prior = distributions_.StandardNormal((args.latent_features, ))
        else:
            distribution = distributions_.StandardNormal(
                (args.latent_features, ))
            transform = transforms.CompositeTransform([
                transforms.CompositeTransform(
                    [create_linear_transform(),
                     create_base_transform(i)])
                for i in range(args.num_flow_steps)
            ])
            transform = transforms.CompositeTransform(
                [transform, create_linear_transform()])
            prior = flows.Flow(transform, distribution)

        return prior

    # ---------------
    # inputs encoder
    # ---------------
    def create_inputs_encoder():
        """Conv encoder producing the context for a flow posterior; None when
        the diagonal-normal posterior encodes inputs itself."""
        if args.approximate_posterior_type == 'diagonal-normal':
            inputs_encoder = None
        else:
            inputs_encoder = nn_.ConvEncoder(
                context_features=args.context_features,
                channels_multiplier=16,
                dropout_probability=args.dropout_probability_encoder_decoder)
        return inputs_encoder

    # ---------------
    # approximate posterior
    # ---------------
    def create_approximate_posterior():
        """q(z|x): a conditional diagonal normal, optionally pushed through an
        inverse flow conditioned on the encoded inputs."""
        if args.approximate_posterior_type == 'diagonal-normal':
            context_encoder = nn_.ConvEncoder(
                context_features=args.context_features,
                channels_multiplier=16,
                dropout_probability=args.dropout_probability_encoder_decoder)
            approximate_posterior = distributions_.ConditionalDiagonalNormal(
                shape=[args.latent_features], context_encoder=context_encoder)
        else:
            context_encoder = nn.Linear(args.context_features,
                                        2 * args.latent_features)
            distribution = distributions_.ConditionalDiagonalNormal(
                shape=[args.latent_features], context_encoder=context_encoder)

            transform = transforms.CompositeTransform([
                transforms.CompositeTransform([
                    create_linear_transform(),
                    create_base_transform(
                        i, context_features=args.context_features)
                ]) for i in range(args.num_flow_steps)
            ])
            transform = transforms.CompositeTransform(
                [transform, create_linear_transform()])
            # Inverse so sampling from the posterior is the fast direction.
            approximate_posterior = flows.Flow(
                transforms.InverseTransform(transform), distribution)

        return approximate_posterior

    # ---------------
    # likelihood
    # ---------------
    def create_likelihood():
        """p(x|z): independent Bernoulli pixels decoded from the latent."""
        latent_decoder = nn_.ConvDecoder(
            latent_features=args.latent_features,
            channels_multiplier=16,
            dropout_probability=args.dropout_probability_encoder_decoder)

        likelihood = distributions_.ConditionalIndependentBernoulli(
            shape=[1, 28, 28], context_encoder=latent_decoder)

        return likelihood

    prior = create_prior()
    approximate_posterior = create_approximate_posterior()
    likelihood = create_likelihood()
    inputs_encoder = create_inputs_encoder()

    model = vae.VariationalAutoencoder(
        prior=prior,
        approximate_posterior=approximate_posterior,
        likelihood=likelihood,
        inputs_encoder=inputs_encoder)

    n_params = utils.get_num_parameters(model)
    print('There are {} trainable parameters in this model.'.format(n_params))

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer=optimizer, T_max=args.num_training_steps, eta_min=0)

    def get_kl_multiplier(step):
        """KL annealing weight: constant, or linear warm-up from 1x to 2x of
        the initial multiplier over `kl_warmup_fraction` of training."""
        if args.kl_multiplier_schedule == 'constant':
            return args.kl_multiplier_initial
        elif args.kl_multiplier_schedule == 'linear':
            multiplier = min(
                step / (args.num_training_steps * args.kl_warmup_fraction),
                1.)
            return args.kl_multiplier_initial * (1. + multiplier)

    # create summary writer and write to log directory
    timestamp = cutils.get_timestamp()
    if cutils.on_cluster():
        timestamp += '||{}'.format(os.environ['SLURM_JOB_ID'])
    log_dir = os.path.join(cutils.get_log_root(), args.dataset_name, timestamp)
    # Retry on name collision (several jobs can share a timestamp on a cluster).
    while True:
        try:
            writer = SummaryWriter(log_dir=log_dir, max_queue=20)
            break
        except FileExistsError:
            sleep(5)
    filename = os.path.join(log_dir, 'config.json')
    with open(filename, 'w') as file:
        json.dump(vars(args), file)

    best_val_elbo = -np.inf
    tbar = tqdm(range(args.num_training_steps))
    for step in tbar:
        model.train()
        optimizer.zero_grad()

        batch = next(train_generator)[0].to(device)
        elbo = model.stochastic_elbo(batch,
                                     kl_multiplier=get_kl_multiplier(step))
        loss = -torch.mean(elbo)
        loss.backward()
        optimizer.step()
        scheduler.step(step)

        if (step + 1) % args.monitor_interval == 0:
            # Monitor on the fixed validation batch; checkpoint on improvement.
            model.eval()
            with torch.no_grad():
                elbo = model.stochastic_elbo(val_batch.to(device))
                mean_val_elbo = elbo.mean()

            if mean_val_elbo > best_val_elbo:
                best_val_elbo = mean_val_elbo
                path = os.path.join(
                    cutils.get_checkpoint_root(),
                    '{}-best-val-{}.t'.format(args.dataset_name, timestamp))
                torch.save(model.state_dict(), path)
            writer.add_scalar(tag='val-elbo',
                              scalar_value=mean_val_elbo,
                              global_step=step)
            writer.add_scalar(tag='best-val-elbo',
                              scalar_value=best_val_elbo,
                              global_step=step)

            with torch.no_grad():
                samples = model.sample(64)
            fig, ax = plt.subplots(figsize=(10, 10))
            cutils.gridimshow(make_grid(samples.view(64, 1, 28, 28), nrow=8),
                              ax)
            writer.add_figure(tag='vae-samples', figure=fig, global_step=step)
            plt.close()

    # load best val model
    path = os.path.join(
        cutils.get_checkpoint_root(),
        '{}-best-val-{}.t'.format(args.dataset_name, timestamp))
    model.load_state_dict(torch.load(path))
    model.eval()

    # Fixed seeds for a reproducible evaluation pass.
    np.random.seed(5)
    torch.manual_seed(5)

    # compute elbo on test set
    with torch.no_grad():
        elbo = torch.Tensor([])
        log_prob_lower_bound = torch.Tensor([])
        for batch in tqdm(test_loader):
            elbo_ = model.stochastic_elbo(batch[0].to(device))
            elbo = torch.cat([elbo, elbo_])
            # Importance-weighted bound with 1000 samples (tighter than ELBO).
            log_prob_lower_bound_ = model.log_prob_lower_bound(
                batch[0].to(device), num_samples=1000)
            log_prob_lower_bound = torch.cat(
                [log_prob_lower_bound, log_prob_lower_bound_])
    path = os.path.join(
        log_dir, '{}-prior-{}-posterior-{}-elbo.npy'.format(
            args.dataset_name, args.prior_type,
            args.approximate_posterior_type))
    np.save(path, utils.tensor2numpy(elbo))
    path = os.path.join(
        log_dir, '{}-prior-{}-posterior-{}-log-prob-lower-bound.npy'.format(
            args.dataset_name, args.prior_type,
            args.approximate_posterior_type))
    np.save(path, utils.tensor2numpy(log_prob_lower_bound))

    # save elbo and log prob lower bound
    mean_elbo = elbo.mean()
    std_elbo = elbo.std()
    mean_log_prob_lower_bound = log_prob_lower_bound.mean()
    std_log_prob_lower_bound = log_prob_lower_bound.std()
    # +- values are ~2 standard errors over the test set.
    s = 'ELBO: {:.2f} +- {:.2f}, LOG PROB LOWER BOUND: {:.2f} +- {:.2f}'.format(
        mean_elbo.item(), 2 * std_elbo.item() / np.sqrt(len(test_dataset)),
        mean_log_prob_lower_bound.item(),
        2 * std_log_prob_lower_bound.item() / np.sqrt(len(test_dataset)))
    filename = os.path.join(log_dir, 'test-results.txt')
    with open(filename, 'w') as file:
        file.write(s)
def train(logger):
    """
    Perform the training routine for a given fold.

    Runs `cf.num_epochs` epochs of detector training followed (optionally) by a
    validation pass per epoch, logging losses/metrics to TensorBoard and to the
    `monitor_metrics` dict, and delegating checkpoint selection to
    `utils.ModelSelector`. Saves plots and selected parameters to the experiment
    dir specified in the configs.

    :param logger: project logger used for progress/info messages.

    NOTE(review): relies on module-level names (`cf`, `model`, `utils`,
    `Evaluator`, `Predictor`, `data_loader`, `mutils`, `DiceLoss`,
    `plot_batch_prediction`) presumably bound by the surrounding script — not
    visible in this chunk.
    """
    logger.info('performing training in {}D over fold {} on experiment {} with model {}'.format(
        cf.dim, cf.fold, cf.exp_dir, cf.model))
    # TensorBoard event files go under the experiment directory.
    writer = SummaryWriter(os.path.join(cf.exp_dir,'tensorboard'))
    net = model.net(cf, logger).cuda()
    #print('finish initial network')
    # cf.learning_rate is indexed per epoch below; epoch 1 uses index 0.
    optimizer = torch.optim.Adam(net.parameters(), lr=cf.learning_rate[0], weight_decay=cf.weight_decay)
    #print('finish initial optimizer')
    model_selector = utils.ModelSelector(cf, logger)
    train_evaluator = Evaluator(cf, logger, mode='train')
    val_evaluator = Evaluator(cf, logger, mode=cf.val_mode)#val_sampling
    starting_epoch = 1
    # prepare monitoring
    #monitor_metrics, TrainingPlot = utils.prepare_monitoring(cf)
    #print('monitor_metrics',monitor_metrics)
    if cf.resume_to_checkpoint:#default: False
        # Resume: restore metric history and optimizer/net state; fast-forward
        # the global batch/val counters so TensorBoard steps stay monotonic.
        # NOTE(review): `best_epoch` is loaded but never used afterwards.
        best_epoch = np.load(cf.resume_to_checkpoint + 'epoch_ranking.npy')[0]
        df = open(cf.resume_to_checkpoint+'monitor_metrics.pickle','rb')
        monitor_metrics = pickle.load(df)
        df.close()
        starting_epoch = utils.load_checkpoint(cf.resume_to_checkpoint, net, optimizer)
        logger.info('resumed to checkpoint {} at epoch {}'.format(cf.resume_to_checkpoint, starting_epoch))
        num_batch = starting_epoch * cf.num_train_batches+1
        num_val = starting_epoch * cf.num_val_batches+1
    else:
        monitor_metrics = utils.prepare_monitoring(cf)
        num_batch = 0#for show loss
        num_val = 0
    logger.info('loading dataset and initializing batch generators...')
    batch_gen = data_loader.get_train_generators(cf, logger)
    #for k in batch_gen.keys():
    #    print('k in batch_gen are {}'.format(k))
    # NOTE(review): these two are never read or updated below.
    best_train_recall,best_val_recall = 0,0
    for epoch in range(starting_epoch, cf.num_epochs + 1):
        logger.info('starting training epoch {}'.format(epoch))
        # Per-epoch learning-rate schedule taken from the config table.
        for param_group in optimizer.param_groups:
            param_group['lr'] = cf.learning_rate[epoch - 1]
        start_time = time.time()
        net.train()
        train_results_list = []#this batch
        #print('net.train()')
        for bix in range(cf.num_train_batches):#200
            num_batch += 1
            batch = next(batch_gen['train'])#data,seg,pid,class_target,bb_target,roi_masks,roi_labels
            #print('training',batch['pid'])
            # Binarize ROI labels in place: any positive class -> [1], else [-1].
            for ii,i in enumerate(batch['roi_labels']):
                if i[0] > 0:
                    batch['roi_labels'][ii] = [1]
                else:
                    batch['roi_labels'][ii] = [-1]
            tic_fw = time.time()
            results_dict = net.train_forward(batch)
            tic_bw = time.time()
            optimizer.zero_grad()
            results_dict['torch_loss'].backward()#total loss
            optimizer.step()
            # Periodically dump a prediction figure for visual inspection.
            if (num_batch) % cf.show_train_images == 0:
                fig = plot_batch_prediction(batch, results_dict, cf,'train')
                writer.add_figure('/Train/results',fig,num_batch)
                fig.clear()
            logger.info('tr. batch {0}/{1} (ep. {2}) fw {3:.3f}s / bw {4:.3f}s / total {5:.3f}s || '
                        .format(bix + 1, cf.num_train_batches, epoch, tic_bw - tic_fw,
                                time.time() - tic_bw, time.time() - tic_fw) + results_dict['logger_string'])
            # Scalar loss curves, keyed by the global batch counter.
            writer.add_scalar('Train/total_loss',results_dict['torch_loss'].item(),num_batch)
            writer.add_scalar('Train/rpn_class_loss',results_dict['monitor_losses']['rpn_class_loss'],num_batch)
            writer.add_scalar('Train/rpn_bbox_loss',results_dict['monitor_losses']['rpn_bbox_loss'],num_batch)
            writer.add_scalar('Train/mrcnn_class_loss',results_dict['monitor_losses']['mrcnn_class_loss'],num_batch)
            writer.add_scalar('Train/mrcnn_bbox_loss',results_dict['monitor_losses']['mrcnn_bbox_loss'],num_batch)
            # Model-specific extra losses, selected by substring of the model path.
            if 'mrcnn' in cf.model_path:
                writer.add_scalar('Train/mrcnn_mask_loss',results_dict['monitor_losses']['mrcnn_mask_loss'],num_batch)
            if 'ufrcnn' in cf.model_path:
                writer.add_scalar('Train/seg_dice_loss',results_dict['monitor_losses']['seg_loss_dice'],num_batch)
            train_results_list.append([results_dict['boxes'], batch['pid']])#just gt and det
            monitor_metrics['train']['monitor_values'][epoch].append(results_dict['monitor_values'])
        # Epoch-level detection metrics: counts are
        # (tp_patient, tp_roi, fp_roi, total_num).
        count_train = train_evaluator.evaluate_predictions(train_results_list,epoch,cf,flag = 'train')
        print('tp_patient {}, tp_roi {}, fp_roi {}, total_num {}'.format(count_train[0],count_train[1],count_train[2],count_train[3]))
        # +0.01 guards precision against a zero denominator.
        precision = count_train[0]/ (count_train[0]+count_train[2]+0.01)
        # NOTE(review): recall has no such guard — raises ZeroDivisionError if
        # count_train[3] == 0; confirm the evaluator can never return 0 here.
        recall = count_train[0]/ (count_train[3])
        print('precision:{}, recall:{}'.format(precision,recall))
        monitor_metrics['train']['train_recall'].append(recall)
        # 'train_percision' [sic]: key spelling must match its consumers elsewhere.
        monitor_metrics['train']['train_percision'].append(precision)
        writer.add_scalar('Train/train_precision',precision,epoch)
        writer.add_scalar('Train/train_recall',recall,epoch)
        train_time = time.time() - start_time
        print('*'*50 + 'finish epoch {}'.format(epoch))
        logger.info('starting validation in mode {}.'.format(cf.val_mode))
        with torch.no_grad():
            net.eval()
            if cf.do_validation:
                val_results_list = []
                val_predictor = Predictor(cf, net, logger, mode='val')
                dice_val = []
                for _ in range(batch_gen['n_val']):#50
                    num_val += 1
                    batch = next(batch_gen[cf.val_mode])
                    #print('valing',batch['pid'])
                    # Same in-place ROI label binarization as in training.
                    for ii,i in enumerate(batch['roi_labels']):
                        if i[0] > 0:
                            batch['roi_labels'][ii] = [1]
                        else:
                            batch['roi_labels'][ii] = [-1]
                    # 'val_patient' runs full-patient inference; 'val_sampling'
                    # reuses the training forward pass in validation mode.
                    if cf.val_mode == 'val_patient':
                        results_dict = val_predictor.predict_patient(batch)
                    elif cf.val_mode == 'val_sampling':
                        results_dict = net.train_forward(batch, is_validation=True)
                    if (num_val) % cf.show_val_images == 0:
                        fig = plot_batch_prediction(batch, results_dict, cf,'val')
                        writer.add_figure('Val/results',fig,num_val)
                        fig.clear()
                    # Per-batch Dice on the segmentation head (dice = 1 - DiceLoss).
                    this_batch_seg_label = torch.FloatTensor(mutils.get_one_hot_encoding(batch['seg'], cf.num_seg_classes)).cuda()
                    this_batch_dice = DiceLoss()
                    dice = 1- this_batch_dice(F.softmax(results_dict['seg_logits'],dim=1),this_batch_seg_label)
                    #this_batch_dice = batch_dice(F.softmax(results_dict['seg_logits'],dim = 1),this_batch_seg_label,showdice = True)
                    dice_val.append(dice)
                    val_results_list.append([results_dict['boxes'], batch['pid']])
                    monitor_metrics['val']['monitor_values'][epoch].append(results_dict['monitor_values'])
                count_val = val_evaluator.evaluate_predictions(val_results_list,epoch,cf,flag = 'val')
                print('tp_patient {}, tp_roi {}, fp_roi {}, total_num {}'.format(count_val[0],count_val[1],count_val[2],count_val[3]))
                precision = count_val[0]/ (count_val[0]+count_val[2]+0.01)
                # NOTE(review): same unguarded division as the train recall above.
                recall = count_val[0]/ (count_val[3])
                print('precision:{}, recall:{}'.format(precision,recall))
                monitor_metrics['val']['val_recall'].append(recall)
                # 'val_percision' [sic]: keep spelling consistent with consumers.
                monitor_metrics['val']['val_percision'].append(precision)
                writer.add_scalar('Val/val_precision',precision,epoch)
                writer.add_scalar('Val/val_recall',recall,epoch)
                # Mean Dice over all validation batches this epoch.
                writer.add_scalar('Val/val_dice',sum(dice_val)/float(len(dice_val)),epoch)
                # Checkpoint selection based on the accumulated metric history.
                model_selector.run_model_selection(net, optimizer, monitor_metrics, epoch)
        # update monitoring and prediction plots
        #TrainingPlot.update_and_save(monitor_metrics, epoch)
        epoch_time = time.time() - start_time
        logger.info('trained epoch {}: took {} sec. ({} train / {} val)'.format(
            epoch, epoch_time, train_time, epoch_time-train_time))
    writer.close()