def test_durations(self):
    self.assertEqual(utils.str_to_duration("42"), 42)
    self.assertEqual(utils.str_to_duration("60m"), 60 * 60)
    self.assertEqual(utils.str_to_duration("27h5m"), 27 * 3600 + 5 * 60)
    self.assertEqual(utils.str_to_duration("2d5m8"), 2 * 24 * 3600 + 5 * 60 + 8)
    self.assertEqual(utils.duration_to_str(60 * 60), "60m")
    self.assertEqual(utils.duration_to_str(3 * 60 * 60 + 11), "3h11s")
    self.assertEqual(utils.duration_to_str(8 * 60 + 3 * 60 * 60 * 24 + 11), "3d8m11s")
def run(self):
    self.bailing = False  # future use, to kill the Thread
    self.last_copy = "unknown"
    timer = elapsed.ElapsedTimer()
    while not self.bailing:
        self.audit()
        # TODO: honor different rescans per source
        if timer.once_every(self.rescan):
            self.run_all_scanners_once()
            self.reinventory()
        self.server_statuses = self.check_on_servers()
        self.logger.debug(f"JFYI: {self.server_statuses}")
        if self.full_p():
            self.logger.debug("I'm full (?) trying to drop")
            self.try_to_drop()
        elif "underserved" in self.server_statuses:
            self.try_to_copy()
        elif "available" in self.server_statuses and \
                self.last_copy != "not enough space":
            self.try_to_copy()
        else:
            sleep_time = self.rescan - timer.elapsed()
            sleep_msg = utils.duration_to_str(sleep_time)
            self.logger.info(f"sleeping {sleep_msg} til next rescan")
            time.sleep(sleep_time)
def run(self):
    self.bailout = False
    # pre-scan
    self.scanner.scan()
    self.logger.info("Ready to serve")
    self.handling = True
    while not self.bailout:
        timer = elapsed.ElapsedTimer()
        self.config.load()
        self.rescan = utils.get_interval(self.config, "rescan", self.context)
        self.scanner.scan()
        sleepy_time = max(self.rescan - timer.elapsed(), 10)
        sleep_msg = utils.duration_to_str(sleepy_time)
        self.logger.info(f"sleeping {sleep_msg} til next rescan")
        time.sleep(sleepy_time)
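# Hedged sketch (not from the original source): a minimal ElapsedTimer that would be
# consistent with the call sites above -- elapsed() returns the seconds since the timer
# was created, and once_every(interval) returns True at most once per `interval` seconds.
# The real `elapsed` module may differ; the names and semantics here are assumptions.
import time

class ElapsedTimer:
    def __init__(self):
        self._started = time.time()
        self._last_fired = self._started

    def elapsed(self):
        # Seconds since this timer was constructed
        return time.time() - self._started

    def once_every(self, interval):
        # True if at least `interval` seconds passed since the last time this returned True
        now = time.time()
        if now - self._last_fired >= interval:
            self._last_fired = now
            return True
        return False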
def tb_write_metrics(trainer):
    epoch = trainer.state.epoch
    max_epochs = trainer.state.max_epochs

    # Run on evaluation
    validator.run(val_dataloader, 1)

    # Common time
    wall_time = time.time()

    # Log all metrics to TB
    _write_metrics("train", trainer.state.metrics, epoch, wall_time)
    _write_metrics("val", validator.state.metrics, epoch, wall_time)

    train_loss = trainer.state.metrics.get(loss_name, 0)
    val_loss = validator.state.metrics.get(loss_name, 0)

    tb_write_histogram(writer, model, epoch, wall_time)

    print("Finished epoch {}/{}, loss {:.3f}, val loss {:.3f} (took {})".format(
        epoch, max_epochs, train_loss, val_loss,
        utils.duration_to_str(int(timer._elapsed()))))
def evaluate_model(run_name,
                   model,
                   optimizer,
                   device,
                   loss_name,
                   loss_params,
                   chosen_diseases,
                   dataloader,
                   experiment_mode="debug",
                   base_dir=utils.BASE_DIR):
    # Create tester engine
    tester = Engine(utilsT.get_step_fn(model, optimizer, device, loss_name, loss_params,
                                       training=False))

    loss_metric = RunningAverage(output_transform=lambda x: x[0], alpha=1)
    loss_metric.attach(tester, loss_name)

    utilsT.attach_metrics(tester, chosen_diseases, "prec", Precision, True)
    utilsT.attach_metrics(tester, chosen_diseases, "recall", Recall, True)
    utilsT.attach_metrics(tester, chosen_diseases, "acc", Accuracy, True)
    utilsT.attach_metrics(tester, chosen_diseases, "roc_auc", utilsT.RocAucMetric, False)
    utilsT.attach_metrics(tester, chosen_diseases, "cm", ConfusionMatrix,
                          get_transform_fn=utilsT.get_transform_cm, metric_args=(2,))

    timer = Timer(average=True)
    timer.attach(tester, start=Events.EPOCH_STARTED, step=Events.EPOCH_COMPLETED)

    # Save metrics
    log_metrics = list(ALL_METRICS)

    # Run test
    print("Testing...")
    tester.run(dataloader, 1)

    # Capture time
    secs_per_epoch = timer.value()
    duration_per_epoch = utils.duration_to_str(int(secs_per_epoch))
    print("Time elapsed in epoch: ", duration_per_epoch)

    # Copy metrics dict
    metrics = dict()
    original_metrics = tester.state.metrics
    for metric_name in log_metrics:
        for disease_name in chosen_diseases:
            key = metric_name + "_" + disease_name
            if key not in original_metrics:
                print("Metric not found in tester, skipping: ", key)
                continue
            metrics[key] = original_metrics[key]

    # Copy CMs
    for disease_name in chosen_diseases:
        key = "cm_" + disease_name
        if key not in original_metrics:
            print("CM not found in tester, skipping: ", key)
            continue
        cm = original_metrics[key]
        metrics[key] = cm.numpy().tolist()

    # Save to file
    folder = os.path.join(base_dir, "results", experiment_mode)
    os.makedirs(folder, exist_ok=True)

    fname = os.path.join(folder, run_name + ".json")
    with open(fname, "w+") as f:
        json.dump(metrics, f)
    print("Saved metrics to: ", fname)

    return metrics
    args = parser.parse_args()

    # If debugging, automatically set other values
    is_debug = not args.non_debug
    if is_debug:
        args.max_images = args.max_images or 100

    return args


if __name__ == "__main__":
    args = parse_args()

    experiment_mode = "debug"
    if args.non_debug:
        experiment_mode = ""

    start_time = time.time()

    metrics = main(args.run_name,
                   experiment_mode=experiment_mode,
                   base_dir=args.base_dir,
                   dataset_type=args.dataset_type,
                   max_images=args.max_images,
                   image_format=args.image_format,
                   batch_size=args.batch_size,
                   )

    end_time = time.time()

    print("-" * 50)
    print("Total testing time: ", utils.duration_to_str(int(end_time - start_time)))
    print("=" * 50)
def train_model(name="",
                resume="",
                base_dir=utils.BASE_DIR,
                model_name="v0",
                chosen_diseases=None,
                n_epochs=10,
                batch_size=4,
                oversample=False,
                max_os=None,
                shuffle=False,
                opt="sgd",
                opt_params={},
                loss_name="wbce",
                loss_params={},
                train_resnet=False,
                log_metrics=None,
                flush_secs=120,
                train_max_images=None,
                val_max_images=None,
                test_max_images=None,
                experiment_mode="debug",
                save=True,
                # Note that in this case, save_cms (to disk) includes write_cms (to TB)
                save_cms=True,
                write_graph=False,
                write_emb=False,
                write_emb_img=False,
                write_img=False,
                image_format="RGB",
                multiple_gpu=False,
                ):
    # Choose GPU
    device = utilsT.get_torch_device()
    print("Using device: ", device)

    # Common folders
    dataset_dir = os.path.join(base_dir, "dataset")

    # Dataset handling
    print("Loading train dataset...")
    train_dataset, train_dataloader = utilsT.prepare_data(
        dataset_dir,
        "train",
        chosen_diseases,
        batch_size,
        oversample=oversample,
        max_os=max_os,
        shuffle=shuffle,
        max_images=train_max_images,
        image_format=image_format,
    )
    train_samples, _ = train_dataset.size()

    print("Loading val dataset...")
    val_dataset, val_dataloader = utilsT.prepare_data(
        dataset_dir,
        "val",
        chosen_diseases,
        batch_size,
        max_images=val_max_images,
        image_format=image_format,
    )
    val_samples, _ = val_dataset.size()

    # Should be the same as chosen_diseases
    chosen_diseases = list(train_dataset.classes)
    print("Chosen diseases: ", chosen_diseases)

    if resume:
        # Load model and optimizer
        model, model_name, optimizer, opt, loss_name, loss_params, chosen_diseases = models.load_model(
            base_dir, resume, experiment_mode="", device=device)
        model.train(True)
    else:
        # Create model
        model = models.init_empty_model(model_name, chosen_diseases,
                                        train_resnet=train_resnet).to(device)

        # Create optimizer
        OptClass = optimizers.get_optimizer_class(opt)
        optimizer = OptClass(model.parameters(), **opt_params)
        # print("OPT: ", opt_params)

    # Allow multiple GPUs
    if multiple_gpu:
        model = DataParallel(model)

    # Tensorboard log options
    run_name = utils.get_timestamp()

    if name:
        run_name += "_{}".format(name)

    if len(chosen_diseases) == 1:
        run_name += "_{}".format(chosen_diseases[0])
    elif len(chosen_diseases) == 14:
        run_name += "_all"

    log_dir = get_log_dir(base_dir, run_name, experiment_mode=experiment_mode)

    print("Run name: ", run_name)
    print("Saved TB in: ", log_dir)

    writer = SummaryWriter(log_dir=log_dir, flush_secs=flush_secs)

    # Create validator engine
    validator = Engine(
        utilsT.get_step_fn(model, optimizer, device, loss_name, loss_params, False))

    val_loss = RunningAverage(output_transform=lambda x: x[0], alpha=1)
    val_loss.attach(validator, loss_name)

    utilsT.attach_metrics(validator, chosen_diseases, "prec", Precision, True)
    utilsT.attach_metrics(validator, chosen_diseases, "recall", Recall, True)
    utilsT.attach_metrics(validator, chosen_diseases, "acc", Accuracy, True)
    utilsT.attach_metrics(validator, chosen_diseases, "roc_auc", utilsT.RocAucMetric, False)
    utilsT.attach_metrics(validator, chosen_diseases, "cm", ConfusionMatrix,
                          get_transform_fn=utilsT.get_transform_cm, metric_args=(2,))
    utilsT.attach_metrics(validator, chosen_diseases, "positives", RunningAverage,
                          get_transform_fn=utilsT.get_count_positives)

    # Create trainer engine
    trainer = Engine(
        utilsT.get_step_fn(model, optimizer, device, loss_name, loss_params, True))

    train_loss = RunningAverage(output_transform=lambda x: x[0], alpha=1)
    train_loss.attach(trainer, loss_name)

    utilsT.attach_metrics(trainer, chosen_diseases, "acc", Accuracy, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "prec", Precision, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "recall", Recall, True)
    utilsT.attach_metrics(trainer, chosen_diseases, "roc_auc", utilsT.RocAucMetric, False)
    utilsT.attach_metrics(trainer, chosen_diseases, "cm", ConfusionMatrix,
                          get_transform_fn=utilsT.get_transform_cm, metric_args=(2,))
    utilsT.attach_metrics(trainer, chosen_diseases, "positives", RunningAverage,
                          get_transform_fn=utilsT.get_count_positives)

    timer = Timer(average=True)
    timer.attach(trainer, start=Events.EPOCH_STARTED, step=Events.EPOCH_COMPLETED)

    # TODO: Early stopping
    # def score_function(engine):
    #     val_loss = engine.state.metrics[loss_name]
    #     return -val_loss
    # handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer)
    # validator.add_event_handler(Events.COMPLETED, handler)

    # Metrics callbacks
    if log_metrics is None:
        log_metrics = list(ALL_METRICS)

    def _write_metrics(run_type, metrics, epoch, wall_time):
        loss = metrics.get(loss_name, 0)
        writer.add_scalar("Loss/" + run_type, loss, epoch, wall_time)

        for metric_base_name in log_metrics:
            for disease in chosen_diseases:
                metric_value = metrics.get(
                    "{}_{}".format(metric_base_name, disease), -1)
                writer.add_scalar(
                    "{}_{}/{}".format(metric_base_name, disease, run_type),
                    metric_value, epoch, wall_time)

    @trainer.on(Events.EPOCH_COMPLETED)
    def tb_write_metrics(trainer):
        epoch = trainer.state.epoch
        max_epochs = trainer.state.max_epochs

        # Run on evaluation
        validator.run(val_dataloader, 1)

        # Common time
        wall_time = time.time()

        # Log all metrics to TB
        _write_metrics("train", trainer.state.metrics, epoch, wall_time)
        _write_metrics("val", validator.state.metrics, epoch, wall_time)

        train_loss = trainer.state.metrics.get(loss_name, 0)
        val_loss = validator.state.metrics.get(loss_name, 0)

        tb_write_histogram(writer, model, epoch, wall_time)

        print("Finished epoch {}/{}, loss {:.3f}, val loss {:.3f} (took {})".format(
            epoch, max_epochs, train_loss, val_loss,
            utils.duration_to_str(int(timer._elapsed()))))

    # Hparam dict
    hparam_dict = {
        "resume": resume,
        "n_diseases": len(chosen_diseases),
        "diseases": ",".join(chosen_diseases),
        "n_epochs": n_epochs,
        "batch_size": batch_size,
        "shuffle": shuffle,
        "model_name": model_name,
        "opt": opt,
        "loss": loss_name,
        "samples (train, val)": "{},{}".format(train_samples, val_samples),
        "train_resnet": train_resnet,
        "multiple_gpu": multiple_gpu,
    }

    def copy_params(params_dict, base_name):
        for name, value in params_dict.items():
            hparam_dict["{}_{}".format(base_name, name)] = value

    copy_params(loss_params, "loss")
    copy_params(opt_params, "opt")

    print("HPARAM: ", hparam_dict)

    # Train
    print("-" * 50)
    print("Training...")
    trainer.run(train_dataloader, n_epochs)

    # Capture time
    secs_per_epoch = timer.value()
    duration_per_epoch = utils.duration_to_str(int(secs_per_epoch))
    print("Average time per epoch: ", duration_per_epoch)
    print("-" * 50)

    ## Write all hparams
    hparam_dict["duration_per_epoch"] = duration_per_epoch

    # FIXME: this is commented to avoid having too many hparams in TB frontend
    # def copy_metrics(engine, engine_name):
    #     for metric_name, metric_value in engine.state.metrics.items():
    #         hparam_dict["{}_{}".format(engine_name, metric_name)] = metric_value
    # copy_metrics(trainer, "train")
    # copy_metrics(validator, "val")

    print("Writing TB hparams")
    writer.add_hparams(hparam_dict, {})

    # Save model to disk
    if save:
        print("Saving model...")
        models.save_model(base_dir, run_name, model_name, experiment_mode, hparam_dict,
                          trainer, model, optimizer)

    # Write graph to TB
    if write_graph:
        print("Writing TB graph...")
        tb_write_graph(writer, model, train_dataloader, device)

    # Write embeddings to TB
    if write_emb:
        print("Writing TB embeddings...")
        image_size = 256 if write_emb_img else 0

        # FIXME: be able to select images (balanced, train vs val, etc)
        image_list = list(train_dataset.label_index["FileName"])[:1000]
        # disease = chosen_diseases[0]
        # positive = train_dataset.label_index[train_dataset.label_index[disease] == 1]
        # negative = train_dataset.label_index[train_dataset.label_index[disease] == 0]
        # positive_images = list(positive["FileName"])[:25]
        # negative_images = list(negative["FileName"])[:25]
        # image_list = positive_images + negative_images

        all_images, all_embeddings, all_predictions, all_ground_truths = gen_embeddings(
            model, train_dataset, device, image_list=image_list, image_size=image_size)
        tb_write_embeddings(
            writer,
            chosen_diseases,
            all_images,
            all_embeddings,
            all_predictions,
            all_ground_truths,
            global_step=n_epochs,
            use_images=write_emb_img,
            tag="1000_{}".format("img" if write_emb_img else "no_img"),
        )

    # Save confusion matrices (it is expensive to calculate them afterwards)
    if save_cms:
        print("Saving confusion matrices...")
        # Assure folder
        cms_dir = os.path.join(base_dir, "cms", experiment_mode)
        os.makedirs(cms_dir, exist_ok=True)
        base_fname = os.path.join(cms_dir, run_name)

        n_diseases = len(chosen_diseases)

        def extract_cms(metrics):
            """Extract confusion matrices from a metrics dict."""
            cms = []
            for disease in chosen_diseases:
                key = "cm_" + disease
                if key not in metrics:
                    cm = np.array([[-1, -1], [-1, -1]])
                else:
                    cm = metrics[key].numpy()
                cms.append(cm)
            return np.array(cms)

        # Train confusion matrix
        train_cms = extract_cms(trainer.state.metrics)
        np.save(base_fname + "_train", train_cms)
        tb_write_cms(writer, "train", chosen_diseases, train_cms)

        # Validation confusion matrix
        val_cms = extract_cms(validator.state.metrics)
        np.save(base_fname + "_val", val_cms)
        tb_write_cms(writer, "val", chosen_diseases, val_cms)

        # All confusion matrix (train + val)
        all_cms = train_cms + val_cms
        np.save(base_fname + "_all", all_cms)

        # Print to console
        if len(chosen_diseases) == 1:
            print("Train CM: ")
            print(train_cms[0])
            print("Val CM: ")
            print(val_cms[0])
            # print("Train CM 2: ")
            # print(trainer.state.metrics["cm_" + chosen_diseases[0]])
            # print("Val CM 2: ")
            # print(validator.state.metrics["cm_" + chosen_diseases[0]])

    if write_img:
        # NOTE: this option is not recommended, use Testing notebook to plot and analyze images
        print("Writing images to TB...")

        test_dataset, test_dataloader = utilsT.prepare_data(
            dataset_dir, "test", chosen_diseases, batch_size,
            max_images=test_max_images,
        )

        # TODO: add a way to select images?
        # image_list = list(test_dataset.label_index["FileName"])[:3]

        # Examples in test_dataset (with bboxes available):
        image_list = [
            # "00010277_000.png",  # (Effusion, Infiltrate, Mass, Pneumonia)
            # "00018427_004.png",  # (Atelectasis, Effusion, Mass)
            # "00021703_001.png",  # (Atelectasis, Effusion, Infiltrate)
            # "00028640_008.png",  # (Effusion, Infiltrate)
            # "00019124_104.png",  # (Pneumothorax)
            # "00019124_090.png",  # (Nodule)
            # "00020318_007.png",  # (Pneumothorax)
            "00000003_000.png",  # (0)
            # "00000003_001.png",  # (0)
            # "00000003_002.png",  # (0)
            "00000732_005.png",  # (Cardiomegaly, Pneumothorax)
            # "00012261_001.png",  # (Cardiomegaly, Pneumonia)
            # "00013249_033.png",  # (Cardiomegaly, Pneumonia)
            # "00029808_003.png",  # (Cardiomegaly, Pneumonia)
            # "00022215_012.png",  # (Cardiomegaly, Pneumonia)
            # "00011402_007.png",  # (Cardiomegaly, Pneumonia)
            # "00019018_007.png",  # (Cardiomegaly, Infiltrate)
            # "00021009_001.png",  # (Cardiomegaly, Infiltrate)
            # "00013670_151.png",  # (Cardiomegaly, Infiltrate)
            # "00005066_030.png",  # (Cardiomegaly, Infiltrate, Effusion)
            "00012288_000.png",  # (Cardiomegaly)
            "00008399_007.png",  # (Cardiomegaly)
            "00005532_000.png",  # (Cardiomegaly)
            "00005532_014.png",  # (Cardiomegaly)
            "00005532_016.png",  # (Cardiomegaly)
            "00005827_000.png",  # (Cardiomegaly)
            # "00006912_007.png",  # (Cardiomegaly)
            # "00007037_000.png",  # (Cardiomegaly)
            # "00007043_000.png",  # (Cardiomegaly)
            # "00012741_004.png",  # (Cardiomegaly)
            # "00007551_020.png",  # (Cardiomegaly)
            # "00007735_040.png",  # (Cardiomegaly)
            # "00008339_010.png",  # (Cardiomegaly)
            # "00008365_000.png",  # (Cardiomegaly)
            # "00012686_003.png",  # (Cardiomegaly)
        ]

        tb_write_images(writer, model, test_dataset, chosen_diseases, n_epochs, device,
                        image_list)

    # Close TB writer
    if experiment_mode != "debug":
        writer.close()

    # Run post_train
    print("-" * 50)
    print("Running post_train...")

    print("Loading test dataset...")
    test_dataset, test_dataloader = utilsT.prepare_data(
        dataset_dir, "test", chosen_diseases, batch_size, max_images=test_max_images)

    save_cms_with_names(run_name, experiment_mode, model, test_dataset, test_dataloader,
                        chosen_diseases)

    evaluate_model(run_name, model, optimizer, device, loss_name, loss_params,
                   chosen_diseases, test_dataloader, experiment_mode=experiment_mode,
                   base_dir=base_dir)

    # Return values for debugging
    model_run = ModelRun(model, run_name, model_name, chosen_diseases)
    if experiment_mode == "debug":
        model_run.save_debug_data(writer, trainer, validator, train_dataset,
                                  train_dataloader, val_dataset, val_dataloader)

    return model_run
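# Hedged usage sketch (not part of the original source): train_model can be invoked
# directly with keyword arguments from its signature. The values below are illustrative
# only; "Cardiomegaly" is one of the disease labels mentioned in the comments above,
# and every parameter not listed keeps its declared default.
model_run = train_model(
    name="cardiomegaly_baseline",
    chosen_diseases=["Cardiomegaly"],
    n_epochs=5,
    batch_size=8,
    shuffle=True,
    experiment_mode="debug",
    save=False,
)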