def summarise(group_id, log_dir="data/saved/log", model_dir="data/saved/models"):
    seeded_runs = sorted(list(Path(log_dir).glob(f"**/{group_id}/seed-*")))
    print(f"Found a total of {len(seeded_runs)} seed runs in {group_id}")
    msg = f"Found no seeded runs for group_id: {group_id} in {log_dir}"
    assert len(seeded_runs) > 0, msg

    info_logs = OrderedDict()
    for seeded_run in seeded_runs:
        info_log_matches = list(Path(seeded_run).glob("**/info.log"))
        msg = f"expected to find a single info.log file, found {len(info_log_matches)}"
        assert len(info_log_matches) == 1, msg
        info_logs[seeded_run.stem] = info_log_matches[0]

    summary_log = []
    for seeded_run, info_log_path in info_logs.items():
        with open(info_log_path, "r") as f:
            log = f.read().splitlines()
        summary_log.extend(log)

    first_info_log = list(info_logs.values())[0]
    summary_log_name = f"summary-{'_'.join(list(info_logs.keys()))}.json"
    summary_log_path = first_info_log.parent / summary_log_name
    with open(summary_log_path, "w") as f:
        f.write("\n".join(summary_log))
    print(f"Wrote concatenated logs to {summary_log_path}")

    # retrieve the config from the first run
    rel_path = first_info_log.relative_to(log_dir).parent
    config_path = Path(model_dir) / rel_path / "config.json"
    assert config_path.exists(), f"Could not find config at {config_path}"
    config = read_json(config_path)

    logger = logging.getLogger("summary")
    # Some care is required with logging to avoid sending all experiment logs
    # to the same file. We avoid this by essentially resetting the logging
    # utility: remove all handlers associated with the root logger object.
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=summary_log_path, level=logging.INFO)
    if not logger.handlers:
        logger.addHandler(logging.StreamHandler())

    log_summary(
        logger=logger,
        log_path=summary_log_path,
        eval_mode=config["eval_mode"],
        fixed_num_epochs=config["trainer"]["epochs"],
    )
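
# A minimal command-line wrapper sketch for `summarise` (not part of the
# original script; the flag names are illustrative assumptions, and argparse
# is assumed to be imported at module level, as `main` below already uses it).
# It mirrors the function's defaults so only the group identifier is required.
def summarise_cli():
    parser = argparse.ArgumentParser(description="Aggregate logs over seeded runs")
    parser.add_argument("--group_id", required=True,
                        help="identifier shared by the seed-* runs to summarise")
    parser.add_argument("--log_dir", default="data/saved/log")
    parser.add_argument("--model_dir", default="data/saved/models")
    args = parser.parse_args()
    summarise(group_id=args.group_id, log_dir=args.log_dir, model_dir=args.model_dir)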
def summarise(group_id, log_dir="data/saved/log", model_dir="data/saved/models"):
    seeded_runs = sorted(list(Path(log_dir).glob(f"**/{group_id}/seed-*")))
    print(f"Found a total of {len(seeded_runs)} seed runs in {group_id}")

    info_logs = OrderedDict()
    for seeded_run in seeded_runs:
        info_log_matches = list(Path(seeded_run).glob("**/info.log"))
        msg = f"expected to find a single info.log file, found {len(info_log_matches)}"
        assert len(info_log_matches) == 1, msg
        info_logs[seeded_run.stem] = info_log_matches[0]

    summary_log = []
    for seeded_run, info_log_path in info_logs.items():
        with open(info_log_path, "r") as f:
            log = f.read().splitlines()
        summary_log.extend(log)

    first_info_log = list(info_logs.values())[0]
    summary_log_name = f"summary-{'_'.join(list(info_logs.keys()))}.json"
    summary_log_path = first_info_log.parent / summary_log_name
    with open(summary_log_path, "w") as f:
        f.write("\n".join(summary_log))
    print(f"Wrote summary log to {summary_log_path}")

    # retrieve the config from the first run
    rel_path = first_info_log.relative_to(log_dir).parent
    config_path = Path(model_dir) / rel_path / "config.json"
    assert config_path.exists(), f"Could not find config at {config_path}"
    config = read_json(config_path)

    logger = logging.getLogger("summary")
    logging.basicConfig(filename=summary_log_path, level=logging.INFO)
    logger.addHandler(logging.StreamHandler())

    log_summary(
        logger=logger,
        log_path=summary_log_path,
        eval_mode=config["eval_mode"],
        fixed_num_epochs=config["trainer"]["epochs"],
    )
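
# `read_json` is assumed to be a small helper along these lines (a sketch; the
# real helper lives elsewhere in the repo, and `json` is assumed to be imported
# at module level). The OrderedDict hook keeps the on-disk key order of the
# config, consistent with the OrderedDict usage above.
def read_json(fname):
    with Path(fname).open("rt") as handle:
        return json.load(handle, object_hook=OrderedDict)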
def run_exp(config):
    warnings.filterwarnings('ignore')
    logger = config.get_logger('train')
    leaderboard_path = config._args.leaderboard
    Path(leaderboard_path).parent.mkdir(exist_ok=True, parents=True)
    with open(leaderboard_path, 'a') as f:
        txt_path = f"{config._log_dir}/preds.txt"
        print(txt_path, file=f, flush=True)

    expert_dims, raw_input_dims = compute_dims(config, logger)
    trn_config = compute_trn_config(config)

    if config._args.group_seed:
        seeds = [int(config._args.group_seed)]
    else:
        seeds = [int(x) for x in config._args.seeds.split(",")]

    # set up local filesystem on the cluster
    if socket.gethostname().endswith("cluster"):
        os.system(str(Path.home() / "configure_tmp_data.sh"))

    for ii, seed in enumerate(seeds):
        tic = time.time()
        logger.info(f"{ii + 1}/{len(seeds)} Setting experiment random seed to {seed}")
        set_seeds(seed)
        config["seed"] = seed

        # We use cls defaults for backwards compatibility with the MMIT configs.
        # In the long run this should be handled by the json configs themselves.
        cls_defaults = ["train", "val", "tiny", "challenge"]

        model = config.init(
            name='arch',
            module=module_arch,
            expert_dims=expert_dims,
            text_dim=config["experts"]["text_dim"],
            disable_nan_checks=config["disable_nan_checks"],
            spatial_feats=config["data_loader"]["args"].get("spatial_feats", False),
            task=config.get("task", "retrieval"),
            ce_shared_dim=config["experts"].get("ce_shared_dim", None),
            feat_aggregation=config["data_loader"]["args"]["feat_aggregation"],
            trn_config=trn_config,
            trn_cat=config["data_loader"]["args"].get("trn_cat", 0),
        )
        logger.info(model)

        data_loaders = config.init(
            name='data_loader',
            module=module_data,
            logger=logger,
            raw_input_dims=raw_input_dims,
            text_feat=config["experts"]["text_feat"],
            text_dim=config["experts"]["text_dim"],
            text_agg=config["experts"]["text_agg"],
            use_zeros_for_missing=config["experts"].get("use_zeros_for_missing", False),
            task=config.get("task", "retrieval"),
            cls_partitions=config.get("cls_partitions", cls_defaults),
        )

        if config.get("manual_linear_init", False):
            logger.info("manually setting init for linear layers")

            def init_weights(m):
                if isinstance(m, nn.Linear):
                    # use the in-place initialiser (the non-underscore variant
                    # is deprecated in recent PyTorch releases)
                    torch.nn.init.xavier_uniform_(m.weight)
                    m.bias.data.fill_(0.01)
            model.apply(init_weights)

        loss = config.init(name="loss", module=module_loss)
        metrics = [getattr(module_metric, met) for met in config['metrics']]
        trainable_params = filter(lambda p: p.requires_grad, model.parameters())

        if config["optimizer"]["type"] == "RAdam":
            optimizer = config.init('optimizer', radam, trainable_params)
        elif config["optimizer"]["type"] == "Ranger":
            optimizer = config.init('optimizer', ranger, trainable_params)
        elif config["optimizer"]["type"] == "SWATS":
            optimizer = config.init('optimizer', swats, trainable_params)
        else:
            optimizer = config.init('optimizer', torch.optim, trainable_params)

        if config["lr_scheduler"]["type"] == "StepLR":
            lr_scheduler = config.init('lr_scheduler', torch.optim.lr_scheduler,
                                       optimizer)
        else:
            lr_scheduler = config.init('lr_scheduler', cos_restart, optimizer)

        visualizer = config.init(
            name='visualizer',
            module=module_vis,
            exp_name=config._exper_name,
            web_dir=config._web_log_dir,
        )

        trainer = Trainer(
            model,
            loss,
            metrics,
            optimizer,
            config=config,
            data_loaders=data_loaders,
            lr_scheduler=lr_scheduler,
            mini_train=config._args.mini_train,
            disable_nan_checks=config["disable_nan_checks"],
            visualizer=visualizer,
            val_freq=config["trainer"].get("val_freq", 1),
            force_cpu_val=config.get("force_cpu_val", False),
            skip_first_n_saves=config["trainer"].get("skip_first_n_saves", 0),
include_optim_in_ckpts=config["trainer"].get("include_optim_in_ckpts", 1), cache_targets=set(config.get("cache_targets", [])), ) trainer.train() best_ckpt_path = config.save_dir / "trained_model.pth" duration = time.strftime('%Hh%Mm%Ss', time.gmtime(time.time() - tic)) logger.info(f"Training took {duration}") if config._config.get("eval_settings", False): eval_config = copy.deepcopy(config) merge(eval_config._config, config["eval_settings"], strategy=Strategy.REPLACE) eval_config._args.resume = best_ckpt_path evaluation(eval_config, logger=logger, trainer=trainer) # If multiple runs were conducted, report relevant statistics if len(seeds) > 1: log_summary( logger=logger, log_path=config.log_path, eval_mode=config["eval_mode"], fixed_num_epochs=config["trainer"]["epochs"], ) print(f"Log file stored at {config.log_path}") # Report the location of the "best" checkpoint of the final seeded run (here # "best" corresponds to the model with the highest geometric mean over the # R@1, R@5 and R@10 metrics when a validation set is used, or simply the final # epoch of training for fixed-length schedules). print(f"The best performing ckpt can be found at {str(best_ckpt_path)}")
def main(config):
    logger = config.get_logger('train')
    expert_dims, raw_input_dims = compute_dims(config, logger)
    seeds = [int(x) for x in config._args.seeds.split(",")]

    for seed in seeds:
        # Set the random initial seeds
        tic = time.time()
        logger.info(f"Setting experiment random seed to {seed}")
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        data_loaders = config.init(
            name='data_loader',
            module=module_data,
            raw_input_dims=raw_input_dims,
            text_feat=config["experts"]["text_feat"],
            text_dim=config["experts"]["text_dim"],
        )
        model = config.init(
            name='arch',
            module=module_arch,
            expert_dims=expert_dims,
            text_dim=config["experts"]["text_dim"],
            disable_nan_checks=config["disable_nan_checks"],
        )
        logger.info(model)

        loss = config.init(name="loss", module=module_loss)
        metrics = [getattr(module_metric, met) for met in config['metrics']]
        trainable_params = filter(lambda p: p.requires_grad, model.parameters())
        optimizer = config.init('optimizer', torch.optim, trainable_params)
        lr_scheduler = config.init('lr_scheduler', torch.optim.lr_scheduler, optimizer)
        visualizer = config.init(
            name='visualizer',
            module=module_vis,
            exp_name=config._exper_name,
            log_dir=config._web_log_dir,
        )

        trainer = Trainer(
            model,
            loss,
            metrics,
            optimizer,
            config=config,
            data_loaders=data_loaders,
            lr_scheduler=lr_scheduler,
            mini_train=config._args.mini_train,
            disable_nan_checks=config["disable_nan_checks"],
            visualizer=visualizer,
            skip_first_n_saves=config["trainer"].get("skip_first_n_saves", 0),
            include_optim_in_ckpts=config["trainer"].get("include_optim_in_ckpts",
                                                         False),
        )
        trainer.train()
        best_ckpt_path = config.save_dir / "trained_model.pth"
        duration = time.strftime('%Hh%Mm%Ss', time.gmtime(time.time() - tic))
        logger.info(f"Training took {duration}")

        # If the dataset supports separate validation/test splits, the training
        # config json should specify an `eval_config` entry with the path to the
        # test configuration
        if config._config.get("eval_config", False):
            eval_args = argparse.ArgumentParser()
            eval_args.add_argument("--config", default=config["eval_config"])
            eval_args.add_argument("--device", default=config._args.device)
            eval_args.add_argument("--resume", default=best_ckpt_path)
            eval_config = ConfigParser(eval_args, slave_mode=True)
            evaluation(eval_config, logger=logger)

    # If multiple runs were conducted, report relevant statistics
    if len(seeds) > 1:
        log_summary(
            logger=logger,
            log_path=config.log_path,
            eval_mode=config["eval_mode"],
            fixed_num_epochs=config["trainer"]["epochs"],
        )
        print(f"Log file stored at {config.log_path}")

    # Report the location of the "best" checkpoint of the final seeded run (here
    # "best" corresponds to the model with the highest geometric mean over the
    # R@1, R@5 and R@10 metrics when a validation set is used, or simply the final
    # epoch of training for fixed-length schedules).
    print(f"The best performing ckpt can be found at {str(best_ckpt_path)}")
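
# `run_exp` above calls a `set_seeds` helper, while `main` seeds each library
# inline. A minimal sketch of such a helper, assuming it wraps the same three
# calls (the CUDA line is an extra guess beyond what the code above shows):
def set_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # safe on CPU-only machines (deferred until CUDA init)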