Example #1
def summarise(group_id, log_dir="data/saved/log", model_dir="data/saved/models"):
    seeded_runs = sorted(list(Path(log_dir).glob(f"**/{group_id}/seed-*")))
    print(f"Found a total of {len(seeded_runs)} seed runs in {group_id}")
    msg = f"Found no seeded runs for group_id: {group_id} in {log_dir}"
    assert len(seeded_runs) > 0, msg

    info_logs = OrderedDict()
    for seeded_run in seeded_runs:
        info_log_matches = list(Path(seeded_run).glob("**/info.log"))
        msg = f"expected to find a single info.log file, found {len(info_log_matches)}"
        assert len(info_log_matches) == 1, msg
        info_logs[seeded_run.stem] = info_log_matches[0]

    summary_log = []
    for seeded_run, info_log_path in info_logs.items():
        with open(info_log_path, "r") as f:
            log = f.read().splitlines()
        summary_log.extend(log)
    first_info_log = list(info_logs.values())[0]
    summary_log_name = f"summary-{'_'.join(list(info_logs.keys()))}.json"
    summary_log_path = first_info_log.parent / summary_log_name
    with open(summary_log_path, "w") as f:
        f.write("\n".join(summary_log))
    print(f"Wrote concatenated logs to {summary_log_path}")

    # retrieve the config from the first run
    rel_path = first_info_log.relative_to(log_dir).parent
    config_path = Path(model_dir) / rel_path / "config.json"
    assert config_path.exists(), f"Could not find config at {config_path}"
    config = read_json(config_path)

    logger = logging.getLogger("summary")

    # Some care is required with logging to avoid sending all experiment logs to the
    # same file: we avoid this by resetting the root logger before reconfiguring it

    # Remove all handlers associated with the root logger object
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=summary_log_path, level=logging.INFO)
    if not logger.handlers:
        logger.addHandler(logging.StreamHandler())

    log_summary(
        logger=logger,
        log_path=summary_log_path,
        eval_mode=config["eval_mode"],
        fixed_num_epochs=config["trainer"]["epochs"],
    )
Example #2
def summarise(group_id,
              log_dir="data/saved/log",
              model_dir="data/saved/models"):
    seeded_runs = sorted(list(Path(log_dir).glob(f"**/{group_id}/seed-*")))
    print(f"Found a total of {len(seeded_runs)} seed runs in {group_id}")

    info_logs = OrderedDict()
    for seeded_run in seeded_runs:
        info_log_matches = list(Path(seeded_run).glob("**/info.log"))
        msg = f"expected to find a single info.log file, found {len(info_log_matches)}"
        assert len(info_log_matches) == 1, msg
        info_logs[seeded_run.stem] = info_log_matches[0]

    summary_log = []
    for seeded_run, info_log_path in info_logs.items():
        with open(info_log_path, "r") as f:
            log = f.read().splitlines()
        summary_log.extend(log)
    first_info_log = list(info_logs.values())[0]
    summary_log_name = f"summary-{'_'.join(list(info_logs.keys()))}.json"
    summary_log_path = first_info_log.parent / summary_log_name
    with open(summary_log_path, "w") as f:
        f.write("\n".join(summary_log))
    print(f"Wrote summary log to {summary_log_path}")

    # retrieve the config from the first run
    rel_path = first_info_log.relative_to(log_dir).parent
    config_path = Path(model_dir) / rel_path / "config.json"
    assert config_path.exists(), f"Could not find config at {config_path}"
    config = read_json(config_path)

    logger = logging.getLogger("summary")
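    # NB: basicConfig has no effect if the root logger already has handlers, so the
    # filename below may be ignored when summarise is called repeatedly in one process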
    logging.basicConfig(filename=summary_log_path, level=logging.INFO)
    logger.addHandler(logging.StreamHandler())

    log_summary(
        logger=logger,
        log_path=summary_log_path,
        eval_mode=config["eval_mode"],
        fixed_num_epochs=config["trainer"]["epochs"],
    )
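
For context, here is a minimal sketch of how summarise might be wired to a command-line entry point; the flag names below are illustrative assumptions and are not taken from the original codebase.

import argparse

if __name__ == "__main__":
    # Hypothetical CLI wrapper around summarise(); flag names are assumptions
    parser = argparse.ArgumentParser()
    parser.add_argument("--group_id", required=True)
    parser.add_argument("--log_dir", default="data/saved/log")
    parser.add_argument("--model_dir", default="data/saved/models")
    args = parser.parse_args()
    summarise(group_id=args.group_id, log_dir=args.log_dir, model_dir=args.model_dir)
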
Example #3
def run_exp(config):
    warnings.filterwarnings('ignore')
    logger = config.get_logger('train')

    leaderboard_path = config._args.leaderboard
    Path(leaderboard_path).parent.mkdir(exist_ok=True, parents=True)
    with open(leaderboard_path, 'a') as f:
        txt_path = f"{config._log_dir}/preds.txt"
        print(txt_path, file=f, flush=True)

    expert_dims, raw_input_dims = compute_dims(config, logger)
    trn_config = compute_trn_config(config)

    if config._args.group_seed:
        seeds = [int(config._args.group_seed)]
    else:
        seeds = [int(x) for x in config._args.seeds.split(",")]

    # set up local filesystem on the cluster
    if socket.gethostname().endswith("cluster"):
        os.system(str(Path.home() / "configure_tmp_data.sh"))

    for ii, seed in enumerate(seeds):
        tic = time.time()
        logger.info(f"{ii + 1}/{len(seeds)} Setting experiment random seed to {seed}")
        set_seeds(seed)
        config["seed"] = seed

        # We use cls defaults for backwards compatibility with the MMIT configs.  In the
        # long run this should be handled by the json configs themselves
        cls_defaults = ["train", "val", "tiny", "challenge"]

        model = config.init(
            name='arch',
            module=module_arch,
            expert_dims=expert_dims,
            text_dim=config["experts"]["text_dim"],
            disable_nan_checks=config["disable_nan_checks"],
            spatial_feats=config["data_loader"]["args"].get("spatial_feats", False),
            task=config.get("task", "retrieval"),
            ce_shared_dim=config["experts"].get("ce_shared_dim", None),
            feat_aggregation=config["data_loader"]["args"]["feat_aggregation"],
            trn_config=trn_config,
            trn_cat=config["data_loader"]["args"].get("trn_cat", 0),
        )
        logger.info(model)

        data_loaders = config.init(
            name='data_loader',
            module=module_data,
            logger=logger,
            raw_input_dims=raw_input_dims,
            text_feat=config["experts"]["text_feat"],
            text_dim=config["experts"]["text_dim"],
            text_agg=config["experts"]["text_agg"],
            use_zeros_for_missing=config["experts"].get("use_zeros_for_missing", False),
            task=config.get("task", "retrieval"),
            cls_partitions=config.get("cls_partitions", cls_defaults)
        )

        if config.get("manual_linear_init", False):
            logger.info("manually setting init for linear layers")
            def init_weights(m):
                if isinstance(m, nn.Linear):
                    torch.nn.init.xavier_uniform_(m.weight)
                    m.bias.data.fill_(0.01)
            model.apply(init_weights)

        loss = config.init(name="loss", module=module_loss)
        metrics = [getattr(module_metric, met) for met in config['metrics']]
        trainable_params = filter(lambda p: p.requires_grad, model.parameters())

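        # Select the optimizer implementation named in the config: RAdam, Ranger and
        # SWATS come from their standalone implementations, everything else falls
        # back to torch.optim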
        if config["optimizer"]["type"] == "RAdam":
            optimizer = config.init('optimizer', radam, trainable_params)
        elif config["optimizer"]["type"] == "Ranger":
            optimizer = config.init('optimizer', ranger, trainable_params)
        elif config["optimizer"]["type"] == "SWATS":
            optimizer = config.init('optimizer', swats, trainable_params)
        else:
            optimizer = config.init('optimizer', torch.optim, trainable_params)

        if config["lr_scheduler"]["type"] == "StepLR":
            lr_scheduler = config.init('lr_scheduler', torch.optim.lr_scheduler,
                                       optimizer)
        else:
            lr_scheduler = config.init('lr_scheduler', cos_restart, optimizer)

        visualizer = config.init(
            name='visualizer',
            module=module_vis,
            exp_name=config._exper_name,
            web_dir=config._web_log_dir,
        )

        trainer = Trainer(
            model,
            loss,
            metrics,
            optimizer,
            config=config,
            data_loaders=data_loaders,
            lr_scheduler=lr_scheduler,
            mini_train=config._args.mini_train,
            disable_nan_checks=config["disable_nan_checks"],
            visualizer=visualizer,
            val_freq=config["trainer"].get("val_freq", 1),
            force_cpu_val=config.get("force_cpu_val", False),
            skip_first_n_saves=config["trainer"].get("skip_first_n_saves", 0),
            include_optim_in_ckpts=config["trainer"].get("include_optim_in_ckpts", 1),
            cache_targets=set(config.get("cache_targets", [])),
        )
        trainer.train()
        best_ckpt_path = config.save_dir / "trained_model.pth"
        duration = time.strftime('%Hh%Mm%Ss', time.gmtime(time.time() - tic))
        logger.info(f"Training took {duration}")

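        # Optionally re-run evaluation from the best checkpoint, with the config's
        # `eval_settings` block deep-merged over a copy of the training config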
        if config._config.get("eval_settings", False):
            eval_config = copy.deepcopy(config)
            merge(eval_config._config, config["eval_settings"], strategy=Strategy.REPLACE)
            eval_config._args.resume = best_ckpt_path
            evaluation(eval_config, logger=logger, trainer=trainer)

    # If multiple runs were conducted, report relevant statistics
    if len(seeds) > 1:
        log_summary(
            logger=logger,
            log_path=config.log_path,
            eval_mode=config["eval_mode"],
            fixed_num_epochs=config["trainer"]["epochs"],
        )
    print(f"Log file stored at {config.log_path}")

    # Report the location of the "best" checkpoint of the final seeded run (here
    # "best" corresponds to the model with the highest geometric mean over the
    # R@1, R@5 and R@10 metrics when a validation set is used, or simply the final
    # epoch of training for fixed-length schedules).
    print(f"The best performing ckpt can be found at {str(best_ckpt_path)}")
Example #4
def main(config):
    logger = config.get_logger('train')
    expert_dims, raw_input_dims = compute_dims(config, logger)
    seeds = [int(x) for x in config._args.seeds.split(",")]

    for seed in seeds:
        # Set the random initial seeds
        tic = time.time()
        logger.info(f"Setting experiment random seed to {seed}")
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        data_loaders = config.init(
            name='data_loader',
            module=module_data,
            raw_input_dims=raw_input_dims,
            text_feat=config["experts"]["text_feat"],
            text_dim=config["experts"]["text_dim"],
        )

        model = config.init(
            name='arch',
            module=module_arch,
            expert_dims=expert_dims,
            text_dim=config["experts"]["text_dim"],
            disable_nan_checks=config["disable_nan_checks"],
        )
        logger.info(model)

        loss = config.init(name="loss", module=module_loss)
        metrics = [getattr(module_metric, met) for met in config['metrics']]
        trainable_params = filter(lambda p: p.requires_grad,
                                  model.parameters())

        optimizer = config.init('optimizer', torch.optim, trainable_params)
        lr_scheduler = config.init('lr_scheduler', torch.optim.lr_scheduler,
                                   optimizer)
        visualizer = config.init(
            name='visualizer',
            module=module_vis,
            exp_name=config._exper_name,
            log_dir=config._web_log_dir,
        )

        trainer = Trainer(
            model,
            loss,
            metrics,
            optimizer,
            config=config,
            data_loaders=data_loaders,
            lr_scheduler=lr_scheduler,
            mini_train=config._args.mini_train,
            disable_nan_checks=config["disable_nan_checks"],
            visualizer=visualizer,
            skip_first_n_saves=config["trainer"].get("skip_first_n_saves", 0),
            include_optim_in_ckpts=config["trainer"].get(
                "include_optim_in_ckpts", False),
        )
        trainer.train()
        best_ckpt_path = config.save_dir / "trained_model.pth"
        duration = time.strftime('%Hh%Mm%Ss', time.gmtime(time.time() - tic))
        logger.info(f"Training took {duration}")

        # If the dataset supports separate validation/test splits, the training config
        # json should specify an `eval_config` entry with the path to the test
        # configuration
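        # For illustration only (the path is hypothetical), such an entry might look
        # like: "eval_config": "configs/eval/test-split.json"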
        if config._config.get("eval_config", False):
            eval_args = argparse.ArgumentParser()
            eval_args.add_argument("--config", default=config["eval_config"])
            eval_args.add_argument("--device", default=config._args.device)
            eval_args.add_argument("--resume", default=best_ckpt_path)
            eval_config = ConfigParser(eval_args, slave_mode=True)
            evaluation(eval_config, logger=logger)

    # If multiple runs were conducted, report relevant statistics
    if len(seeds) > 1:
        log_summary(
            logger=logger,
            log_path=config.log_path,
            eval_mode=config["eval_mode"],
            fixed_num_epochs=config["trainer"]["epochs"],
        )
    print(f"Log file stored at {config.log_path}")

    # Report the location of the "best" checkpoint of the final seeded run (here
    # "best" corresponds to the model with the highest geometric mean over the
    # R@1, R@5 and R@10 metrics when a validation set is used, or simply the final
    # epoch of training for fixed-length schedules).
    print(f"The best performing ckpt can be found at {str(best_ckpt_path)}")