def __init__(self, config):
    # TODO: we should somehow say more loudly that we are reserving these properties
    # Besides, some properties are vital for the user to define, but they have no idea about them :|
    # TODO: even I do not know all the options available in the config :|
    if config.has('base_config'):
        self.config = Config.load(config.base_config)
        self.config.overwrite(config)
    else:
        self.config = config

    self._init_paths()

    # Reload the config if we are continuing training
    if os.path.exists(self.paths.config_path):
        print(f'Detected existing config: {self.paths.config_path}. Loading it...')

        # A dirty hack that ensures that multiple trainers stay in sync.
        # This is needed for a synced file system;
        # for some reason, portalocker does not work on a shared FS...
        time.sleep(1)

        self.config = Config.load(self.paths.config_path)
        self.config = self.config.overwrite(Config.read_from_cli())

    self._init_logger()
    self._init_devices()

    if self.is_main_process() and not os.path.exists(self.paths.config_path):
        self.config.save(self.paths.config_path)

    if not self.config.get('silent') and self.is_main_process():
        self.logger.info(f'Experiment directory: {self.paths.experiment_dir}')

    self._init_tb_writer()
    self._init_callbacks()
    self._init_checkpointing_strategy()
    self._init_validation_strategy()
    self._init_stopping_criteria()

    self.num_iters_done = 0
    self.num_epochs_done = 0
    self.is_explicitly_stopped = False
    self.train_dataloader = None
    self.val_dataloader = None
def load_config(args: argparse.Namespace, config_cli_args: List[str]) -> Config:
    base_config = Config.load('configs/base.yml')
    curr_config = Config.load(f'configs/{args.config_name}.yml')

    # Setting properties from the base config
    config = base_config.all.clone()
    config = config.overwrite(base_config.get(args.dataset))

    # Setting properties from the current config
    config = config.overwrite(curr_config.all)
    config = config.overwrite(curr_config.get(args.dataset, Config({})))

    # Setting experiment-specific properties
    config.set('experiments_dir', args.experiments_dir)
    config.set('random_seed', args.random_seed)

    # Overwriting with CLI arguments
    config = config.overwrite(Config.read_from_cli())
    config.set('exp_name', compute_experiment_name(args, config.hp))

    return config
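# A minimal sketch (not from the source) illustrating the layering performed by
# load_config above: each later overwrite() call takes precedence over the
# previous one, so dataset-specific values beat the shared ones and CLI
# arguments beat everything else. The keys used here are hypothetical.
base = Config({'lr': 1e-4, 'batch_size': 32})
dataset_specific = Config({'lr': 3e-4})
merged = base.overwrite(dataset_specific)
assert merged.lr == 3e-4        # the later config wins for overlapping keys
assert merged.batch_size == 32  # untouched keys fall through from the base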
def run(config_path: str, tb_port: int = None):
    config = Config.load(config_path)
    config = config.overwrite(Config.read_from_cli())

    if config.get('distributed_training.enabled'):
        import horovod.torch as hvd
        hvd.init()
        # Offset the seed by the Horovod rank so each worker uses a different seed
        fix_random_seed(config.random_seed + hvd.rank())
    else:
        fix_random_seed(config.random_seed)

    trainer = GANTrainer(config)

    if tb_port is not None and trainer.is_main_process():
        trainer.logger.info(f'Starting tensorboard on port {tb_port}')
        run_tensorboard(trainer.paths.experiment_dir, tb_port)

    trainer.start()
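# Hypothetical launch command for the distributed branch above. horovodrun is
# the standard Horovod launcher; the script and config names are placeholders:
#   horovodrun -np 4 python run.py configs/gan.yml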
def extract_data(summary_path: os.PathLike, logs_path: os.PathLike) -> Tuple[List[float], Dict, Optional[bytes], Optional[bytes]]:
    config = Config.load(summary_path).config
    events_acc = EventAccumulator(logs_path)
    events_acc.Reload()

    _, _, val_acc_diffs = zip(*events_acc.Scalars('diff/val/acc'))

    hp = config.hp.to_dict()
    hp['n_conv_layers'] = len(config.hp.conv_model_config.conv_sizes)

    if 'Minimum_test' in events_acc.images.Keys():
        image_test = events_acc.Images('Minimum_test')[0].encoded_image_string
        image_train = events_acc.Images('Minimum_train')[0].encoded_image_string
    else:
        image_test = None
        image_train = None

    return val_acc_diffs, hp, image_test, image_train
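# Hypothetical usage of extract_data above; the summary and logs paths are
# placeholders, not paths from the source.
val_acc_diffs, hp, image_test, image_train = extract_data(
    'experiments/my-exp/summary.yml', 'experiments/my-exp/logs')
print(f'Read {len(val_acc_diffs)} val-acc-diff points for hyperparams: {hp}')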
def run_trainer(args: argparse.Namespace, config_args: List[str]):
    # TODO: read some stuff from the command line and overwrite the config
    config = Config.load('configs/densepose-rcnn.yml')

    if args.local_rank is not None:
        config.set('gpus', [args.local_rank])
    else:
        config.set('gpus', args.gpus)

    config.set('experiments_dir', args.experiments_dir)

    config_args = process_cli_config_args(config_args)
    config = config.overwrite(Config(config_args))  # Overwrite with CLI arguments

    trainer = DensePoseRCNNTrainer(config)

    if args.validate_only:
        print('Running validation only...')
        trainer.init()
        trainer.val_dataloader = trainer.train_dataloader
        trainer.validate()
    else:
        trainer.start()
    type=str,
    default="stylegan2-ffhq-config-f.pt",
    help="path to the model checkpoint",
)
parser.add_argument(
    "--channel_multiplier",
    type=int,
    default=2,
    help="channel multiplier of the generator. config-f = 2, else = 1",
)

args, _ = parser.parse_known_args()

args.latent = 512
args.n_mlp = 8

config = Config.load('config.yml')
config = config.overwrite(Config.read_from_cli())

g_ema = Generator(config).to(device)
checkpoint = torch.load(args.ckpt)
g_ema.load_state_dict(checkpoint["g_ema"])

if args.truncation < 1:
    with torch.no_grad():
        mean_latent = g_ema.mean_latent(args.truncation_mean)
else:
    mean_latent = None

generate(args, g_ema, device, mean_latent)
f"experiments/{config.exp_name}/checkpoint/{str(i).zfill(6)}.pt" ) if __name__ == "__main__": device = "cuda" parser = argparse.ArgumentParser(description="StyleGAN2 trainer") parser.add_argument("--config", type=str, help="Path to the config") parser.add_argument("--local_rank", type=int, default=0, help="local rank for distributed training") args, _ = parser.parse_known_args() config = Config.load(args.config) config = config.overwrite(Config.read_from_cli()) n_gpu = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 config.training.distributed = n_gpu > 1 if config.training.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend="nccl", init_method="env://") synchronize() generator = Generator(config).to(device) discriminator = Discriminator(config).to(device) g_ema = Generator(config).to(device) g_ema.eval()