Example #1
    def __init__(self, path, resolution, **kwargs):
        self.path = path
        if not os.path.exists(f"{path}/{resolution}"):
            misc.error(
                f"Dataset folder {path}/{resolution} doesn't exist. Follow the data preparation instructions using the prepare_data.py script."
            )

        self.img_files = sorted(glob.glob(f"{path}/{resolution}/*.png"))
        # misc.log(f"Found {len(self.img_files)} images in the dataset.")

        name = os.path.splitext(os.path.basename(self.path))[0]

        shape = [len(self.img_files)] + list(self._load_image(0).shape)
        if resolution is not None and (shape[2] != resolution
                                       or shape[3] != resolution):
            misc.error("Image files do not match the specified resolution")

        super().__init__(name=name, shape=shape, **kwargs)
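The constructor relies on a _load_image(idx) helper that this excerpt doesn't show. Since it checks shape[2] and shape[3] against the resolution, the helper presumably returns a CHW array; a minimal sketch under that assumption (the PIL-based loading is a guess, not necessarily the repo's implementation):

import numpy as np
from PIL import Image

def _load_image(self, idx):
    # Read one PNG and return a CHW uint8 array, so that
    # [N] + list(img.shape) yields the [N, C, H, W] shape checked above.
    img = np.asarray(Image.open(self.img_files[idx]))
    if img.ndim == 2:  # grayscale: add a channel axis
        img = img[:, :, np.newaxis]
    return img.transpose(2, 0, 1)  # HWC -> CHW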
Example #2
def run_cmdline(argv):
    parser = argparse.ArgumentParser(prog = argv[0], description = "Download and prepare data for the GANformer.")
    parser.add_argument("--data-dir",       help = "Directory of created dataset", default = "datasets", type = str)
    parser.add_argument("--max-images",     help = "Maximum number of images to have in the dataset (optional).", default = None, type = int)
    # Default tasks
    parser.add_argument("--clevr",          help = "Prepare the CLEVR dataset (6.41GB download, 100k images)", dest = "tasks", action = "append_const", const = "clevr")
    parser.add_argument("--bedrooms",       help = "Prepare the LSUN-bedrooms dataset (42.8GB download, 3M images)", dest = "tasks", action = "append_const", const = "bedrooms")
    parser.add_argument("--ffhq",           help = "Prepare the FFHQ dataset (13GB download, 70k images)", dest = "tasks", action = "append_const", const = "ffhq")
    parser.add_argument("--cityscapes",     help = "Prepare the cityscapes dataset (1.8GB download, 25k images)", dest = "tasks", action = "append_const", const = "cityscapes")
    # Create a new task with custom images
    parser.add_argument("--task",           help = "New dataset name", type = str, dest = "tasks", action = "append")
    parser.add_argument("--images-dir",     help = "Provide source image directory/file to convert into png-directory dataset (saves varied image resolutions)", default = None, type = str)
    parser.add_argument("--format",         help = "Images format", default = None, choices = ["png", "jpg", "npy", "hdf5", "tfds", "lmdb", "tfrecords"], type = str)
    parser.add_argument("--ratio",          help = "Images height/width", default = 1.0, type = float)

    args = parser.parse_args()
    if not args.tasks:
        misc.error("No tasks specified. Please see '-h' for help.")
    if args.max_images is not None and args.max_images < 50000:
        misc.log(f"Warning: max-images is set to {args.max_images}. We recommend setting it at least to 50,000 to allow statistically correct computation of the FID-50k metric.", "red")

    prepare(**vars(args))
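Each dataset flag above appends its constant to a shared "tasks" list via argparse's append_const action; a tiny self-contained demo of that pattern:

import argparse

demo = argparse.ArgumentParser()
demo.add_argument("--clevr", dest = "tasks", action = "append_const", const = "clevr")
demo.add_argument("--ffhq",  dest = "tasks", action = "append_const", const = "ffhq")

args = demo.parse_args(["--clevr", "--ffhq"])
assert args.tasks == ["clevr", "ffhq"]  # each flag appends its const; prepare(**vars(args)) then iterates over them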
Example #3
def setup_savefile(args, run_name, run_dir, config):
    snapshot, kimg, resume = None, 0, False
    pkls = sorted(glob.glob(f"{run_dir}/network*.pkl"))
    # Load a particular snapshot if specified
    if args.pretrained_pkl is not None and args.pretrained_pkl != "None":
        # Soft links support
        if args.pretrained_pkl.startswith("gdrive"):
            if args.pretrained_pkl not in loader.pretrained_networks:
                misc.error(
                    f"--pretrained_pkl {args.pretrained_pkl} not available in the catalog (see loader.pretrained_networks dict)"
                )

            snapshot = args.pretrained_pkl
        else:
            snapshot = glob.glob(args.pretrained_pkl)[0]
            if os.path.islink(snapshot):
                snapshot = os.readlink(snapshot)

        # Extract training step from the snapshot if specified
        try:
            kimg = int(snapshot.split("-")[-1].split(".")[0])
        except ValueError:
            pass

    # Find latest snapshot in the directory
    elif len(pkls) > 0:
        snapshot = pkls[-1]
        kimg = int(snapshot.split("-")[-1].split(".")[0])
        resume = True

    if snapshot:
        misc.log(f"Resuming {run_name}, from {snapshot}, kimg {kimg}", "white")
        config.resume_pkl = snapshot
        config.resume_kimg = kimg
    else:
        misc.log("Start model training from scratch", "white")
Example #4
def run(**args):
    args = EasyDict(args)
    train = EasyDict(run_func_name="training.training_loop.training_loop")  # training loop options
    sched = EasyDict()  # TrainingSchedule options
    vis = EasyDict()  # visualize.eval() options
    grid = EasyDict(size="1080p",
                    layout="random")  # setup_snapshot_img_grid() options
    sc = dnnlib.SubmitConfig()  # dnnlib.submit_run() options

    # If the flag is specified without arguments (--arg), set to True
    for arg in [
            "summarize", "keep_samples", "style", "fused_modconv",
            "local_noise"
    ]:
        if args[arg] is None:
            args[arg] = True

    if not args.train and not args.eval:
        misc.log(
            "Warning: Neither --train nor --eval are provided. Therefore, we only print network shapes",
            "red")

    if args.gansformer_default:
        task = args.dataset
        pretrained = "gdrive:{}-snapshot.pkl".format(task)
        if pretrained not in pretrained_networks.gdrive_urls:
            pretrained = None

        nset(args, "recompile", pretrained is not None)
        nset(args, "pretrained_pkl", pretrained)
        nset(args, "mirror_augment", task in ["cityscapes", "ffhq"])

        nset(args, "transformer", True)
        nset(args, "components_num", {"clevr": 8}.get(task, 16))
        nset(args, "latent_size", {"clevr": 128}.get(task, 512))

        nset(args, "normalize", "layer")
        nset(args, "integration", "mul")
        nset(args, "kmeans", True)
        nset(args, "use_pos", True)
        nset(args, "mapping_ltnt2ltnt", task != "clevr")
        nset(args, "style", task != "clevr")

        nset(args, "g_arch", "resnet")
        nset(args, "mapping_resnet", True)

        gammas = {"ffhq": 10, "cities": 20, "clevr": 40, "bedrooms": 100}
        nset(args, "gamma", gammas.get(task, 10))

    if args.baseline == "GAN":
        nset(args, "style", False)
        nset(args, "latent_stem", True)

    if args.baseline == "SAGAN":
        nset(args, "style", False)
        nset(args, "latent_stem", True)
        nset(args, "g_img2img", 5)

    if args.baseline == "kGAN":
        nset(args, "kgan", True)
        nset(args, "merge_layer", 5)
        nset(args, "merge_type", "softmax")
        nset(args, "components_num", 8)

    # Environment configuration
    tf_config = {
        "rnd.np_random_seed": 1000,
        "allow_soft_placement": True,
        "gpu_options.per_process_gpu_memory_fraction": 1.0
    }
    if args.gpus != "":
        num_gpus = len(args.gpus.split(","))
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
    assert num_gpus in [1, 2, 4, 8]
    sc.num_gpus = num_gpus

    # Networks configuration
    cG = set_net("G", reg_interval=4)
    cD = set_net("D", reg_interval=16)

    # Dataset configuration
    # For bedrooms, we choose the most common ratio in the
    # dataset and crop the other images into that ratio.
    ratios = {
        "clevr": 0.75,
        "bedrooms": 188 / 256,
        "cityscapes": 0.5,
        "ffhq": 1.0
    }
    args.ratio = ratios.get(args.dataset, args.ratio)
    dataset_args = EasyDict(tfrecord_dir=args.dataset,
                            max_imgs=args.train_images_num,
                            num_threads=args.num_threads)
    for arg in ["data_dir", "mirror_augment", "total_kimg", "ratio"]:
        cset(train, arg, args[arg])

    # Training and Optimizations configuration
    for arg in ["eval", "train", "recompile", "last_snapshots"]:
        cset(train, arg, args[arg])

    # Round down to the closest multiple of minibatch size for validity
    args.batch_size -= args.batch_size % args.minibatch_size
    args.minibatch_std_size -= args.minibatch_std_size % args.minibatch_size
    args.latent_size -= args.latent_size % args.components_num
    if args.latent_size == 0:
        misc.error(
            "--latent-size is too small. Must be a multiple of --components-num"
        )

    sched_args = {
        "G_lrate": "g_lr",
        "D_lrate": "d_lr",
        "minibatch_size": "batch_size",
        "minibatch_gpu": "minibatch_size"
    }
    for arg, cmd_arg in sched_args.items():
        cset(sched, arg, args[cmd_arg])
    cset(train, "clip", args.clip)

    # Logging and metrics configuration
    metrics = [metric_defaults[x] for x in args.metrics]

    cset(cG.args, "truncation_psi", args.truncation_psi)
    for arg in ["keep_samples", "num_heads"]:
        cset(vis, arg, args[arg])
    for arg in ["summarize", "eval_images_num"]:
        cset(train, arg, args[arg])

    # Visualization
    args.vis_imgs = args.vis_images
    args.vis_ltnts = args.vis_latents
    vis_types = [
        "imgs", "ltnts", "maps", "layer_maps", "interpolations", "noise_var",
        "style_mix"
    ]
    # Set of all the visualization types that were enabled
    vis.vis_types = {arg for arg in vis_types if args["vis_{}".format(arg)]}

    vis_args = {
        "attention": "transformer",
        "grid": "vis_grid",
        "num": "vis_num",
        "rich_num": "vis_rich_num",
        "section_size": "vis_section_size",
        "intrp_density": "interpolation_density",
        # "intrp_per_component": "interpolation_per_component",
        "alpha": "blending_alpha"
    }
    for arg, cmd_arg in vis_args.items():
        cset(vis, arg, args[cmd_arg])

    # Networks architecture
    cset(cG.args, "architecture", args.g_arch)
    cset(cD.args, "architecture", args.d_arch)
    cset(cG.args, "tanh", args.tanh)

    # Latent sizes
    if args.components_num > 1:
        if not (args.transformer or args.kgan):
            misc.error(
                "--components-num > 1 but the model is not using components. "
                +
                "Either add --transformer for GANsformer or --kgan for k-GAN.")

        args.latent_size = int(args.latent_size / args.components_num)
    cD.args.latent_size = cG.args.latent_size = cG.args.dlatent_size = args.latent_size
    cset([cG.args, cD.args, vis], "components_num", args.components_num)

    # Mapping network
    for arg in ["layersnum", "lrmul", "dim", "resnet", "shared_dim"]:
        field = "mapping_{}".format(arg)
        cset(cG.args, field, args[field])

    # StyleGAN settings
    for arg in ["style", "latent_stem", "fused_modconv", "local_noise"]:
        cset(cG.args, arg, args[arg])
    cD.args.mbstd_group_size = args.minibatch_std_size

    # GANsformer
    cset(cG.args, "transformer", args.transformer)
    cset(cD.args, "transformer", args.d_transformer)

    args.norm = args.normalize
    for arg in [
            "norm", "integration", "ltnt_gate", "img_gate", "iterative",
            "kmeans", "kmeans_iters", "mapping_ltnt2ltnt"
    ]:
        cset(cG.args, arg, args[arg])

    for arg in ["use_pos", "num_heads"]:
        cset([cG.args, cD.args], arg, args[arg])

    # Positional encoding
    for arg in ["dim", "init", "directions_num"]:
        field = "pos_{}".format(arg)
        cset([cG.args, cD.args], field, args[field])

    # k-GAN
    for arg in ["layer", "type", "same"]:
        field = "merge_{}".format(arg)
        cset(cG.args, field, args[field])
    cset([cG.args, train], "merge", args.kgan)

    if args.kgan and args.transformer:
        misc.error(
            "Either have --transformer for GANsformer or --kgan for k-GAN, not both"
        )

    # Attention
    for arg in ["start_res", "end_res", "ltnt2ltnt",
                "img2img"]:  # , "local_attention"
        cset(cG.args, arg, args["g_{}".format(arg)])
        cset(cD.args, arg, args["d_{}".format(arg)])
    cset(cG.args, "img2ltnt", args.g_img2ltnt)
    # cset(cD.args, "ltnt2img", args.d_ltnt2img)

    # Mixing and dropout
    for arg in [
            "style_mixing", "component_mixing", "component_dropout",
            "attention_dropout"
    ]:
        cset(cG.args, arg, args[arg])

    # Loss and regularization
    gloss_args = {
        "loss_type": "g_loss",
        "reg_weight": "g_reg_weight",
        # "pathreg": "pathreg",
    }
    dloss_args = {"loss_type": "d_loss", "reg_type": "d_reg", "gamma": "gamma"}
    for arg, cmd_arg in gloss_args.items():
        cset(cG.loss_args, arg, args[cmd_arg])
    for arg, cmd_arg in dloss_args.items():
        cset(cD.loss_args, arg, args[cmd_arg])

    ##### Experiments management:
    # Whenever we start a new experiment we store its results in a directory named 'args.expname-000'.
    # When we rerun a training or evaluation command it restores the model from that directory by default.
    # If we wish to restart the model training, we can set --restart and then we will store data in a new
    # directory: 'args.expname-001' after the first restart, then 'args.expname-002' after the second, etc.
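    # For example, three runs of a hypothetical experiment named 'exp':
    #   results/exp-000   first run
    #   results/exp-001   after one --restart
    #   results/exp-002   after a second --restart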

    # Find the latest directory that matches the experiment
    exp_dir = sorted(glob.glob("{}/{}-*".format(args.result_dir,
                                                args.expname)))
    run_id = 0
    if len(exp_dir) > 0:
        run_id = int(exp_dir[-1].split("-")[-1])
    # If restart, then work over a new directory
    if args.restart:
        run_id += 1

    run_name = "{}-{:03d}".format(args.expname, run_id)
    train.printname = "{} ".format(misc.bold(args.expname))

    snapshot, kimg, resume = None, 0, False
    pkls = sorted(
        glob.glob("{}/{}/network*.pkl".format(args.result_dir, run_name)))
    # Load a particular snapshot if specified
    if args.pretrained_pkl is not None and args.pretrained_pkl != "None":
        # Soft links support
        if args.pretrained_pkl.startswith("gdrive"):
            if args.pretrained_pkl not in pretrained_networks.gdrive_urls:
                misc.error(
                    "--pretrained_pkl {} not available in the catalog (see pretrained_networks.py)"
                    .format(args.pretrained_pkl))

            snapshot = args.pretrained_pkl
        else:
            snapshot = glob.glob(args.pretrained_pkl)[0]
            if os.path.islink(snapshot):
                snapshot = os.readlink(snapshot)

        # Extract training step from the snapshot if specified
        try:
            kimg = int(snapshot.split("-")[-1].split(".")[0])
        except ValueError:
            pass

    # Find latest snapshot in the directory
    elif len(pkls) > 0:
        snapshot = pkls[-1]
        kimg = int(snapshot.split("-")[-1].split(".")[0])
        resume = True

    if snapshot:
        misc.log(
            "Resuming {}, from {}, kimg {}".format(run_name, snapshot, kimg),
            "white")
        train.resume_pkl = snapshot
        train.resume_kimg = kimg
    else:
        misc.log("Start model training from scratch", "white")

    # Run environment configuration
    sc.run_dir_root = args.result_dir
    sc.run_desc = args.expname
    sc.run_id = run_id
    sc.run_name = run_name
    sc.submit_target = dnnlib.SubmitTarget.LOCAL
    sc.local.do_not_copy_source_files = True

    kwargs = EasyDict(train)
    kwargs.update(cG=cG, cD=cD)
    kwargs.update(dataset_args=dataset_args,
                  vis_args=vis,
                  sched_args=sched,
                  grid_args=grid,
                  metric_arg_list=metrics,
                  tf_config=tf_config)
    kwargs.submit_config = copy.deepcopy(sc)
    kwargs.resume = resume
    kwargs.load_config = args.reload

    dnnlib.submit_run(**kwargs)
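run() configures everything through two small helpers, nset and cset, whose definitions are not part of this excerpt. A sketch of their likely semantics, inferred from the call sites above (the repo's actual implementations may differ):

def nset(args, name, value):
    # Fill in a default: only set args[name] when the flag was not
    # passed on the command line (argparse defaults are None here).
    if args[name] is None:
        args[name] = value

def cset(targets, name, value):
    # Conditionally copy a setting into one or more config dicts,
    # skipping None so unspecified flags leave the targets untouched.
    if value is None:
        return
    if not isinstance(targets, list):
        targets = [targets]
    for target in targets:
        target[name] = value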
Example #5
def main():
    parser = argparse.ArgumentParser(description="Train the GANsformer")

    # Framework
    # ------------------------------------------------------------------------------------------------------
    parser.add_argument("--expname",
                        help="Experiment name",
                        default="exp",
                        type=str)
    parser.add_argument("--eval",
                        help="Evaluation mode (default: False)",
                        default=None,
                        action="store_true")
    parser.add_argument("--train",
                        help="Train mode (default: False)",
                        default=None,
                        action="store_true")
    parser.add_argument(
        "--gpus",
        help="Comma-separated list of GPUs to be used (default: %(default)s)",
        default="0",
        type=str)

    ## Default configurations
    parser.add_argument(
        "--gansformer-default",
        help=
        "Select a default GANsformer configuration, either pretrained (default) or from scratch (with --pretrained-pkl None)",
        default=None,
        action="store_true")
    parser.add_argument("--baseline",
                        help="Use a baseline model configuration",
                        default=None,
                        choices=["GAN", "StyleGAN2", "kGAN", "SAGAN"],
                        type=str)

    ## Resumption
    parser.add_argument("--pretrained-pkl",
                        help="Filename for a snapshot to resume (optional)",
                        default=None,
                        type=str)
    parser.add_argument("--restart",
                        help="Restart training from scratch",
                        default=False,
                        action="store_true")
    parser.add_argument(
        "--reload",
        help="Reload options from the original experiment configuration file. "
        +
        "If False, uses the command line arguments when resuming training (default: %(default)s)",
        default=False,
        action="store_true")
    parser.add_argument(
        "--recompile",
        help="Recompile model from source code when resuming training. " +
        "If False, loading modules created when the experiment first started",
        default=None,
        action="store_true")
    parser.add_argument(
        "--last-snapshots",
        help="Number of last snapshots to save. -1 for all (default: 10)",
        default=None,
        type=int)

    ## Dataset
    parser.add_argument("--data-dir",
                        help="Datasets root directory (default: %(default)s)",
                        default="datasets",
                        metavar="DIR")
    parser.add_argument(
        "--dataset",
        help="Training dataset name (subdirectory of data-dir)",
        required=True)
    parser.add_argument("--ratio",
                        help="Image height/width ratio in the dataset",
                        default=1.0,
                        type=float)
    parser.add_argument(
        "--num-threads",
        help="Number of input processing threads (default: %(default)s)",
        default=4,
        type=int)
    parser.add_argument(
        "--mirror-augment",
        help=
        "Perform horizontal flip augmentation for the data (default: %(default)s)",
        default=None,
        action="store_true")
    parser.add_argument(
        "--train-images-num",
        help=
        "Maximum number of images to train on. If not specified, train on the whole dataset.",
        default=None,
        type=int)

    ## Training
    parser.add_argument(
        "--batch-size",
        help="Global batch size (optimization step) (default: %(default)s)",
        default=32,
        type=int)
    parser.add_argument(
        "--minibatch-size",
        help=
        "Batch size per GPU, gradients will be accumulated to match batch-size (default: %(default)s)",
        default=4,
        type=int)
    parser.add_argument(
        "--total-kimg",
        help="Training length in thousands of images (default: %(default)s)",
        metavar="KIMG",
        default=25000,
        type=int)
    parser.add_argument("--gamma",
                        help="R1 regularization weight (default: %(default)s)",
                        default=10,
                        type=float)
    parser.add_argument("--clip",
                        help="Gradient clipping threshold (optional)",
                        default=None,
                        type=float)
    parser.add_argument("--g-lr",
                        help="Generator learning rate (default: %(default)s)",
                        default=0.002,
                        type=float)
    parser.add_argument(
        "--d-lr",
        help="Discriminator learning rate (default: %(default)s)",
        default=0.002,
        type=float)

    ## Logging and evaluation
    parser.add_argument(
        "--result-dir",
        help="Root directory for experiments (default: %(default)s)",
        default="results",
        metavar="DIR")
    parser.add_argument(
        "--metrics",
        help="Comma-separated list of metrics or none (default: %(default)s)",
        default="fid",
        type=_parse_comma_sep)
    parser.add_argument(
        "--summarize",
        help="Create TensorBoard summaries (default: %(default)s)",
        default=True,
        metavar="BOOL",
        type=_str_to_bool,
        nargs="?")
    parser.add_argument(
        "--truncation-psi",
        help="Truncation Psi to be used in producing sample images " +
        "(used only for visualizations, _not used_ in training or for computing metrics) (default: %(default)s)",
        default=0.65,
        type=float)
    parser.add_argument(
        "--keep-samples",
        help=
        "Keep all prior samples during training, or if False, just the most recent ones (default: %(default)s)",
        default=True,
        metavar="BOOL",
        type=_str_to_bool,
        nargs="?")
    parser.add_argument(
        "--eval-images-num",
        help="Number of images to evaluate metrics on (default: 50,000)",
        default=None,
        type=int)

    ## Visualization
    parser.add_argument("--vis-images",
                        help="Save image samples",
                        default=None,
                        action="store_true")
    parser.add_argument("--vis-latents",
                        help="Save latent vectors",
                        default=None,
                        action="store_true")
    parser.add_argument("--vis-maps",
                        help="Save attention maps (for GANsformer only)",
                        default=None,
                        action="store_true")
    parser.add_argument(
        "--vis-layer-maps",
        help="Save attention maps for all layers (for GANsformer only)",
        default=None,
        action="store_true")
    parser.add_argument("--vis-interpolations",
                        help="Create latent interpolations",
                        default=None,
                        action="store_true")
    parser.add_argument("--vis-noise-var",
                        help="Create noise variation visualization",
                        default=None,
                        action="store_true")
    parser.add_argument("--vis-style-mix",
                        help="Create style mixing visualization",
                        default=None,
                        action="store_true")

    parser.add_argument(
        "--vis-grid",
        help=
        "Whether to save the samples in one large grid files (default: True in training)",
        default=None,
        action="store_true")
    parser.add_argument(
        "--vis-num",
        help=
        "Number of images for which visualization will be created (default: grid-size/100 in train/eval)",
        default=None,
        type=int)
    parser.add_argument(
        "--vis-rich-num",
        help=
        "Number of samples for which richer visualizations will be created (default: 5)",
        default=None,
        type=int)
    parser.add_argument(
        "--vis-section-size",
        help=
        "Visualization section size to process at one (section-size <= vis-num) for memory footprint (default: 100)",
        default=None,
        type=int)
    parser.add_argument(
        "--blending-alpha",
        help=
        "Proportion for generated images and attention maps blends (default: 0.3)",
        default=None,
        type=float)
    parser.add_argument(
        "--interpolation-density",
        help=
        "Number of samples in between two end points of an interpolation (default: 8)",
        default=None,
        type=int)
    # parser.add_argument("--interpolation-per-component", help = "Whether to perform interpolation along particular latent components when true, or all of them at once otherwise (default: False)", default = None, action = "store_true")

    # Model
    # ------------------------------------------------------------------------------------------------------

    ## General architecture
    parser.add_argument("--g-arch",
                        help="Generator architecture type (default: skip)",
                        default=None,
                        choices=["orig", "skip", "resnet"],
                        type=str)
    parser.add_argument(
        "--d-arch",
        help="Discriminator architecture type (default: resnet)",
        default=None,
        choices=["orig", "skip", "resnet"],
        type=str)
    parser.add_argument("--tanh",
                        help="tanh on generator output (default: False)",
                        default=None,
                        action="store_true")

    # Mapping network
    parser.add_argument("--mapping-layersnum",
                        help="Number of mapping layers (default: 8)",
                        default=None,
                        type=int)
    parser.add_argument(
        "--mapping-lrmul",
        help="Mapping network learning rate multiplier (default: 0.01)",
        default=None,
        type=float)
    parser.add_argument("--mapping-dim",
                        help="Mapping layers dimension (default: latent_size)",
                        default=None,
                        type=int)
    parser.add_argument(
        "--mapping-resnet",
        help="Use resent connections in mapping layers (default: False)",
        default=None,
        action="store_true")
    parser.add_argument(
        "--mapping-shared-dim",
        help=
        "Perform one shared mapping to all latent components concatenated together using the set dimension (default: disabled)",
        default=None,
        type=int)

    # Loss
    # parser.add_argument("--pathreg",            help = "Use path regularization in generator training (default: False)", default = None, action = "store_true")
    parser.add_argument("--g-loss",
                        help="Generator loss type (default: %(default)s)",
                        default="logistic_ns",
                        choices=["logistic", "logistic_ns", "hinge", "wgan"],
                        type=str)
    parser.add_argument(
        "--g-reg-weight",
        help="Generator regularization weight (default: %(default)s)",
        default=1.0,
        type=float)

    parser.add_argument("--d-loss",
                        help="Discriminator loss type (default: %(default)s)",
                        default="logistic",
                        choices=["wgan", "logistic", "hinge"],
                        type=str)
    parser.add_argument(
        "--d-reg",
        help="Discriminator regularization type (default: %(default)s)",
        default="r1",
        choices=["non", "gp", "r1", "r2"],
        type=str)
    # --gamma effectively functions as discriminator regularization weight

    # Mixing and dropout
    parser.add_argument(
        "--style-mixing",
        help="Style mixing (layerwise) probability (default: %(default)s)",
        default=0.9,
        type=float)
    parser.add_argument(
        "--component-mixing",
        help="Component mixing (objectwise) probability (default: %(default)s)",
        default=0.0,
        type=float)
    parser.add_argument("--component-dropout",
                        help="Component dropout (default: %(default)s)",
                        default=0.0,
                        type=float)
    parser.add_argument("--attention-dropout",
                        help="Attention dropout (default: 0.12)",
                        default=None,
                        type=float)

    # StyleGAN additions
    parser.add_argument("--style",
                        help="Global style modulation (default: %(default)s)",
                        default=True,
                        metavar="BOOL",
                        type=_str_to_bool,
                        nargs="?")
    parser.add_argument(
        "--latent-stem",
        help="Input latent through the generator stem grid (default: False)",
        default=None,
        action="store_true")
    parser.add_argument(
        "--fused-modconv",
        help=
        "Fuse modulation and convolution operations (default: %(default)s)",
        default=True,
        metavar="BOOL",
        type=_str_to_bool,
        nargs="?")
    parser.add_argument(
        "--local-noise",
        help="Add stochastic local noise each layer (default: %(default)s)",
        default=True,
        metavar="BOOL",
        type=_str_to_bool,
        nargs="?")
    parser.add_argument(
        "--minibatch-std-size",
        help=
        "Add minibatch standard deviation layer in the discriminator, 0 to disable (default: %(default)s)",
        default=4,
        type=int)

    ## GANsformer
    parser.add_argument(
        "--transformer",
        help=
        "Add transformer layers to the generator: top-down latents-to-image (default: False)",
        default=None,
        action="store_true")
    parser.add_argument(
        "--latent-size",
        help=
        "Latent size, summing the dimension of all components (default: %(default)s)",
        default=512,
        type=int)
    parser.add_argument(
        "--components-num",
        help=
        "Components number. Each component has latent dimension of 'latent-size / components-num'. "
        +
        "1 for StyleGAN since it has one global latent vector (default: %(default)s)",
        default=1,
        type=int)
    parser.add_argument(
        "--num-heads",
        help="Number of attention heads (default: %(default)s)",
        default=1,
        type=int)
    parser.add_argument("--normalize",
                        help="Feature normalization type (optional)",
                        default=None,
                        choices=["batch", "instance", "layer"],
                        type=str)
    parser.add_argument(
        "--integration",
        help=
        "Feature integration type: additive, multiplicative or both (default: %(default)s)",
        default="add",
        choices=["add", "mul", "both"],
        type=str)

    # Generator attention layers
    # Transformer resolution layers
    parser.add_argument(
        "--g-start-res",
        help=
        "Transformer minimum generator resolution (logarithmic): first layer in which transformer will be applied (default: %(default)s)",
        default=0,
        type=int)
    parser.add_argument(
        "--g-end-res",
        help=
        "Transformer maximum generator resolution (logarithmic): last layer in which transformer will be applied (default: %(default)s)",
        default=8,
        type=int)

    # Discriminator attention layers
    parser.add_argument(
        "--d-transformer",
        help=
        "Add transformer layers to the discriminator (bottom-up image-to-latents) (default: False)",
        default=None,
        action="store_true")
    parser.add_argument(
        "--d-start-res",
        help=
        "Transformer minimum discriminator resolution (logarithmic): first layer in which transformer will be applied (default: %(default)s)",
        default=0,
        type=int)
    parser.add_argument(
        "--d-end-res",
        help=
        "Transformer maximum discriminator resolution (logarithmic): last layer in which transformer will be applied (default: %(default)s)",
        default=8,
        type=int)

    # Attention
    parser.add_argument(
        "--ltnt-gate",
        help=
        "Gate attention from latents, such that components may not send information "
        + "when gate value is low (default: False)",
        default=None,
        action="store_true")
    parser.add_argument(
        "--img-gate",
        help=
        "Gate attention for images, such that some image positions may not get updated "
        + "or receive information when gate value is low (default: False)",
        default=None,
        action="store_true")
    parser.add_argument(
        "--kmeans",
        help=
        "Track and update image-to-latents assignment centroids, used in the duplex attention (default: False)",
        default=None,
        action="store_true")
    parser.add_argument(
        "--kmeans-iters",
        help=
        "Number of K-means iterations per transformer layer. Note that centroids are carried from layer to layer (default: %(default)s)",
        default=1,
        type=int)  # -per-layer
    parser.add_argument(
        "--iterative",
        help=
        "Whether to carry over attention assignments across transformer layers of different resolutions (default: False)",
        default=None,
        action="store_true")

    # Attention directions
    # format is A2B: Elements _from_ B attend _to_ elements in A, and B elements get updated accordingly.
    # Note that it means that information propagates in the following direction: A -> B
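    # e.g. "img2ltnt": latents attend to image positions, so information
    # flows from the image up to the latents (bottom-up).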
    parser.add_argument(
        "--mapping-ltnt2ltnt",
        help=
        "Add self-attention over latents in the mapping network (default: False)",
        default=None,
        action="store_true")
    parser.add_argument(
        "--g-ltnt2ltnt",
        help=
        "Add self-attention over latents in the synthesis network (default: False)",
        default=None,
        action="store_true")
    parser.add_argument(
        "--g-img2img",
        help=
        "Add self-attention between images positions in that layer of the generator (SAGAN) (default: disabled)",
        default=0,
        type=int)
    parser.add_argument(
        "--g-img2ltnt",
        help=
        "Add image to latents attention (bottom-up) (default: %(default)s)",
        default=None,
        action="store_true")
    # g-ltnt2img: default information flow direction when using --transformer

    # parser.add_argument("--d-ltnt2img",       help = "Add latents to image attention (top-down) (default: %(default)s)", default = None, action = "store_true")
    parser.add_argument(
        "--d-ltnt2ltnt",
        help=
        "Add self-attention over latents in the discriminator (default: False)",
        default=None,
        action="store_true")
    parser.add_argument(
        "--d-img2img",
        help=
        "Add self-attention over images positions in that layer of the discriminator (SAGAN) (default: disabled)",
        default=0,
        type=int)
    # d-img2ltnt: default information flow direction when using --d-transformer

    # Local attention operations (replacing convolution)
    # parser.add_argument("--g-local-attention",  help = "Local attention operations in the generation up to this layer (default: disabled)", default = None, type = int)
    # parser.add_argument("--d-local-attention",  help = "Local attention operations in the discriminator up to this layer (default: disabled)", default = None, type = int)

    # Positional encoding
    parser.add_argument("--use-pos",
                        help="Use positional encoding (default: False)",
                        default=None,
                        action="store_true")
    parser.add_argument(
        "--pos-dim",
        help="Positional encoding dimension (default: latent-size)",
        default=None,
        type=int)
    parser.add_argument(
        "--pos-type",
        help="Positional encoding type (default: %(default)s)",
        default="sinus",
        choices=["linear", "sinus", "trainable", "trainable2d"],
        type=str)
    parser.add_argument(
        "--pos-init",
        help=
        "Positional encoding initialization distribution (default: %(default)s)",
        default="uniform",
        choices=["uniform", "normal"],
        type=str)
    parser.add_argument(
        "--pos-directions-num",
        help=
        "Positional encoding number of spatial directions (default: %(default)s)",
        default=2,
        type=int)

    ## k-GAN
    parser.add_argument(
        "--kgan",
        help=
        "Generate components-num images and then merge them (k-GAN) (default: False)",
        default=None,
        action="store_true")
    parser.add_argument(
        "--merge-layer",
        help=
        "Merge layer, where images get combined through alpha-composition (default: %(default)s)",
        default=-1,
        type=int)
    parser.add_argument("--merge-type",
                        help="Merge type (default: sum)",
                        default=None,
                        choices=["sum", "softmax", "max", "leaves"],
                        type=str)
    parser.add_argument(
        "--merge-same",
        help=
        "Merge images with same alpha weights across all spatial positions (default: %(default)s)",
        default=None,
        action="store_true")

    args = parser.parse_args()

    if not os.path.exists(args.data_dir):
        misc.error("Dataset root directory does not exist")

    if not os.path.exists("{}/{}".format(args.data_dir, args.dataset)):
        misc.error("The dataset {}/{} directory does not exist".format(
            args.data_dir, args.dataset))

    for metric in args.metrics:
        if metric not in metric_defaults:
            misc.error("Unknown metric: {}".format(metric))

    run(**vars(args))
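The parser relies on two small helpers, _str_to_bool and _parse_comma_sep, that this excerpt doesn't define. Plausible sketches consistent with how they are used above (the accepted spellings are assumptions):

import argparse

def _str_to_bool(v):
    # Parse BOOL-valued flags such as "--style False".
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "1"):
        return True
    if v.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected")

def _parse_comma_sep(s):
    # Split a comma-separated flag such as "--metrics fid" into a list;
    # "none" or an empty string yields an empty list.
    if s is None or s == "" or s.lower() == "none":
        return []
    return s.split(",")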
Example #6
def run_cmdline(argv):
    parser = argparse.ArgumentParser(
        prog=argv[0],
        description="Download and prepare data for the GANsformer.")
    parser.add_argument("--data-dir",
                        help="Directory of created dataset",
                        default="datasets",
                        type=str)
    parser.add_argument(
        "--shards-num",
        help="Number of shards to split each dataset to (optional)",
        default=1,
        type=int)
    parser.add_argument(
        "--max-images",
        help=
        "Maximum number of images to have in the dataset (optional). Use to reduce the produced tfrecords file size",
        default=None,
        type=int)
    # Default tasks
    parser.add_argument(
        "--clevr",
        help=
        "Prepare the CLEVR dataset (18GB download, up to 15.5GB tfrecords, 100k images)",
        dest="tasks",
        action="append_const",
        const="clevr")
    parser.add_argument(
        "--bedrooms",
        help=
        "Prepare the LSUN-bedrooms dataset (42.8GB, up to 480GB tfrecords, 3M images)",
        dest="tasks",
        action="append_const",
        const="bedrooms")
    parser.add_argument(
        "--ffhq",
        help=
        "Prepare the FFHQ dataset (13GB download, 13GB tfrecords, 70k images)",
        dest="tasks",
        action="append_const",
        const="ffhq")
    parser.add_argument(
        "--cityscapes",
        help=
        "Prepare the cityscapes dataset (1.8GB, 8GB tfrecords, 25k images)",
        dest="tasks",
        action="append_const",
        const="cityscapes")
    # Create a new task with custom images
    parser.add_argument("--task",
                        help="New dataset name",
                        type=str,
                        dest="tasks",
                        action="append")
    parser.add_argument(
        "--images-dir",
        help=
        "Provide source image directory to convert into tfrecords (will be searched recursively)",
        default=None,
        type=str)
    parser.add_argument("--format",
                        help="Images format",
                        default=None,
                        choices=["png", "jpg", "npy", "hdf5", "tfds", "lmdb"],
                        type=str)
    parser.add_argument("--ratio",
                        help="Images height/width",
                        default=1.0,
                        type=float)

    args = parser.parse_args()
    if not args.tasks:
        misc.error("No tasks specified. Please see '-h' for help.")

    if args.max_images is not None and args.max_images < 50000:
        misc.log(
            "Warning: max-images is set to {}. We recommend setting it at least to 50,000 to allow statistically correct computation of the FID-50k metric."
            .format(args.max_images), "red")

    prepare(**vars(args))
Example #7
def setup_config(run_dir, **args):
    args = EasyDict(args)  # command-line options
    train = EasyDict(run_dir=run_dir)  # training loop options
    vis = EasyDict(run_dir=run_dir)  # visualization loop options

    if args.reload:
        config_fn = os.path.join(run_dir, "training_options.json")
        if os.path.exists(config_fn):
            # Load config from the experiment's existing file (and so ignore command-line arguments)
            with open(config_fn, "rt") as f:
                config = json.load(f)
            return config
        misc.log(
            f"Warning: --reload is set for a new experiment {args.expname}," +
            f" but configuration file to reload from {config_fn} doesn't exist.",
            "red")

    # GANformer and baselines default settings
    # ----------------------------------------------------------------------------

    if args.ganformer_default:
        task = args.dataset
        nset(args, "mirror_augment", task in ["cityscapes", "ffhq"])

        nset(args, "transformer", True)
        nset(args, "components_num", {"clevr": 8}.get(task, 16))
        nset(args, "latent_size", {"clevr": 128}.get(task, 512))

        nset(args, "normalize", "layer")
        nset(args, "integration", "mul")
        nset(args, "kmeans", True)
        nset(args, "use_pos", True)
        nset(args, "mapping_ltnt2ltnt", task != "clevr")
        nset(args, "style", task != "clevr")

        nset(args, "g_arch", "resnet")
        nset(args, "mapping_resnet", True)

        gammas = {"ffhq": 10, "cityscapes": 20, "clevr": 40, "bedrooms": 100}
        nset(args, "gamma", gammas.get(task, 10))

    if args.baseline == "GAN":
        nset(args, "style", False)
        nset(args, "latent_stem", True)

    ## k-GAN and SAGAN are not currently supported in the PyTorch version.
    ## See the TF version for implementation of these baselines!
    # if args.baseline == "SAGAN":
    #     nset(args, "style", False)
    #     nset(args, "latent_stem", True)
    #     nset(args, "g_img2img", 5)

    # if args.baseline == "kGAN":
    #     nset(args, "kgan", True)
    #     nset(args, "merge_layer", 5)
    #     nset(args, "merge_type", "softmax")
    #     nset(args, "components_num", 8)

    # General setup
    # ----------------------------------------------------------------------------

    # If the flag is specified without arguments (--arg), set to True
    for arg in [
            "cuda_bench", "allow_tf32", "keep_samples", "style", "local_noise"
    ]:
        if args[arg] is None:
            args[arg] = True

    if not any([args.train, args.eval, args.vis]):
        misc.log(
            "Warning: None of --train, --eval or --vis are provided. Therefore, we only print network shapes",
            "red")
    for arg in ["train", "eval", "vis", "last_snapshots"]:
        cset(train, arg, args[arg])

    if args.gpus != "":
        num_gpus = len(args.gpus.split(","))
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
    if not (num_gpus >= 1 and num_gpus & (num_gpus - 1) == 0):
        misc.error("Number of GPUs must be a power of two")
    args.num_gpus = num_gpus

    # CUDA settings
    for arg in ["batch_size", "batch_gpu", "allow_tf32"]:
        cset(train, arg, args[arg])
    cset(train, "cudnn_benchmark", args.cuda_bench)

    # Data setup
    # ----------------------------------------------------------------------------

    # For bedrooms, we choose the most common ratio in the
    # dataset and crop the other images into that ratio.
    ratios = {
        "clevr": 0.75,
        "bedrooms": 188 / 256,
        "cityscapes": 0.5,
        "ffhq": 1.0
    }
    args.ratio = args.ratio or ratios.get(args.dataset, 1.0)
    args.crop_ratio = 0.5 if args.resolution > 256 and args.ratio < 0.5 else None

    args.printname = args.expname
    for arg in ["total_kimg", "printname"]:
        cset(train, arg, args[arg])

    dataset_args = EasyDict(class_name="training.dataset.ImageFolderDataset",
                            path=f"{args.data_dir}/{args.dataset}",
                            max_items=args.train_images_num,
                            resolution=args.resolution,
                            ratio=args.ratio,
                            mirror_augment=args.mirror_augment)
    dataset_args.loader_args = EasyDict(num_workers=args.num_threads,
                                        pin_memory=True,
                                        prefetch_factor=2)

    # Optimization setup
    # ----------------------------------------------------------------------------

    cG = set_net("Generator", ["mapping", "synthesis"], args.g_lr, 4)
    cD = set_net("Discriminator", ["mapping", "block", "epilogue"], args.d_lr,
                 16)
    cset([cG, cD], "crop_ratio", args.crop_ratio)

    # Other hyperparams behave more predictably if the mbstd group size remains fixed
    mbstd = min(args.batch_gpu, 4)
    cset(cD.epilogue_kwargs, "mbstd_group_size", mbstd)

    # Automatic tuning
    if args.autotune:
        batch_size = max(
            min(args.num_gpus * min(4096 // args.resolution, 32), 64),
            args.num_gpus)  # keep gpu memory consumption at bay
        batch_gpu = args.batch_size // args.num_gpus
        nset(args, "batch_size", batch_size)
        nset(args, "batch_gpu", batch_gpu)

        fmap_decay = 1 if args.resolution >= 512 else 0.5  # use wider feature maps at higher resolutions
        lr = 0.002 if args.resolution >= 1024 else 0.0025
        gamma = 0.0002 * (args.resolution ** 2) / args.batch_size  # heuristic formula
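        # e.g. at resolution 256 with batch_size 32: 0.0002 * 256**2 / 32 = 0.4096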

        cset([cG.synthesis_kwargs, cD], "dim_base", int(fmap_decay * 32768))
        nset(args, "g_lr", lr)
        cset(cG.opt_args, "lr", args.g_lr)
        nset(args, "d_lr", lr)
        cset(cD.opt_args, "lr", args.d_lr)
        nset(args, "gamma", gamma)

        train.ema_rampup = 0.05
        train.ema_kimg = batch_size * 10 / 32

    if args.batch_size % (args.batch_gpu * args.num_gpus) != 0:
        misc.error(
            "--batch-size must be divisible by --batch-gpu * 'num_gpus'")

    # Loss and regularization settings
    loss_args = EasyDict(class_name="training.loss.StyleGAN2Loss",
                         g_loss=args.g_loss,
                         d_loss=args.d_loss,
                         r1_gamma=args.gamma,
                         pl_weight=args.pl_weight)

    # if args.fp16:
    #     cset([cG.synthesis_kwargs, cD], "num_fp16_layers", 4) # enable mixed-precision training
    #     cset([cG.synthesis_kwargs, cD], "conv_clamp", 256) # clamp activations to avoid float16 overflow

    # cset([cG.synthesis_kwargs, cD.block_args], "fp16_channels_last", args.nhwc)

    # Evaluation and visualization
    # ----------------------------------------------------------------------------

    from metrics import metric_main
    for metric in args.metrics:
        if not metric_main.is_valid_metric(metric):
            misc.error(
                f"Unknown metric: {metric}. The valid metrics are: {metric_main.list_valid_metrics()}"
            )

    for arg in ["num_gpus", "metrics", "eval_images_num", "truncation_psi"]:
        cset(train, arg, args[arg])
    for arg in ["keep_samples", "num_heads"]:
        cset(vis, arg, args[arg])

    args.vis_imgs = args.vis_images
    args.vis_ltnts = args.vis_latents
    vis_types = [
        "imgs", "ltnts", "maps", "layer_maps", "interpolations", "noise_var",
        "style_mix"
    ]
    # Set of all the visualization types that were enabled
    vis.vis_types = list({arg for arg in vis_types if args[f"vis_{arg}"]})

    vis_args = {
        "attention": "transformer",
        "grid": "vis_grid",
        "num": "vis_num",
        "rich_num": "vis_rich_num",
        "section_size": "vis_section_size",
        "intrp_density": "interpolation_density",
        # "intrp_per_component": "interpolation_per_component",
        "alpha": "blending_alpha"
    }
    for arg, cmd_arg in vis_args.items():
        cset(vis, arg, args[cmd_arg])

    # Networks setup
    # ----------------------------------------------------------------------------

    # Networks architecture
    cset(cG.synthesis_kwargs, "architecture", args.g_arch)
    cset(cD, "architecture", args.d_arch)

    # Latent sizes
    if args.components_num > 0:
        if not args.transformer:  # or args.kgan):
            misc.error(
                "--components-num > 0 but the model is not using components. "
                +
                "Add --transformer for GANformer (which uses latent components)."
            )
        if args.latent_size % args.components_num != 0:
            misc.error(
                f"--latent-size ({args.latent_size}) should be divisible by --components-num ({args.components_num})"
            )
        args.latent_size = int(args.latent_size / args.components_num)

    cG.z_dim = cG.w_dim = args.latent_size
    cset([cG, vis], "k", args.components_num +
         1)  # We add a component to modulate features globally

    # Mapping network
    args.mapping_layer_dim = args.mapping_dim
    for arg in ["num_layers", "layer_dim", "resnet", "shared", "ltnt2ltnt"]:
        field = f"mapping_{arg}"
        cset(cG.mapping_kwargs, arg, args[field])

    # StyleGAN settings
    for arg in ["style", "latent_stem", "local_noise"]:
        cset(cG.synthesis_kwargs, arg, args[arg])

    # GANformer
    cset([cG.synthesis_kwargs, cG.mapping_kwargs], "transformer",
         args.transformer)

    # Attention related settings
    for arg in ["use_pos", "num_heads", "ltnt_gate", "attention_dropout"]:
        cset([cG.mapping_kwargs, cG.synthesis_kwargs], arg, args[arg])

    # Attention types and layers
    for arg in ["start_res", "end_res"
                ]:  # , "local_attention" , "ltnt2ltnt", "img2img", "img2ltnt"
        cset(cG.synthesis_kwargs, arg, args[f"g_{arg}"])

    # Mixing and dropout
    for arg in ["style_mixing", "component_mixing"]:
        cset(loss_args, arg, args[arg])
    cset(cG, "component_dropout", args["component_dropout"])

    # Extra transformer options
    args.norm = args.normalize
    for arg in [
            "norm", "integration", "img_gate", "iterative", "kmeans",
            "kmeans_iters"
    ]:
        cset(cG.synthesis_kwargs, arg, args[arg])

    # Positional encoding
    # args.pos_dim = args.pos_dim or args.latent_size
    for arg in ["dim", "type", "init", "directions_num"]:
        field = f"pos_{arg}"
        cset(cG.synthesis_kwargs, field, args[field])

    # k-GAN
    # for arg in ["layer", "type", "same"]:
    #     field = "merge_{}".format(arg)
    #     cset(cG.args, field, args[field])
    # cset(cG.synthesis_kwargs, "merge", args.kgan)
    # if args.kgan and args.transformer:
    # misc.error("Either have --transformer for GANformer or --kgan for k-GAN, not both")

    config = EasyDict(train)
    config.update(cG=cG,
                  cD=cD,
                  loss_args=loss_args,
                  dataset_args=dataset_args,
                  vis_args=vis)

    # Save config file
    with open(os.path.join(run_dir, "training_options.json"), "wt") as f:
        json.dump(config, f, indent=2)

    return config
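setup_config writes the full configuration to training_options.json, which the --reload branch at the top of the function reads back on later runs. A minimal round-trip sketch (the run_dir path is hypothetical):

import json, os

run_dir = "results/exp-000"  # hypothetical experiment directory
with open(os.path.join(run_dir, "training_options.json"), "rt") as f:
    config = json.load(f)  # the same dict that setup_config returned and saved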