Example #1
def train_manifold_flow_sequential(args, dataset, model, simulator):
    """ MFMF-A training """

    assert not args.specified

    trainer = (ManifoldFlowTrainer(model) if simulator.parameter_dim() is None
               else ConditionalManifoldFlowTrainer(model))

    common_kwargs = {
        "dataset": dataset,
        "batch_size": args.batchsize,
        "initial_lr": args.lr,
        "scheduler": optim.lr_scheduler.CosineAnnealingLR,
        "clip_gradient": args.clip,
        "validation_split": args.validationsplit,
    }
    if args.weightdecay is not None:
        common_kwargs["optimizer_kwargs"] = {
            "weight_decay": float(args.weightdecay)
        }

    logger.info("Starting training MF, phase 1: manifold training")
    learning_curves = trainer.train(
        loss_functions=[losses.mse],
        loss_labels=["MSE"],
        loss_weights=[args.msefactor],
        epochs=args.epochs // 2,
        parameters=(list(model.outer_transform.parameters()) +
                    list(model.encoder.parameters())
                    if args.algorithm == "emf"
                    else model.outer_transform.parameters()),
        callbacks=[
            callbacks.save_model_after_every_epoch(
                create_filename("checkpoint", None, args)[:-3] +
                "_epoch_A{}.pt")
        ],
        forward_kwargs={"mode": "projection"},
        **common_kwargs,
    )
    learning_curves = np.vstack(learning_curves).T

    logger.info("Starting training MF, phase 2: density training")
    learning_curves_ = trainer.train(
        loss_functions=[losses.nll],
        loss_labels=["NLL"],
        loss_weights=[args.nllfactor],
        epochs=args.epochs - (args.epochs // 2),
        parameters=model.inner_transform.parameters(),
        callbacks=[
            callbacks.save_model_after_every_epoch(
                create_filename("checkpoint", None, args)[:-3] +
                "_epoch_B{}.pt")
        ],
        forward_kwargs={"mode": "mf-fixed-manifold"},
        **common_kwargs,
    )
    learning_curves_ = np.vstack(learning_curves_).T
    learning_curves = learning_curves_ if learning_curves is None else np.vstack(
        (learning_curves, learning_curves_))

    return learning_curves
Example #2
def train_pie(args, dataset, model, simulator):
    """ PIE training """
    trainer = (ForwardTrainer(model) if simulator.parameter_dim() is None
               else ConditionalForwardTrainer(model) if args.scandal is None
               else SCANDALForwardTrainer(model))
    common_kwargs, scandal_loss, scandal_label, scandal_weight = make_training_kwargs(
        args, dataset)
    callbacks_ = [
        callbacks.save_model_after_every_epoch(
            create_filename("checkpoint", None, args))
    ]
    if simulator.is_image():
        callbacks_.append(
            callbacks.plot_sample_images(
                create_filename("training_plot", None, args),
                context=None if simulator.parameter_dim() is None else
                torch.zeros(30, simulator.parameter_dim())))
        callbacks_.append(
            callbacks.plot_reco_images(
                create_filename("training_plot", "reco_epoch", args)))

    logger.info("Starting training PIE on NLL")
    learning_curves = trainer.train(
        loss_functions=[losses.nll] + scandal_loss,
        loss_labels=["NLL"] + scandal_label,
        loss_weights=[args.nllfactor * nat_to_bit_per_dim(args.datadim)] +
        scandal_weight,
        epochs=args.epochs,
        callbacks=callbacks_,
        forward_kwargs={"mode": "pie"},
        initial_epoch=args.startepoch,
        **common_kwargs,
    )
    learning_curves = np.vstack(learning_curves).T
    return learning_curves
Example #3
def main():
    # NOTE: All the following information will come from the UI once it's created
    template_fname = (
        r'C:/Users/erins/OneDrive - University of North Carolina at Chapel Hill/Protocols and SOPs/'
        r'MARG-PTC-001 Primary Cell Spinoculation and Latency/MARG-PTC-001a-v1_1-Lewin Template - Copy.xlsx'
    )

    out_dir = r'C:/Users/erins/OneDrive - University of North Carolina at Chapel Hill/Protocols and SOPs/' \
              r'MARG-PTC-001 Primary Cell Spinoculation and Latency/'

    template_name: str = 'test'
    template_desc: str = 'test'

    ########
    outfile: str = create_filename(out_dir,
                                   'formulas and names',
                                   extension='xlsx')
    template_data: dict = process_template_file(template_fname, outfile)

    for name, items in template_data.items():
        output_formulas_to_excel(outfile, name, items)

    filename: str = create_filename(out_dir, template_desc)
    # NOTE: Before the document is created, will need to update the formulas with:
    #   - Manual formula variable names
    #   - LaTeX for manual formula
    #   - How to account for potential multiple results?
    #       - These are formulas with Ifs, MIN, MAX
    #   - Dealing with Table formulas
    document: Document = create_document(template_data, template_name,
                                         template_desc)

    document.save(filename)
Example #4
def evaluate_model_samples(args, simulator, x_gen):
    """ Evaluate model samples and save results """

    logger.info("Calculating likelihood of generated samples")

    try:
        if simulator.parameter_dim() is None:
            log_likelihood_gen = simulator.log_density(x_gen)
        else:
            params = simulator.default_parameters(true_param_id=args.trueparam)
            params = np.asarray([params for _ in range(args.generate)])
            log_likelihood_gen = simulator.log_density(x_gen,
                                                       parameters=params)
        log_likelihood_gen[np.isnan(log_likelihood_gen)] = -1.0e-12
        np.save(create_filename("results", "samples_likelihood", args),
                log_likelihood_gen)
    except IntractableLikelihoodError:
        logger.info("True simulator likelihood is intractable for dataset %s",
                    args.dataset)

    # Distance from manifold
    try:
        logger.info("Calculating distance from manifold of generated samples")
        distances_gen = simulator.distance_from_manifold(x_gen)
        np.save(create_filename("results", "samples_manifold_distance", args),
                distances_gen)
    except NotImplementedError:
        logger.info("Cannot calculate distance from manifold for dataset %s",
                    args.dataset)
Example #5
def evaluate_model_samples(args, simulator, x_gen):
    """ Evaluate model samples and save results """

    logger.info("Calculating likelihood of generated samples")
    try:
        if simulator.parameter_dim() is None:
            log_likelihood_gen = simulator.log_density(x_gen)
        else:
            params = simulator.default_parameters(true_param_id=args.trueparam)
            params = np.asarray([params for _ in range(args.generate)])
            log_likelihood_gen = simulator.log_density(x_gen,
                                                       parameters=params)
        log_likelihood_gen[np.isnan(log_likelihood_gen)] = -1.0e-12
        np.save(create_filename("results", "samples_likelihood", args),
                log_likelihood_gen)
    except IntractableLikelihoodError:
        logger.info("True simulator likelihood is intractable for dataset %s",
                    args.dataset)

    logger.info("Calculating distance from manifold of generated samples")
    try:
        distances_gen = simulator.distance_from_manifold(x_gen)
        np.save(create_filename("results", "samples_manifold_distance", args),
                distances_gen)
    except NotImplementedError:
        logger.info("Cannot calculate distance from manifold for dataset %s",
                    args.dataset)

    if simulator.is_image():
        if calculate_fid_given_paths is None:
            logger.warning(
                "Cannot compute FID score, did not find FID implementation")
            return

        logger.info("Calculating FID score of generated samples")
        # The FID script needs an image folder
        with tempfile.TemporaryDirectory() as gen_dir:
            logger.debug(
                f"Storing generated images in temporary folder {gen_dir}")
            array_to_image_folder(x_gen, gen_dir)

            true_dir = create_filename("dataset", None, args) + "/test"
            os.makedirs(os.path.dirname(true_dir), exist_ok=True)
            if not os.path.exists(f"{true_dir}/0.jpg"):
                array_to_image_folder(
                    simulator.load_dataset(train=False,
                                           numpy=True,
                                           dataset_dir=create_filename(
                                               "dataset", None, args),
                                           true_param_id=args.trueparam)[0],
                    true_dir)

            logger.debug("Beginning FID calculation with batchsize 50")
            fid = calculate_fid_given_paths([gen_dir, true_dir], 50, "", 2048)
            logger.info(f"FID = {fid}")

            np.save(create_filename("results", "samples_fid", args), [fid])
Example #6
def train_specified_manifold_flow(args, dataset, model, simulator):
    """ FOM training """

    trainer = (ForwardTrainer(model) if simulator.parameter_dim() is None
               else ConditionalForwardTrainer(model) if args.scandal is None
               else SCANDALForwardTrainer(model))
    common_kwargs, scandal_loss, scandal_label, scandal_weight = make_training_kwargs(
        args, dataset)

    logger.info("Starting training MF with specified manifold on NLL")
    learning_curves = trainer.train(
        loss_functions=[losses.mse, losses.nll] + scandal_loss,
        loss_labels=["MSE", "NLL"] + scandal_label,
        loss_weights=[
            0.0, args.nllfactor * nat_to_bit_per_dim(args.modellatentdim)
        ] + scandal_weight,
        epochs=args.epochs,
        callbacks=[
            callbacks.save_model_after_every_epoch(
                create_filename("checkpoint", None, args))
        ],
        forward_kwargs={"mode": "mf"},
        initial_epoch=args.startepoch,
        **common_kwargs,
    )
    learning_curves = np.vstack(learning_curves).T

    return learning_curves
Example #7
def train_generative_adversarial_manifold_flow(args, dataset, model,
                                               simulator):
    """ MFMF-OT training """

    gen_trainer = (AdversarialTrainer(model) if simulator.parameter_dim() is None
                   else ConditionalAdversarialTrainer(model))
    common_kwargs, scandal_loss, scandal_label, scandal_weight = make_training_kwargs(
        args, dataset)
    common_kwargs["batch_size"] = args.genbatchsize

    logger.info("Starting training GAMF: Sinkhorn-GAN")

    callbacks_ = [
        callbacks.save_model_after_every_epoch(
            create_filename("checkpoint", None, args))
    ]
    if args.debug:
        callbacks_.append(callbacks.print_mf_weight_statistics())

    learning_curves_ = gen_trainer.train(
        loss_functions=[losses.make_sinkhorn_divergence()],
        loss_labels=["GED"],
        loss_weights=[args.sinkhornfactor],
        epochs=args.epochs,
        callbacks=callbacks_,
        compute_loss_variance=True,
        initial_epoch=args.startepoch,
        **common_kwargs,
    )

    learning_curves = np.vstack(learning_curves_).T
    return learning_curves
Example #8
def run_mcmc(args, simulator, model=None):
    """ MCMC """

    logger.info(
        "Starting MCMC based on %s after %s observed samples, generating %s posterior samples with %s for parameter point number %s",
        "true simulator likelihood" if model is None else "neural likelihood estimate",
        args.observedsamples,
        args.mcmcsamples,
        "slice sampler" if args.slicesampler else "Metropolis-Hastings sampler (step = {})".format(args.mcmcstep),
        args.trueparam,
    )

    # Data
    true_parameters = simulator.default_parameters(true_param_id=args.trueparam)
    x_obs, _ = simulator.load_dataset(
        train=False, numpy=True, dataset_dir=create_filename("dataset", None, args), true_param_id=args.trueparam, joint_score=False, limit_samplesize=args.observedsamples
    )
    x_obs_ = torch.tensor(x_obs, dtype=torch.float)

    if model is None:
        # MCMC based on ground truth likelihood
        def log_posterior(params):
            log_prob = np.sum(simulator.log_density(x_obs, parameters=params))
            log_prob += simulator.evaluate_log_prior(params)
            return float(log_prob)

    else:
        # MCMC based on neural likelihood estimator
        def log_posterior(params):
            params_ = np.broadcast_to(params.reshape((-1, params.shape[-1])), (x_obs.shape[0], params.shape[-1]))
            params_ = torch.tensor(params_, dtype=torch.float)

            if args.algorithm == "flow":
                log_prob = np.sum(model.log_prob(x_obs_, context=params_).detach().numpy())
            elif args.algorithm in ["pie", "slice"]:
                log_prob = np.sum(model.log_prob(x_obs_, context=params_, mode=args.algorithm).detach().numpy())
            elif not args.conditionalouter:
                # Slow part of Jacobian drops out in LLR / MCMC acceptance ratio
                log_prob = np.sum(model.log_prob(x_obs_, context=params_, mode="mf-fixed-manifold").detach().numpy())
            else:
                log_prob = np.sum(model.log_prob(x_obs_, context=params_, mode="mf").detach().numpy())

            log_prob += simulator.evaluate_log_prior(params)
            return float(log_prob)

    if args.slicesampler:
        logger.debug("Initializing slice sampler")
        sampler = mcmc.SliceSampler(true_parameters, log_posterior, thin=args.thin)
    else:
        logger.debug("Initializing Gaussian Metropolis-Hastings sampler")
        sampler = mcmc.GaussianMetropolis(true_parameters, log_posterior, step=args.mcmcstep, thin=args.thin)

    if args.burnin > 0:
        logger.info("Starting burn in")
        sampler.gen(args.burnin)
    logger.info("Burn in done, starting main chain")
    posterior_samples = sampler.gen(args.mcmcsamples)
    logger.info("MCMC done")

    return posterior_samples
Example #9
def train_pie(args, dataset, model, simulator):
    """ PIE training """

    trainer = (ManifoldFlowTrainer(model) if simulator.parameter_dim() is None
               else ConditionalManifoldFlowTrainer(model))
    logger.info("Starting training PIE on NLL")
    common_kwargs = {
        "dataset": dataset,
        "batch_size": args.batchsize,
        "initial_lr": args.lr,
        "scheduler": optim.lr_scheduler.CosineAnnealingLR,
        "clip_gradient": args.clip,
        "validation_split": args.validationsplit,
    }
    if args.weightdecay is not None:
        common_kwargs["optimizer_kwargs"] = {
            "weight_decay": float(args.weightdecay)
        }

    learning_curves = trainer.train(
        loss_functions=[losses.nll],
        loss_labels=["NLL"],
        loss_weights=[args.nllfactor],
        epochs=args.epochs,
        callbacks=[
            callbacks.save_model_after_every_epoch(
                create_filename("checkpoint", None, args)[:-3] +
                "_epoch_{}.pt")
        ],
        forward_kwargs={"mode": "pie"},
        **common_kwargs,
    )
    learning_curves = np.vstack(learning_curves).T
    return learning_curves
Example #10
def sample_from_model(args, model, simulator, batchsize=200):
    """ Generate samples from model and store """

    logger.info("Sampling from model")

    x_gen_all = []
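    # Accumulate samples in batches of at most batchsize until args.generate samples are drawn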
    while len(x_gen_all) < args.generate:
        n = min(batchsize, args.generate - len(x_gen_all))

        if simulator.parameter_dim() is None:
            x_gen = model.sample(n=n).detach().numpy()

        elif args.trueparam is None:  # Sample from prior
            params = simulator.sample_from_prior(n)
            params = torch.tensor(params, dtype=torch.float)
            x_gen = model.sample(n=n, context=params).detach().numpy()

        else:
            params = simulator.default_parameters(true_param_id=args.trueparam)
            params = np.asarray([params for _ in range(n)])
            params = torch.tensor(params, dtype=torch.float)
            x_gen = model.sample(n=n, context=params).detach().numpy()

        x_gen_all += list(x_gen)

    x_gen_all = np.array(x_gen_all)
    np.save(create_filename("results", "samples", args), x_gen_all)
    return x_gen_all
Example #11
def train_generative_adversarial_manifold_flow_alternating(
        args, dataset, model, simulator):
    """ MFMF-OTA training """

    assert not args.specified

    gen_trainer = (GenerativeTrainer(model) if simulator.parameter_dim() is None
                   else ConditionalGenerativeTrainer(model))
    likelihood_trainer = (ManifoldFlowTrainer(model) if simulator.parameter_dim() is None
                          else ConditionalManifoldFlowTrainer(model))
    metatrainer = AlternatingTrainer(model, gen_trainer, likelihood_trainer)

    meta_kwargs = {
        "dataset": dataset,
        "initial_lr": args.lr,
        "scheduler": optim.lr_scheduler.CosineAnnealingLR,
        "validation_split": args.validationsplit
    }
    if args.weightdecay is not None:
        meta_kwargs["optimizer_kwargs"] = {
            "weight_decay": float(args.weightdecay)
        }

    phase1_kwargs = {"clip_gradient": args.clip}
    phase2_kwargs = {
        "forward_kwargs": {
            "mode": "mf-fixed-manifold"
        },
        "clip_gradient": args.clip
    }

    phase1_parameters = model.parameters()
    phase2_parameters = model.inner_transform.parameters()

    logger.info(
        "Starting training GAMF, alternating between Sinkhorn divergence and log likelihood"
    )
    learning_curves_ = metatrainer.train(
        loss_functions=[losses.make_sinkhorn_divergence(), losses.nll],
        loss_function_trainers=[0, 1],
        loss_labels=["GED", "NLL"],
        loss_weights=[args.sinkhornfactor, args.nllfactor],
        batch_sizes=[args.genbatchsize, args.batchsize],
        epochs=args.epochs // 2,
        parameters=[phase1_parameters, phase2_parameters],
        callbacks=[
            callbacks.save_model_after_every_epoch(
                create_filename("checkpoint", None, args)[:-3] +
                "_epoch_{}.pt")
        ],
        trainer_kwargs=[phase1_kwargs, phase2_kwargs],
        subsets=args.subsets,
        subset_callbacks=[callbacks.print_mf_weight_statistics()]
        if args.debug else None,
        **meta_kwargs,
    )
    learning_curves = np.vstack(learning_curves_).T

    return learning_curves
Example #12
def sample_from_model(args, model, simulator):
    """ Generate samples from model and store """

    logger.info("Sampling from model")
    if simulator.parameter_dim() is None:
        x_gen = model.sample(n=args.generate).detach().numpy()
    else:
        params = simulator.default_parameters(true_param_id=args.trueparam)
        params = np.asarray([params for _ in range(args.generate)])
        params = torch.tensor(params, dtype=torch.float)
        x_gen = model.sample(n=args.generate, context=params).detach().numpy()
    np.save(create_filename("results", "samples", args), x_gen)
    return x_gen
Example #13
    def objective(trial):
        global counter

        counter += 1

        # Hyperparameters
        margs = pick_parameters(args, trial, counter)

        logger.info("Starting training for the following hyperparameters:")
        for k, v in margs.__dict__.items():
            logger.info("  %s: %s", k, v)

        # Bug fix related to some num_workers > 1 and CUDA. Bad things happen otherwise!
        torch.multiprocessing.set_start_method("spawn", force=True)

        # Load data
        simulator = load_simulator(margs)
        dataset = load_training_dataset(simulator, margs)

        # Create model
        model = create_model(margs, simulator)

        # Train
        _ = train.train_model(margs, dataset, model, simulator)

        # Save
        torch.save(model.state_dict(), create_filename("model", None, margs))

        # Evaluate
        model.eval()

        # Evaluate test samples
        log_likelihood_test, reconstruction_error_test, _ = evaluate.evaluate_test_samples(
            margs, simulator, model, paramscan=True)
        mean_log_likelihood_test = np.mean(log_likelihood_test)
        mean_reco_error_test = np.mean(reconstruction_error_test)

        # Generate samples
        x_gen = evaluate.sample_from_model(margs, model, simulator)
        distances_gen = simulator.distance_from_manifold(x_gen)
        mean_gen_distance = np.mean(distances_gen)

        # Report results
        logger.info("Results:")
        logger.info("  test log p:    %s", mean_log_likelihood_test)
        logger.info("  test reco err: %s", mean_reco_error_test)
        logger.info("  gen distance:  %s", mean_gen_distance)

        return (-1.0 * margs.metricnllfactor * mean_log_likelihood_test +
                margs.metricrecoerrorfactor * mean_reco_error_test +
                margs.metricdistancefactor * mean_gen_distance)
Example #14
def timing(args):
    logger.info(
        "Timing algorithm %s with %s outer layers with transformation %s and %s inner layers with transformation %s",
        args.algorithm,
        args.outerlayers,
        args.outertransform,
        args.innerlayers,
        args.innertransform,
    )

    # Bug fix related to some num_workers > 1 and CUDA. Bad things happen otherwise!
    torch.multiprocessing.set_start_method("spawn", force=True)

    if torch.cuda.is_available():
        torch.set_default_tensor_type("torch.cuda.DoubleTensor")

    # Loop over data dims
    all_times = []
    for datadim in args.datadims:
        logger.info("Starting timing for %s-dimensional data", datadim)
        args.datadim = datadim

        # Data
        data = torch.randn(args.batchsize, datadim)
        data.requires_grad = True

        # Model
        model = create_model(args, context_features=None)
        if torch.cuda.is_available():
            model = model.to(torch.device("cuda"))

        # Time forward pass
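        # (Note: this measures wall-clock time around the forward call; on CUDA,
        # kernels launch asynchronously, so calling torch.cuda.synchronize() before
        # reading the clock would make per-pass timings more faithful.)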
        times = []
        for _ in range(args.repeats):
            time_before = time.time()
            _ = model(data)
            times.append(time.time() - time_before)

        logger.info("Mean time: %s s", np.mean(times))

        all_times.append(times)

    # Save results
    logger.info("Saving results")
    np.save(create_filename("timing", None, args), all_times)
Example #15
def create_index_file(root_dir, width_tiles, height_tiles, xchunks, ychunks,
                      xtiles_per_chunk, ytiles_per_chunk):
    filename = os.path.join(root_dir, "world.dat")
    with open(filename, "wb") as f:
        tile_size = np.array([width_tiles, height_tiles], dtype=np.uint32)
        other_data = np.array(
            [xchunks, ychunks, xtiles_per_chunk, ytiles_per_chunk],
            dtype=np.uint16)
        fname_length = np.array([utils.FILENAME_LENGTH], dtype=np.uint8)

        f.write(tile_size.tobytes())
        f.write(other_data.tobytes())
        f.write(fname_length.tobytes())

        f.write(bytes(pad_filename(utils.CHUNK_DIRECTORY), encoding="utf-8"))
        for y in range(ychunks):
            for x in range(xchunks):
                f.write(
                    bytes(pad_filename(utils.create_filename(x, y, xchunks)),
                          encoding="utf-8"))
Example #16
def create_empty(directory, width_tiles, height_tiles, xtiles_per_chunk,
                 ytiles_per_chunk):
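    # Number of chunks along each axis, rounded up (ceiling division)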
    xchunks = int(width_tiles / xtiles_per_chunk) + (
        1 if width_tiles % xtiles_per_chunk != 0 else 0)
    ychunks = int(height_tiles / ytiles_per_chunk) + (
        1 if height_tiles % ytiles_per_chunk != 0 else 0)

    ensure_directory_exists(directory)
    clean_directory(directory)
    ensure_directory_exists(os.path.join(directory, utils.CHUNK_DIRECTORY))
    create_index_file(directory, width_tiles, height_tiles, xchunks, ychunks,
                      xtiles_per_chunk, ytiles_per_chunk)

    data = np.ones(xtiles_per_chunk * ytiles_per_chunk,
                   dtype=utils.TILE_DTYPE) * tiles.TILE_NONE
    for y in range(ychunks):
        for x in range(xchunks):
            filename = os.path.join(directory, utils.CHUNK_DIRECTORY,
                                    utils.create_filename(x, y, xchunks))
            with open(filename, "wb") as f:
                write_chunk(data, f)
Example #17
def train_generative_adversarial_manifold_flow(args, dataset, model,
                                               simulator):
    """ MFMF-OT training """

    gen_trainer = (GenerativeTrainer(model) if simulator.parameter_dim() is None
                   else ConditionalGenerativeTrainer(model))
    common_kwargs = {
        "dataset": dataset,
        "initial_lr": args.lr,
        "scheduler": optim.lr_scheduler.CosineAnnealingLR,
        "clip_gradient": args.clip,
        "validation_split": args.validationsplit,
    }
    if args.weightdecay is not None:
        common_kwargs["optimizer_kwargs"] = {
            "weight_decay": float(args.weightdecay)
        }

    logger.info("Starting training GAMF: Sinkhorn-GAN")

    callbacks_ = [
        callbacks.save_model_after_every_epoch(
            create_filename("checkpoint", None, args)[:-3] + "_epoch_{}.pt")
    ]
    if args.debug:
        callbacks_.append(callbacks.print_mf_weight_statistics())

    learning_curves_ = gen_trainer.train(
        loss_functions=[losses.make_sinkhorn_divergence()],
        loss_labels=["GED"],
        loss_weights=[args.sinkhornfactor],
        epochs=args.epochs,
        callbacks=callbacks_,
        batch_size=args.genbatchsize,
        compute_loss_variance=True,
        **common_kwargs,
    )

    learning_curves = np.vstack(learning_curves_).T
    return learning_curves
Example #18
def download_file(item, path, course, output_dir):
    filepath = create_filepath(course, path)
    description = item["Description"]["Html"]
    topic_type = item["TopicType"]
    title = item["Title"]
    if topic_type == 1:
        filename = create_filename(item)
        full_path = f"{output_dir}/{filepath}/{filename}"
        # These documents are real files that we want to download
        download_from_url(f"""{ufora}{item["Url"]}""", full_path)
        if item["Url"].endswith(".html"):
            # HTML files on Ufora need a little special treatment
            # We'll prepend a title, <base> tag and convert them to pdf
            with open(full_path, "r") as f:
                content = f.read()
            filename_without_extension = ".".join(filename.split(".")[:-1])
            description_path = f"{output_dir}/{filepath}/{filename_without_extension}.pdf"
            create_metadata(description_path, content,
                            filename_without_extension)
            new_content = f"<base href={ufora}><h1>{title}</h1>{content}"
            with open(full_path, "w") as f:
                f.write(new_content)
        elif description:
            # Choosing this filename might cause an overlap...
            filename_without_extension = ".".join(filename.split(".")[:-1])
            description_path = f"{output_dir}/{filepath}/{filename_without_extension}.pdf"
            create_metadata(description_path, description,
                            filename_without_extension)
    elif topic_type == 3:
        # These documents are just clickable links, we'll render them in a pdf
        url = item["Url"]
        filename = create_filename_without_extension(item)
        full_path = f"{output_dir}/{filepath}/{filename}"
        create_metadata(f"{full_path}.pdf",
                        f"<a href={url}>{url}</a>{description}", item["Title"])
    else:
        print(f"Don't know this topic type: {topic_type}")
        exit()
Example #19
def train_dough(args, dataset, model, simulator):
    """ PIE with variable epsilons training """

    trainer = (VariableDimensionManifoldFlowTrainer(model)
               if simulator.parameter_dim() is None
               else ConditionalVariableDimensionManifoldFlowTrainer(model))
    common_kwargs = {
        "dataset": dataset,
        "batch_size": args.batchsize,
        "initial_lr": args.lr,
        "scheduler": optim.lr_scheduler.CosineAnnealingLR,
        "clip_gradient": args.clip,
        "validation_split": args.validationsplit,
    }
    if args.weightdecay is not None:
        common_kwargs["optimizer_kwargs"] = {
            "weight_decay": float(args.weightdecay)
        }

    logger.info(
        "Starting training dough, phase 1: NLL without latent regularization")
    learning_curves = trainer.train(
        loss_functions=[losses.nll],
        loss_labels=["NLL"],
        loss_weights=[args.nllfactor],
        epochs=args.epochs,
        callbacks=[
            callbacks.save_model_after_every_epoch(
                create_filename("checkpoint", None, args)[:-3] +
                "_epoch_{}.pt")
        ],
        l1=args.doughl1reg,
        **common_kwargs,
    )
    learning_curves = np.vstack(learning_curves).T
    return learning_curves
Example #20
def evaluate_test_samples(args, simulator, filename, model=None, ood=False, n_save_reco=100):
    """ Likelihood evaluation """

    logger.info(
        "Evaluating %s samples according to %s, %s likelihood evaluation, saving in %s",
        "the ground truth" if model is None else "a trained model",
        "ood" if ood else "test",
        "with" if not args.skiplikelihood else "without",
        filename,
    )

    # Prepare
    x, _ = simulator.load_dataset(
        train=False, numpy=True, ood=ood, dataset_dir=create_filename("dataset", None, args), true_param_id=args.trueparam, joint_score=False, limit_samplesize=args.evaluate,
    )
    parameter_grid = [None] if simulator.parameter_dim() is None else simulator.eval_parameter_grid(resolution=args.gridresolution)

    log_probs = []
    x_recos = None
    reco_error = None

    # Evaluate
    for i, params in enumerate(parameter_grid):
        logger.debug("Evaluating grid point %s / %s", i + 1, len(parameter_grid))
        if model is None:
            params_ = None if params is None else np.asarray([params for _ in x])
            log_prob = simulator.log_density(x, parameters=params_)

        else:
            log_prob = []
            reco_error_ = []
            x_recos_ = []
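            # Ceiling division: number of minibatches needed to cover args.evaluate samples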
            n_batches = (args.evaluate - 1) // args.evalbatchsize + 1
            for j in range(n_batches):
                x_ = torch.tensor(x[j * args.evalbatchsize : (j + 1) * args.evalbatchsize], dtype=torch.float)
                if params is None:
                    params_ = None
                else:
                    params_ = np.asarray([params for _ in x_])
                    params_ = torch.tensor(params_, dtype=torch.float)

                if args.algorithm == "flow":
                    x_reco, log_prob_, _ = model(x_, context=params_)
                elif args.algorithm in ["pie", "slice"]:
                    x_reco, log_prob_, _ = model(x_, context=params_, mode=args.algorithm if not args.skiplikelihood else "projection")
                else:
                    x_reco, log_prob_, _ = model(x_, context=params_, mode="mf" if not args.skiplikelihood else "projection")

                if not args.skiplikelihood:
                    log_prob.append(log_prob_.detach().numpy())
                reco_error_.append((sum_except_batch((x_ - x_reco) ** 2) ** 0.5).detach().numpy())
                x_recos_.append(x_reco.detach().numpy())

            if not args.skiplikelihood:
                log_prob = np.concatenate(log_prob, axis=0)
            if reco_error is None:
                reco_error = np.concatenate(reco_error_, axis=0)
            if x_recos is None:
                x_recos = np.concatenate(x_recos_, axis=0)

        if not args.skiplikelihood:
            log_probs.append(log_prob)

    # Save results
    if len(log_probs) > 0:
        if simulator.parameter_dim() is None:
            log_probs = log_probs[0]

        np.save(create_filename("results", filename.format("log_likelihood"), args), log_probs)

    if x_recos is not None:
        np.save(create_filename("results", filename.format("x_reco"), args), x_recos[:n_save_reco])

    if reco_error is not None:
        np.save(create_filename("results", filename.format("reco_error"), args), reco_error)

    if parameter_grid is not None:
        np.save(create_filename("results", "parameter_grid_test", args), parameter_grid)
Example #21
        create_modelname(args)
        logger.info("Evaluating simulator truth")
    else:
        create_modelname(args)
        logger.info("Evaluating model %s", args.modelname)

    # Bug fix related to some num_workers > 1 and CUDA. Bad things happen otherwise!
    torch.multiprocessing.set_start_method("spawn", force=True)

    # Data set
    simulator = load_simulator(args)

    # Load model
    if not args.truth:
        model = create_model(args, simulator=simulator)
        model.load_state_dict(torch.load(create_filename("model", None, args), map_location=torch.device("cpu")))
        model.eval()
    else:
        model = None

    # Evaluate generative performance
    if args.skipgeneration:
        logger.info("Skipping generative evaluation")
    elif not args.truth:
        x_gen = sample_from_model(args, model, simulator)
        evaluate_model_samples(args, simulator, x_gen)

    if args.skipinference:
        logger.info("Skipping all inference tasks. Have a nice day!")
        exit()
Example #22
                if containing_file == path_item:
                    is_file_found = True
                    break

            if is_file_found:
                utils.chdir(path_item)
                is_file_found = False
            else:
                print(f'NOT FOUND FILE: {path_item}')
                is_allow_continue = False

        return is_allow_continue


    file_locs = utils.get_all_file_locs(TARGETS)
    file_name = utils.create_filename(FILE_NAME_PREFIX, FILE_EXT)

    path_array = DESTINATION.split('\\')
    path_root = path_array[0]
    utils.chdir(path_root)

    utils.log_folders_destination_targets(DESTINATION, TARGETS, file_name)

    if validate_target_loc():
        with utils.zipfile.ZipFile(file_name, 'x', utils.zipfile.ZIP_LZMA, True) as backup_file:
            backup_file.close()

        # Starting pool with x workers.
        with Pool(processes=MAX_PROCESSES) as pool:
            # Launching multiple evaluations asynchronously *may* use more processes
            multiple_results = [pool.apply_async(utils.f, (i, file_name,)) for i in file_locs]
Example #23
                    help='save the agent and the results')
parser.add_argument('--flipped_terminals',
                    action='store_true',
                    default=False,
                    help='flip the rewards associated '
                    'with terminal 1 and terminal 2')
parser.add_argument('--flipped_actions',
                    action='store_true',
                    default=False,
                    help='Shuffle the actions to cancel '
                    'the effect of model learning')

args = parser.parse_args()
experiment_settings = get_experiment_setting(args)
domain_settings = get_domain_setting(args)
filename = create_filename(args)
print("file: ", filename)
experiment_settings['filename'] = filename

if experiment_settings['method'] == 'sarsa_lambda':
    agent_config = []
    from sarsa_lambda.sarsa_lambda import build_agent, load_agent
elif experiment_settings['method'] == 'MuZero':
    from muzero.MuZeroAgent import MuZeroAgent, build_agent, load_agent
    from muzero.env import muzero_config
    agent_config = muzero_config
    agent_config.flippedTask = args.flipped_terminals
    agent_config.flippedActions = args.flipped_actions
else:
    assert False, 'HvS: Invalid method id.'
Example #24
def train_manifold_flow_sequential(args, dataset, model, simulator):
    """ Sequential MFMF-M/D training """

    assert not args.specified

    if simulator.parameter_dim() is None:
        trainer1 = ForwardTrainer(model)
        trainer2 = ForwardTrainer(model)
    else:
        trainer1 = ConditionalForwardTrainer(model)
        if args.scandal is None:
            trainer2 = ConditionalForwardTrainer(model)
        else:
            trainer2 = SCANDALForwardTrainer(model)

    common_kwargs, scandal_loss, scandal_label, scandal_weight = make_training_kwargs(
        args, dataset)

    callbacks1 = [
        callbacks.save_model_after_every_epoch(
            create_filename("checkpoint", "A", args)),
        callbacks.print_mf_latent_statistics(),
        callbacks.print_mf_weight_statistics()
    ]
    callbacks2 = [
        callbacks.save_model_after_every_epoch(
            create_filename("checkpoint", "B", args)),
        callbacks.print_mf_latent_statistics(),
        callbacks.print_mf_weight_statistics()
    ]
    if simulator.is_image():
        callbacks1.append(
            callbacks.plot_sample_images(
                create_filename("training_plot", "sample_epoch_A", args),
                context=None if simulator.parameter_dim() is None else
                torch.zeros(30, simulator.parameter_dim()),
            ))
        callbacks2.append(
            callbacks.plot_sample_images(
                create_filename("training_plot", "sample_epoch_B", args),
                context=None if simulator.parameter_dim() is None else
                torch.zeros(30, simulator.parameter_dim()),
            ))
        callbacks1.append(
            callbacks.plot_reco_images(
                create_filename("training_plot", "reco_epoch_A", args)))
        callbacks2.append(
            callbacks.plot_reco_images(
                create_filename("training_plot", "reco_epoch_B", args)))

    logger.info("Starting training MF, phase 1: manifold training")
    learning_curves = trainer1.train(
        loss_functions=[losses.smooth_l1_loss if args.l1 else losses.mse] +
        ([] if args.uvl2reg is None else [losses.hiddenl2reg]),
        loss_labels=["L1" if args.l1 else "MSE"] +
        ([] if args.uvl2reg is None else ["L2_lat"]),
        loss_weights=[args.msefactor] +
        ([] if args.uvl2reg is None else [args.uvl2reg]),
        epochs=args.epochs // 2,
        parameters=(list(model.outer_transform.parameters()) +
                    list(model.encoder.parameters())
                    if args.algorithm == "emf"
                    else list(model.outer_transform.parameters())),
        callbacks=callbacks1,
        forward_kwargs={
            "mode": "projection",
            "return_hidden": args.uvl2reg is not None
        },
        initial_epoch=args.startepoch,
        **common_kwargs,
    )
    learning_curves = np.vstack(learning_curves).T

    logger.info("Starting training MF, phase 2: density training")
    learning_curves_ = trainer2.train(
        loss_functions=[losses.nll] + scandal_loss,
        loss_labels=["NLL"] + scandal_label,
        loss_weights=[
            args.nllfactor * nat_to_bit_per_dim(args.modellatentdim)
        ] + scandal_weight,
        epochs=args.epochs - (args.epochs // 2),
        parameters=list(model.inner_transform.parameters()),
        callbacks=callbacks2,
        forward_kwargs={"mode": "mf-fixed-manifold"},
        initial_epoch=args.startepoch - args.epochs // 2,
        **common_kwargs,
    )
    learning_curves = np.vstack(
        (learning_curves, np.vstack(learning_curves_).T))

    return learning_curves
Example #25
def train_manifold_flow_alternating(args, dataset, model, simulator):
    """ MFMF-A training """

    assert not args.specified

    trainer1 = (ForwardTrainer(model) if simulator.parameter_dim() is None
                else ConditionalForwardTrainer(model))
    trainer2 = (ForwardTrainer(model) if simulator.parameter_dim() is None
                else ConditionalForwardTrainer(model) if args.scandal is None
                else SCANDALForwardTrainer(model))
    metatrainer = AlternatingTrainer(model, trainer1, trainer2)

    meta_kwargs = {
        "dataset": dataset,
        "initial_lr": args.lr,
        "scheduler": optim.lr_scheduler.CosineAnnealingLR,
        "validation_split": args.validationsplit
    }
    if args.weightdecay is not None:
        meta_kwargs["optimizer_kwargs"] = {
            "weight_decay": float(args.weightdecay)
        }
    _, scandal_loss, scandal_label, scandal_weight = make_training_kwargs(
        args, dataset)

    phase1_kwargs = {
        "forward_kwargs": {
            "mode": "projection"
        },
        "clip_gradient": args.clip
    }
    phase2_kwargs = {
        "forward_kwargs": {
            "mode": "mf-fixed-manifold"
        },
        "clip_gradient": args.clip
    }

    phase1_parameters = (list(model.outer_transform.parameters()) +
                         list(model.encoder.parameters())
                         if args.algorithm == "emf"
                         else model.outer_transform.parameters())
    phase2_parameters = list(model.inner_transform.parameters())

    logger.info(
        "Starting training MF, alternating between reconstruction error and log likelihood"
    )
    learning_curves_ = metatrainer.train(
        loss_functions=[
            losses.smooth_l1_loss if args.l1 else losses.mse, losses.nll
        ] + scandal_loss,
        loss_function_trainers=[0, 1] + ([1] if args.scandal is not None else []),
        loss_labels=["L1" if args.l1 else "MSE", "NLL"] + scandal_label,
        loss_weights=[
            args.msefactor,
            args.nllfactor * nat_to_bit_per_dim(args.modellatentdim)
        ] + scandal_weight,
        epochs=args.epochs // 2,
        subsets=args.subsets,
        batch_sizes=[args.batchsize, args.batchsize],
        parameters=[phase1_parameters, phase2_parameters],
        callbacks=[
            callbacks.save_model_after_every_epoch(
                create_filename("checkpoint", None, args))
        ],
        trainer_kwargs=[phase1_kwargs, phase2_kwargs],
        **meta_kwargs,
    )
    learning_curves = np.vstack(learning_curves_).T

    return learning_curves
Example #26
def train_manifold_flow(args, dataset, model, simulator):
    """ MFMF-S training """

    assert not args.specified

    trainer = (ForwardTrainer(model) if simulator.parameter_dim() is None
               else ConditionalForwardTrainer(model) if args.scandal is None
               else SCANDALForwardTrainer(model))
    common_kwargs, scandal_loss, scandal_label, scandal_weight = make_training_kwargs(
        args, dataset)

    logger.info(
        "Starting training MF, phase 1: pretraining on reconstruction error")
    learning_curves = trainer.train(
        loss_functions=[losses.mse],
        loss_labels=["MSE"],
        loss_weights=[args.msefactor],
        epochs=args.epochs // 3,
        callbacks=[
            callbacks.save_model_after_every_epoch(
                create_filename("checkpoint", "A", args))
        ],
        forward_kwargs={"mode": "projection"},
        initial_epoch=args.startepoch,
        **common_kwargs,
    )
    learning_curves = np.vstack(learning_curves).T

    logger.info("Starting training MF, phase 2: mixed training")
    learning_curves_ = trainer.train(
        loss_functions=[losses.mse, losses.nll] + scandal_loss,
        loss_labels=["MSE", "NLL"] + scandal_label,
        loss_weights=[
            args.msefactor,
            args.addnllfactor * nat_to_bit_per_dim(args.modellatentdim)
        ] + scandal_weight,
        epochs=args.epochs - 2 * (args.epochs // 3),
        parameters=list(model.parameters()),
        callbacks=[
            callbacks.save_model_after_every_epoch(
                create_filename("checkpoint", "B", args))
        ],
        forward_kwargs={"mode": "mf"},
        initial_epoch=args.startepoch - (args.epochs // 3),
        **common_kwargs,
    )
    learning_curves_ = np.vstack(learning_curves_).T
    learning_curves = learning_curves_ if learning_curves is None else np.vstack(
        (learning_curves, learning_curves_))

    logger.info(
        "Starting training MF, phase 3: training only inner flow on NLL")
    learning_curves_ = trainer.train(
        loss_functions=[losses.mse, losses.nll] + scandal_loss,
        loss_labels=["MSE", "NLL"] + scandal_label,
        loss_weights=[
            0.0, args.nllfactor * nat_to_bit_per_dim(args.modellatentdim)
        ] + scandal_weight,
        epochs=args.epochs // 3,
        parameters=list(model.inner_transform.parameters()),
        callbacks=[
            callbacks.save_model_after_every_epoch(
                create_filename("checkpoint", "C", args))
        ],
        forward_kwargs={"mode": "mf-fixed-manifold"},
        initial_epoch=args.startepoch - (args.epochs - (args.epochs // 3)),
        **common_kwargs,
    )
    learning_curves_ = np.vstack(learning_curves_).T
    learning_curves = np.vstack((learning_curves, learning_curves_))

    return learning_curves
Example #27

if __name__ == "__main__":
    # Logger
    args = parse_args()
    logging.basicConfig(
        format="%(asctime)-5.5s %(name)-20.20s %(levelname)-7.7s %(message)s",
        datefmt="%H:%M",
        level=logging.DEBUG if args.debug else logging.INFO)
    logger.info("Hi!")
    logger.debug("Starting train.py with arguments %s", args)

    create_modelname(args)

    if args.resume is not None:
        resume_filename = create_filename("resume", None, args)
        args.startepoch = args.resume
        logger.info(
            "Resuming training. Loading file %s and continuing with epoch %s.",
            resume_filename, args.resume + 1)
    elif args.load is None:
        logger.info("Training model %s with algorithm %s on data set %s",
                    args.modelname, args.algorithm, args.dataset)
    else:
        logger.info(
            "Loading model %s and training it as %s with algorithm %s on data set %s",
            args.load, args.modelname, args.algorithm, args.dataset)

    # Bug fix related to some num_workers > 1 and CUDA. Bad things happen otherwise!
    torch.multiprocessing.set_start_method("spawn", force=True)
Example #28
        # Generate samples
        logger.info("Evaluating sample closure")
        x_gen = evaluate.sample_from_model(margs, model, simulator)
        distances_gen = simulator.distance_from_manifold(x_gen)
        mean_gen_distance = np.mean(distances_gen)

        # Report results
        logger.info("Results:")
        logger.info("  reco err:     %s", reco_error)
        logger.info("  gen distance: %s", mean_gen_distance)

        return margs.metricrecoerrorfactor * reco_error + margs.metricdistancefactor * mean_gen_distance

    # Load saved study object
    if args.resumestudy:
        filename = create_filename("paramscan", None, args)
        logger.info("Loading parameter scan from %s", filename)

        with open(filename, "rb") as file:
            study = pickle.load(file)

    else:
        study = optuna.create_study(study_name=args.paramscanstudyname,
                                    direction="minimize")

    # Optimize!
    try:
        study.optimize(objective, n_trials=args.trials)
    except (KeyboardInterrupt, SystemExit):
        logger.warning("Optimization interrupted!")
Example #29
    def objective(trial):
        global counter

        counter += 1

        # Hyperparameters
        margs = pick_parameters(args, trial, counter)

        logger.info(f"Starting run {counter} / {args.trials}")
        logger.info(f"Hyperparams:")
        logger.info(f"  outer layers:      {margs.outerlayers}")
        logger.info(f"  inner layers:      {margs.innerlayers}")
        logger.info(f"  linear transform:  {margs.lineartransform}")
        logger.info(f"  spline range:      {margs.splinerange}")
        logger.info(f"  spline bins:       {margs.splinebins}")
        logger.info(f"  batchnorm:         {margs.batchnorm}")
        logger.info(f"  dropout:           {margs.dropout}")
        logger.info(f"  batch size:        {margs.batchsize}")
        logger.info(f"  MSE factor:        {margs.msefactor}")
        logger.info(f"  latent L2 reg:     {margs.uvl2reg}")
        logger.info(f"  weight decay:      {margs.weightdecay}")
        logger.info(f"  gradient clipping: {margs.clip}")

        # Bug fix related to some num_workers > 1 and CUDA. Bad things happen otherwise!
        torch.multiprocessing.set_start_method("spawn", force=True)

        # Load data
        simulator = load_simulator(margs)
        dataset = simulator.load_dataset(train=True,
                                         dataset_dir=create_filename(
                                             "dataset", None, args),
                                         limit_samplesize=margs.samplesize)

        # Create model
        model = create_model(margs, simulator)

        # Train
        trainer1 = (ForwardTrainer(model) if simulator.parameter_dim() is None
                    else ConditionalForwardTrainer(model))
        trainer2 = (ForwardTrainer(model) if simulator.parameter_dim() is None
                    else ConditionalForwardTrainer(model))
        common_kwargs, _, _, _ = train.make_training_kwargs(margs, dataset)

        logger.info("Starting training MF, phase 1: manifold training")
        np.random.seed(123)
        _, val_losses = trainer1.train(
            loss_functions=[losses.mse, losses.hiddenl2reg],
            loss_labels=["MSE", "L2_lat"],
            loss_weights=[
                margs.msefactor,
                0.0 if margs.uvl2reg is None else margs.uvl2reg
            ],
            epochs=margs.epochs,
            parameters=(list(model.outer_transform.parameters()) +
                        list(model.encoder.parameters())
                        if args.algorithm == "emf"
                        else model.outer_transform.parameters()),
            forward_kwargs={
                "mode": "projection",
                "return_hidden": True
            },
            **common_kwargs,
        )

        logger.info("Starting training MF, phase 2: density training")
        np.random.seed(123)
        _ = trainer2.train(
            loss_functions=[losses.nll],
            loss_labels=["NLL"],
            loss_weights=[args.nllfactor],
            epochs=args.densityepochs,
            parameters=model.inner_transform.parameters(),
            forward_kwargs={"mode": "mf-fixed-manifold"},
            **common_kwargs,
        )

        # Save
        torch.save(model.state_dict(), create_filename("model", None, margs))

        # Evaluate reco error
        logger.info("Evaluating reco error")
        model.eval()
        np.random.seed(123)
        x, params = next(
            iter(
                trainer1.make_dataloader(
                    simulator.load_dataset(train=True,
                                           dataset_dir=create_filename(
                                               "dataset", None, args),
                                           limit_samplesize=args.samplesize),
                    args.validationsplit, 1000, 0)[1]))
        x = x.to(device=trainer1.device, dtype=trainer1.dtype)
        params = None if simulator.parameter_dim() is None else params.to(
            device=trainer1.device, dtype=trainer1.dtype)
        x_reco, _, _ = model(x, context=params, mode="projection")
        reco_error = torch.mean(torch.sum((x - x_reco)**2,
                                          dim=1)**0.5).detach().cpu().numpy()

        # Generate samples
        logger.info("Evaluating sample closure")
        x_gen = evaluate.sample_from_model(margs, model, simulator)
        distances_gen = simulator.distance_from_manifold(x_gen)
        mean_gen_distance = np.mean(distances_gen)

        # Report results
        logger.info("Results:")
        logger.info("  reco err:     %s", reco_error)
        logger.info("  gen distance: %s", mean_gen_distance)

        return margs.metricrecoerrorfactor * reco_error + margs.metricdistancefactor * mean_gen_distance
Example #30
        logger.info("Evaluating simulator truth")
    else:
        create_modelname(args)
        logger.info("Evaluating model %s", args.modelname)

    # Bug fix related to some num_workers > 1 and CUDA. Bad things happen otherwise!
    torch.multiprocessing.set_start_method("spawn", force=True)

    # Data set
    simulator = load_simulator(args)

    # Load model
    if not args.truth:
        model = create_model(args, simulator=simulator)
        model.load_state_dict(
            torch.load(create_filename("model", None, args),
                       map_location=torch.device("cpu")))
        model.eval()
    else:
        model = None

    # Evaluate generative performance
    if args.skipgeneration:
        logger.info("Skipping generative evaluation as per request.")
    elif not args.truth:
        x_gen = sample_from_model(args, model, simulator)
        evaluate_model_samples(args, simulator, x_gen)

    if args.skipinference:
        logger.info(
            "Skipping all inference tasks as per request. Have a nice day!")