Пример #1
0
class CometLogger():
    """Wrapper around a comet.ml experiment used for training-time logging.

    Depending on the constructor arguments it either starts a new
    ``Experiment`` or resumes an ``ExistingExperiment``; the remaining
    methods forward to that experiment object.
    """

    def __init__(self, enabled, is_existing=False, prev_exp_key=None):
        """
        Create (or resume) the underlying comet experiment.

        Args:
            enabled: When falsy, the comet experiment is created with
                ``disabled=True`` and the HTML/plot helpers return early.
            is_existing: Resume a previous experiment instead of creating one.
            prev_exp_key: Key of the experiment to resume; required when
                ``is_existing`` is true.

        Raises:
            ValueError: ``is_existing`` is true but ``prev_exp_key`` is None.
        """
        disabled = not enabled

        if not is_existing:
            self.experiment = Experiment(api_key=COMET_API_KEY,
                                         workspace=COMET_WORKSPACE,
                                         project_name=PROJECT_NAME,
                                         disabled=disabled)
        else:
            if prev_exp_key is None:
                raise ValueError(
                    "Requested existing experiment, but no key provided")
            print("Continuing existing experiment with key: ", prev_exp_key)
            self.experiment = ExistingExperiment(
                api_key=COMET_API_KEY,
                workspace=COMET_WORKSPACE,
                project_name=PROJECT_NAME,
                disabled=disabled,
                previous_experiment=prev_exp_key)
        self.disabled = disabled

    def get_experiment_key(self):
        """Return the first 9 characters of the comet experiment key."""
        return self.experiment.get_key()[:9]

    def add_tag(self, tag):
        """Attach ``tag`` to the experiment."""
        self.experiment.add_tag(tag)

    def log_metric(self, name, value, step=None):
        """Log a single metric value, optionally at ``step``."""
        self.experiment.log_metric(name, value, step=step)

    def log_metrics(self, metrics_dict, prefix, step=None):
        """Log every entry of ``metrics_dict`` under ``prefix``."""
        self.experiment.log_metrics(metrics_dict, prefix=prefix, step=step)

    def log_params(self, params_dict):
        """Log a dictionary of parameters."""
        self.experiment.log_parameters(params_dict)

    def set_name(self, name_str):
        """Set the experiment's display name."""
        self.experiment.set_name(name_str)

    # The annotation is quoted so that defining the class does not require the
    # dataset type to be resolvable at definition time.
    def log_dataset(self, dataset: "SpeakerVerificationDataset"):
        """Log an HTML summary of ``dataset`` to the experiment.

        The summary holds the number of speakers (``len(dataset.speakers)``)
        followed by ``dataset.get_logs()``, with newlines turned into
        ``<br>``. Does nothing when the logger is disabled.
        """
        if self.disabled:
            return
        dataset_string = ""
        dataset_string += "<b>Speakers</b>: %s\n" % len(dataset.speakers)
        dataset_string += "\n" + dataset.get_logs()
        dataset_string = dataset_string.replace("\n", "<br>")
        # This class has no visdom handle (`self.vis` was never assigned), so
        # the HTML goes to the comet experiment instead.
        self.experiment.log_html("<h3>Dataset</h3>" + dataset_string)

    def log_implementation(self, params):
        """Log the ``params`` mapping as HTML, one ``<b>name</b>: value`` per line.

        The rendered string is also kept on ``self.implementation_string``.
        Does nothing when the logger is disabled.
        """
        if self.disabled:
            return
        implementation_string = "".join(
            "<b>%s</b>: %s\n" % (param, value)
            for param, value in params.items()).replace("\n", "<br>")
        self.implementation_string = implementation_string
        # Same reasoning as log_dataset: there is no visdom window here, so the
        # text is sent to the comet experiment.
        self.experiment.log_html(
            "<h3>Training implementation</h3>" + implementation_string)

    def draw_projections(self,
                         embeds,
                         utterances_per_speaker,
                         step,
                         out_fpath=None,
                         max_speakers=16):
        """Plot a 2-D UMAP projection of ``embeds`` and log it to comet.

        Args:
            embeds: Embeddings to project; only the first
                ``max_speakers * utterances_per_speaker`` rows are used.
                NOTE(review): assumes rows are grouped by speaker, with
                ``utterances_per_speaker`` consecutive rows each - confirm
                against the caller.
            utterances_per_speaker: Number of embeddings per speaker.
            step: Training step, used in the title and passed to comet.
            out_fpath: When given, the plot is saved there and that file is
                logged as an image; otherwise the pyplot figure is uploaded
                directly.
            max_speakers: Upper bound on plotted speakers (further capped by
                the number of entries in ``colormap``).
        """
        if self.disabled:
            return
        # One colormap entry is consumed per speaker index.
        max_speakers = min(max_speakers, len(colormap))
        embeds = embeds[:max_speakers * utterances_per_speaker]

        n_speakers = len(embeds) // utterances_per_speaker
        ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
        colors = [colormap[i] for i in ground_truth]

        reducer = umap.UMAP()
        projected = reducer.fit_transform(embeds)
        plt.scatter(projected[:, 0], projected[:, 1], c=colors)
        plt.gca().set_aspect("equal", "datalim")
        title = "UMAP projection (step %d)" % step
        plt.title(title)
        if out_fpath is not None:
            plt.savefig(out_fpath)
            self.experiment.log_image(out_fpath, step=step)
        else:
            # No file was written, so there is no path to hand to log_image;
            # upload the current pyplot figure before it is cleared.
            self.experiment.log_figure(figure_name=title, step=step)
        plt.clf()
Пример #2
0
def main(args, config=None, init_distributed=False):
    """Train a model, optionally mirroring parameters and metrics to comet.ml.

    Args:
        args: Parsed training arguments (passed to ``vars()`` when logged, so
            expected to be a namespace-like object).
        config: Optional mapping with ``"api_key"`` and ``"experiment_key"``.
            When given, that existing comet experiment is resumed and used
            for logging; otherwise nothing is sent to comet.
        init_distributed: When true, distributed training is initialised and
            the resulting rank is stored on ``args.distributed_rank``.

    Raises:
        AssertionError: neither ``args.max_tokens`` nor ``args.max_sentences``
            is set.
    """
    utils.import_user_module(args)

    experiment = None
    if config:
        experiment = ExistingExperiment(
            api_key=config["api_key"],
            previous_experiment=config["experiment_key"],
            auto_output_logging=None,
        )

    assert (
        args.max_tokens is not None or args.max_sentences is not None
    ), "Must specify batch size either with --max-tokens or --max-sentences"

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    print(args)
    # Every comet entry is prefixed with the device id so that several
    # processes logging to the same experiment stay distinguishable.
    if experiment:
        experiment.log_parameters(vars(args),
                                  prefix="Device {} :: ".format(
                                      args.device_id))

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(","):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    criterion_name = criterion.__class__.__name__
    # Count parameters once; the numbers are used for both stdout and comet.
    num_params = sum(p.numel() for p in model.parameters())
    num_trained_params = sum(
        p.numel() for p in model.parameters() if p.requires_grad)
    print(model)
    print("| model {}, criterion {}".format(args.arch, criterion_name))
    print("| num. model params: {} (num. trained: {})".format(
        num_params,
        num_trained_params,
    ))

    if experiment:
        experiment.log_parameters(
            {
                "criterion": criterion_name,
                "num. model params": num_params,
                "num. trained params": num_trained_params,
            },
            prefix="Device {} :: ".format(args.device_id),
        )

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print("| training on {} GPUs".format(args.distributed_world_size))
    print("| max tokens per GPU = {} and max sentences per GPU = {}".format(
        args.max_tokens, args.max_sentences))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(",")
    # Bound up front: the loop below may not execute at all (e.g. a restored
    # checkpoint already at max_epoch), and the final comet log reads it.
    valid_losses = [None]
    while (lr > args.min_lr and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update):
        # train for one epoch
        train(args, trainer, task, epoch_itr, experiment)

        if (not args.disable_validation
                and epoch_itr.epoch % args.validate_interval == 0):
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets, experiment)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                             valid_losses[0])

        reload_dataset = ":" in getattr(args, "data", "")
        # sharded data: get train iterator for next epoch
        epoch_itr = trainer.get_train_iterator(epoch_itr.epoch,
                                               load_dataset=reload_dataset)
    train_meter.stop()
    print("| done training in {:.1f} seconds".format(train_meter.sum))

    if experiment:
        experiment.log_metrics(
            {
                "valid_loss": valid_losses[0],
                "lr": lr
            },
            prefix="Device {} ".format(args.device_id),
        )
Пример #3
0
    # Attach to comet.ml: resume the experiment whose key is stored in the
    # checkpoint, or start a new one. Automatic parameter/metric capture and
    # CLI parsing are switched off; everything is logged explicitly below.
    if args.resume:
        experiment = ExistingExperiment(
            api_key=args.api_key,
            previous_experiment=checkpoint['experiment_key'],
            auto_param_logging=False,
            auto_metric_logging=False,
            parse_args=False)
    else:
        experiment = Experiment(
            api_key=args.api_key,
            project_name=project_name,
            auto_param_logging=False,
            auto_metric_logging=False,
            parse_args=False)
    experiment.log_other('experiment_name', experiment_name)
    experiment.log_parameters(vars(args))
    # Nested groups are logged together with the group name as prefix.
    # isinstance (rather than an exact type comparison) also expands dict
    # subclasses such as OrderedDict instead of logging them as one value.
    for k in hyperparameters:
        if isinstance(hyperparameters[k], dict):
            experiment.log_parameters(hyperparameters[k], prefix=k)
        else:
            experiment.log_parameter(k, hyperparameters[k])

# Mapping: {'Cat': 0, 'Dog': 1}
# Load the image folder; if that fails, unpack trainset.zip into the current
# directory and try again.
try:
    dataset = torchvision.datasets.ImageFolder(root='./trainset')
except (OSError, RuntimeError):
    # NOTE(review): a missing/empty folder surfaces as FileNotFoundError (an
    # OSError) or RuntimeError depending on the torchvision version - confirm
    # for the pinned version. Catching only these keeps Ctrl-C and unrelated
    # errors from being swallowed.
    import zipfile
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile('trainset.zip', 'r') as zip_ref:
        zip_ref.extractall()
    dataset = torchvision.datasets.ImageFolder(root='./trainset')
Пример #4
0
def main(opts):
    """
    Resolve the options, prepare the output path and the comet experiment,
    then build a Trainer and run training.

    Opts precedence (later steps override earlier ones):
        1. Load file specified in args.default (or shared/trainer/defaults.yaml
           if none is provided)
        2. Update with file specified in args.config (or no update if none is provided)
        3. Update with parsed command-line arguments

        e.g.
        `python train.py args.config=config/large-lr.yaml data.loaders.batch_size=10`
        loads defaults, overrides with values in large-lr.yaml and sets batch_size to 10

    Args:
        opts: Configuration object accepted by ``OmegaConf.to_container``;
            its ``args`` entry is popped and treated as the script arguments.
    """

    # -----------------------------
    # -----  Parse arguments  -----
    # -----------------------------

    hydra_opts = Dict(OmegaConf.to_container(opts))
    args = hydra_opts.pop("args", None)
    auto_resumed = {}

    config_path = args.config

    # When resuming, prefer the opts.yaml stored in the previous output path.
    if hydra_opts.train.resume:
        out_ = str(env_to_path(hydra_opts.output_path))
        config_path = Path(out_) / "opts.yaml"
        if not config_path.exists():
            config_path = None
            print("WARNING: could not reuse the opts in {}".format(out_))

    default = args.default or Path(
        __file__).parent / "shared/trainer/defaults.yaml"

    # -----------------------
    # -----  Load opts  -----
    # -----------------------

    opts = load_opts(config_path, default=default, commandline_opts=hydra_opts)
    if args.resume:
        opts.train.resume = True

    opts.jobID = os.environ.get("SLURM_JOBID")
    opts.slurm_partition = os.environ.get("SLURM_JOB_PARTITION")
    opts.output_path = str(env_to_path(opts.output_path))
    print("Config output_path:", opts.output_path)

    exp = comet_previous_id = None

    # -------------------------------
    # -----  Check output_path  -----
    # -------------------------------

    # Auto-continue if same slurm job ID (=job was requeued)
    if not opts.train.resume and opts.train.auto_resume:
        print("\n\nTrying to auto-resume...")
        existing_path = find_existing_training(opts)
        if existing_path is not None and existing_path.exists():
            auto_resumed["original output_path"] = str(opts.output_path)
            auto_resumed["existing_path"] = str(existing_path)
            opts.train.resume = True
            opts.output_path = str(existing_path)

    # Still not resuming: creating new output path
    if not opts.train.resume:
        opts.output_path = str(get_increased_path(opts.output_path))
        Path(opts.output_path).mkdir(parents=True, exist_ok=True)

    # Copy the opts's sbatch_file to output_path
    copy_run_files(opts)
    # store git hash
    opts.git_hash = get_git_revision_hash()
    opts.git_branch = get_git_branch()

    if not args.no_comet:
        # ----------------------------------
        # -----  Set Comet Experiment  -----
        # ----------------------------------

        if opts.train.resume:
            # Is resuming: get existing comet exp id
            assert Path(
                opts.output_path).exists(), "Output_path does not exist"

            comet_previous_id = get_existing_comet_id(opts.output_path)
            # Continue existing experiment
            if comet_previous_id is None:
                print("WARNING could not retreive previous comet id")
                print(f"from {opts.output_path}")
            else:
                print("Continuing previous experiment", comet_previous_id)
                auto_resumed["continuing exp id"] = comet_previous_id
                exp = ExistingExperiment(previous_experiment=comet_previous_id,
                                         **comet_kwargs)
                print("Comet Experiment resumed")

        # Either not resuming, or the previous comet id could not be found.
        if exp is None:
            # Create new experiment
            print("Starting new experiment")
            exp = Experiment(project_name="climategan", **comet_kwargs)
            exp.log_asset_folder(
                str(Path(__file__).parent / "climategan"),
                recursive=True,
                log_file_name=True,
            )
            exp.log_asset(str(Path(__file__)))

        # Log note
        if args.note:
            exp.log_parameter("note", args.note)

        # Merge and log tags
        if args.comet_tags or opts.comet.tags:
            tags = {f"branch:{opts.git_branch}"}
            if args.comet_tags:
                tags.update(args.comet_tags)
            if opts.comet.tags:
                tags.update(opts.comet.tags)
            opts.comet.tags = list(tags)
            print("Logging to comet.ml with tags", opts.comet.tags)
            exp.add_tags(opts.comet.tags)

        # Log all opts
        exp.log_parameters(flatten_opts(opts))
        if auto_resumed:
            exp.log_text("\n".join(f"{k:20}: {v}"
                                   for k, v in auto_resumed.items()))

        # allow some time for comet to get its url
        sleep(1)

        # Save comet exp url
        url_path = get_increased_path(Path(opts.output_path) / "comet_url.txt")
        with open(url_path, "w") as f:
            f.write(exp.url)

    # Save config file.
    # Done whether or not comet is enabled: the resume logic at the top of
    # this function looks for opts.yaml in the output path.
    opts_path = get_increased_path(Path(opts.output_path) / "opts.yaml")
    with (opts_path).open("w") as f:
        yaml.safe_dump(opts.to_dict(), f)

    pprint("Running model in", opts.output_path)

    # -------------------
    # -----  Train  -----
    # -------------------

    trainer = Trainer(opts, comet_exp=exp, verbose=1)
    trainer.logger.time.start_time = time()
    trainer.setup()
    trainer.train()

    # -----------------------------
    # -----  End of training  -----
    # -----------------------------

    pprint("Done training")
    kill_job(opts.jobID)
Пример #5
0
class CometLogger():
    """Wrapper around a comet.ml experiment with a small local persistence helper.

    Starts a new ``Experiment`` or resumes an ``ExistingExperiment`` and
    forwards tag/metric/parameter logging to it.
    """

    def __init__(self, disabled, is_existing=False, prev_exp_key=None):
        """
        Handles logging of experiment to comet and also persistence to local file system.
        Supports resumption of stopped experiments.

        Args:
            disabled: Passed through to the comet experiment; also makes
                ``draw_projections`` return early.
            is_existing: Resume a previous experiment instead of creating one.
            prev_exp_key: Key of the experiment to resume; required when
                ``is_existing`` is true.

        Raises:
            ValueError: ``is_existing`` is true but ``prev_exp_key`` is None.
        """

        if not is_existing:
            self.experiment = Experiment(api_key=COMET_API_KEY,
                                         workspace=COMET_WORKSPACE,
                                         project_name=PROJECT_NAME,
                                         disabled=disabled)
        else:
            if prev_exp_key is None:
                raise ValueError("Requested existing experiment, but no key provided")
            print("Continuing existing experiment with key: ", prev_exp_key)
            self.experiment = ExistingExperiment(api_key=COMET_API_KEY,
                                                 workspace=COMET_WORKSPACE,
                                                 project_name=PROJECT_NAME,
                                                 disabled=disabled,
                                                 previous_experiment=prev_exp_key)
        self.disabled = disabled
        # Set by set_name(); save_act_grads() refuses to write without it.
        self.name = None

    def get_experiment_key(self):
        """Return the first 9 characters of the comet experiment key."""
        return self.experiment.get_key()[:9]

    def add_tag(self, tag):
        """Attach ``tag`` to the experiment."""
        self.experiment.add_tag(tag)

    def log_metric(self, name, value, step=None):
        """Log a single metric value, optionally at ``step``."""
        self.experiment.log_metric(name, value, step=step)

    def log_metrics(self, metrics_dict, prefix, step=None):
        """Log every entry of ``metrics_dict`` under ``prefix``."""
        self.experiment.log_metrics(metrics_dict, prefix=prefix, step=step)

    def log_params(self, params_dict):
        """Log a dictionary of parameters."""
        self.experiment.log_parameters(params_dict)

    def set_name(self, name_str):
        """Set the experiment's display name and remember it locally."""
        self.experiment.set_name(name_str)
        self.name = name_str

    def save_act_grads(self, log_dict):
        """Save a dictionary of activation/gradients records to disk.

        The dictionary is pickled to ``./.<name>.record`` (relative to the
        current working directory). If no name has been set via
        ``set_name`` a warning is issued and nothing is written.
        """
        assert isinstance(log_dict, dict)
        if self.name is None:
            warnings.warn("Experiment name not set, not saving")
            return

        # Save the log dictionary
        file_name = f"./.{self.name}.record"
        with open(file_name, 'wb') as f:
            pickle.dump(log_dict, f)

    # TODO: need to rewrite before can be used for MNIST.
    def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None,
                         max_speakers=16):
        """Plot a 2-D UMAP projection of ``embeds`` and log it to comet.

        Args:
            embeds: Embeddings to project; only the first
                ``max_speakers * utterances_per_speaker`` rows are used.
                NOTE(review): assumes rows are grouped by speaker, with
                ``utterances_per_speaker`` consecutive rows each - confirm
                against the caller.
            utterances_per_speaker: Number of embeddings per speaker.
            step: Training step, used in the title and passed to comet.
            out_fpath: When given, the plot is saved there and that file is
                logged as an image; otherwise the pyplot figure is uploaded
                directly.
            max_speakers: Upper bound on plotted speakers (further capped by
                the number of entries in ``colormap``).
        """
        # Check first so a disabled logger does not need umap installed.
        if self.disabled:
            return
        import umap

        # One colormap entry is consumed per speaker index.
        max_speakers = min(max_speakers, len(colormap))
        embeds = embeds[:max_speakers * utterances_per_speaker]

        n_speakers = len(embeds) // utterances_per_speaker
        ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
        colors = [colormap[i] for i in ground_truth]

        reducer = umap.UMAP()
        projected = reducer.fit_transform(embeds)
        plt.scatter(projected[:, 0], projected[:, 1], c=colors)
        plt.gca().set_aspect("equal", "datalim")
        title = "UMAP projection (step %d)" % step
        plt.title(title)
        if out_fpath is not None:
            plt.savefig(out_fpath)
            self.experiment.log_image(out_fpath, step=step)
        else:
            # No file was written, so there is no path to hand to log_image;
            # upload the current pyplot figure before it is cleared.
            self.experiment.log_figure(figure_name=title, step=step)
        plt.clf()
Пример #6
0
            else:
                # NOTE(review): the matching `if` lies above this excerpt.
                # This branch starts a new comet.ml experiment with CLI
                # parsing and automatic metric logging disabled.
                experiment = Experiment(api_key=args.api_key,
                                        project_name='comp767_project',
                                        parse_args=False,
                                        auto_metric_logging=False)
    else:
        # NOTE(review): raised without a message; the condition guarding this
        # branch is not visible here.
        raise Exception
else:
    # Start from scratch: counters are zeroed and epsilon begins at its
    # configured start value. (Presumably the "no checkpoint" path - confirm
    # against the `if` above this excerpt.)
    if args.api_key:
        experiment = Experiment(api_key=args.api_key,
                                project_name='comp767_project',
                                parse_args=False,
                                auto_metric_logging=False)
        # The experiment name is the name of the directory containing this file.
        _, experiment_name = split(dirname(realpath(__file__)))
        experiment.log_other('experiment_name', experiment_name)
        experiment.log_parameters(HYPERPARAMETERS)
    # NOTE(review): without an api_key this branch does not bind `experiment`;
    # verify later uses are guarded.
    num_steps = 0
    best_score = 0
    last_evaluation = 0
    epsilon = HYPERPARAMETERS['epsilon_start']
# Report the counters training starts from.
_status_template = (
    '=> num_steps: {}, best_score: {}, epsilon: {}, last_evaluation: {}\n')
print(_status_template.format(num_steps, best_score, epsilon,
                              last_evaluation))

# The target network starts as a copy of the policy network's weights and is
# put in eval mode.
target_net = DQN(env.action_space.n)
target_net = target_net.to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Wall-clock reference taken just before the replay memory is created.
start_time = time.time()
memory = ReplayMemory(HYPERPARAMETERS['memory_size'])
while True:
    observation = reset(env)