Example #1
def __init__(self, *args, run_eval=False, **kwargs):
    super().__init__(*args, **kwargs)
    self.run_eval = run_eval
    if self.run_eval:
        # Build an 8-CPU Slurm executor dedicated to policy evaluation.
        self.executor = slurm.get_executor("eval_policy",
                                           cpus_per_task=8,
                                           cluster="slurm")
        # 1 hour is enough for eval
        self.executor.update_parameters(slurm_time="1:00:00")
Example #2
# Imports used below; project-local helpers (slurm, EvalWatchdogConfig,
# get_all_checkpoints, submit) are defined elsewhere in the repository.
import argparse
import time

import submitit


def main():
    config = EvalWatchdogConfig.parse_from_command_line()
    # Note: this parser is created but not used anywhere below.
    parser = argparse.ArgumentParser(fromfile_prefix_chars="@")

    # Comma-separated substrings used to select which checkpoints to evaluate.
    contains_filter = config.contains_filter.split(",")
    print(contains_filter)

    # Executor for evaluation jobs; cap each job at 2 hours of wall time.
    executor = slurm.get_executor(job_name="eval",
                                  cpus_per_task=8,
                                  cluster=config.cluster)
    executor.update_parameters(slurm_time="2:00:00")

    # Checkpoints that have already been submitted for evaluation.
    already_run = []

    first_run = True
    while True:
        # Poll the checkpoint directory and submit any checkpoints not seen before.
        checkpoints = get_all_checkpoints(config.dir, contains_filter)
        for checkpoint in checkpoints:
            if checkpoint not in already_run:
                already_run.append(checkpoint)
                if config.debug:
                    print("would run", checkpoint)
                # With new_only set, checkpoints found on the very first scan are skipped.
                if not config.debug and (not first_run or not config.new_only):
                    job = submit(executor, checkpoint, config)
                    if job is not None:
                        if config.cluster in ["local", "debug"]:
                            print(job)
                            # Local/debug jobs are waited on inline until they finish.
                            while True:
                                try:
                                    print(job)
                                    print(job.result())
                                    break
                                except submitit.core.utils.UncompletedJobError as e:
                                    print("waiting", str(e))
                        print("job id: ", job.job_id)
        print("done")
        # A check interval of -1 means "scan once and exit".
        if config.check_interval == -1:
            break
        time.sleep(config.check_interval)
        first_run = False
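
The watchdog above depends on a helper named get_all_checkpoints that is not shown in the snippet. A minimal sketch of what such a helper might look like, assuming it returns checkpoint paths under a directory whose names contain every filter substring (the *.ckpt pattern and the sorted-list return are assumptions, not the project's actual code):

import glob
import os


def get_all_checkpoints(directory, contains_filter):
    # Recursively collect *.ckpt files whose path contains all filter substrings.
    paths = glob.glob(os.path.join(directory, "**", "*.ckpt"), recursive=True)
    return sorted(p for p in paths if all(s in p for s in contains_filter))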
Example #3
    # (snippet begins mid-way through the trainer construction; the earlier
    # arguments, apparently including a list such as callbacks, are omitted)
            ),
        ],
        logger=logger,
        resume_from_checkpoint=config.training.resume_from_checkpoint,
        weights_save_path=logger.log_dir,
    )

    model = module(config)
    trainer.fit(model, datamodule=datamodule)
    return model


if __name__ == "__main__":
    # Resolve the Lightning module and its config from the command line.
    module = lightning_modules.policy.get_module_from_command_line()
    config = module.Config.parse_from_command_line()
    use_slurm = slurm.parse_from_command_line()
    if use_slurm:
        # Submit training as a Slurm job sized according to the training config.
        executor = slurm.get_executor(
            job_name=config.training.experiment_name,
            cpus_per_task=4,
            nodes=config.training.num_nodes,
            gpus=config.training.gpus,
            constraint=config.training.slurm_constraint,
            logs_path=config.training.slurm_logs_path,
            prince=config.training.prince,
        )
        job = executor.submit(main, config)
        print(f"submitted to slurm with job id: {job.job_id}")
    else:
        main(config)
Example #4
    # (snippet starts mid-function; earlier setup of test_dataset, mpur_module,
    # and presumably a default for alternative_module is omitted)
    if config.alternative_checkpoint_path:
        # Optionally load a second module to compare against during evaluation.
        alternative_module = Module.load_from_checkpoint(
            checkpoint_path=config.alternative_checkpoint_path)
        alternative_module.policy_model.diffs = config.diffs

    # Evaluate the policy over the test set, optionally collecting gradients.
    evaluator = PolicyEvaluator(
        test_dataset,
        config.num_processes,
        build_gradients=config.save_gradients,
        enable_logging=True,
    )
    result = evaluator.evaluate(
        mpur_module,
        output_dir=config.output_dir,
        alternative_module=alternative_module,
    )
    print(result["stats"])
    return result


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    config = EvalConfig.parse_from_command_line()
    use_slurm = slurm.parse_from_command_line()
    if use_slurm:
        # Positional args: job name "eval" and, matching the keyword form used above, 8 CPUs per task.
        executor = slurm.get_executor("eval", 8)
        job = executor.submit(main, config)
        print(f"submitted to slurm with job id: {job.job_id}")
    else:
        main(config)
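
All four examples obtain their executor from slurm.get_executor, whose implementation is not shown here. A minimal sketch of what such a helper could look like, assuming it is a thin wrapper around submitit.AutoExecutor (the defaults and the logs folder below are illustrative assumptions, not the project's actual code):

import submitit


def get_executor(job_name, cpus_per_task=8, cluster="slurm", logs_path="slurm_logs", **kwargs):
    # AutoExecutor targets Slurm for cluster="slurm" and runs jobs
    # in-process or as local subprocesses for cluster="debug" / "local".
    executor = submitit.AutoExecutor(folder=logs_path, cluster=cluster)
    executor.update_parameters(name=job_name, cpus_per_task=cpus_per_task, **kwargs)
    return executor

Slurm-specific parameters such as slurm_time can then be set through executor.update_parameters, exactly as the examples above do.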