Пример #1
0
 def _set_run_obj(self, run_obj):
     """Store the run object and refresh state that depends on it.

     Keeps ``run_obj`` on the instance, asks the history object to update
     its step, and sets the sentry "user" scope from the run's entity,
     project and URL.
     """
     self._run_obj = run_obj
     # TODO: Update run summary when resuming?
     self.history._update_step()
     # TODO: It feels weird to call this twice..
     entity = run_obj.entity
     project = run_obj.project
     run_url = self._get_run_url()
     sentry_set_scope("user", entity, project, run_url)
Пример #2
0
 def _start_run_threads(self):
     """Create and start the per-run streaming and upload helpers.

     Builds the file stream for the current run, registers a policy for each
     streamed file (the chunked files start from the offsets held in
     ``self._resume_state``), sets the sentry "internal" scope, starts the
     stream and then creates the file pusher and the directory watcher.
     """
     run_id = self._run.run_id
     started_at = self._run.start_time.ToSeconds()
     self._fs = file_stream.FileStreamApi(
         self._api, run_id, started_at, settings=self._api_settings
     )

     # Ensure the streaming policies have the proper offsets
     self._fs.set_file_policy("wandb-summary.json", file_stream.SummaryFilePolicy())
     offset_policies = (
         ("wandb-history.jsonl", file_stream.JsonlFilePolicy, "history"),
         ("wandb-events.jsonl", file_stream.JsonlFilePolicy, "events"),
         ("output.log", file_stream.CRDedupeFilePolicy, "output"),
     )
     for filename, policy_cls, state_key in offset_policies:
         policy = policy_cls(start_chunk_id=self._resume_state[state_key])
         self._fs.set_file_policy(filename, policy)

     scope_tags = dict(
         entity=self._run.entity,
         project=self._run.project,
         email=self._settings.email,
     )
     util.sentry_set_scope("internal", **scope_tags)

     self._fs.start()
     self._pusher = FilePusher(self._api, silent=self._settings.silent)
     self._dir_watcher = DirWatcher(self._settings, self._api, self._pusher)
     logger.info("run started: %s with start time %s", run_id, started_at)
Пример #3
0
    def handle_run(self, data):
        """Upsert the run on the backend and start streaming for it.

        The values returned by the upsert (storage id, display name, project
        and entity names) are copied onto ``data.run`` and onto this object
        whether or not the caller asked for a reply, so the file stream is
        always created with the project/entity reported by the backend.

        Args:
            data: Record whose ``run`` field describes the run. When
                ``data.control.req_resp`` is set, the updated record is put
                on the response queue.
        """
        run = data.run
        run_tags = run.tags[:]

        # build config dict
        config_dict = None
        if run.HasField("config"):
            config_dict = _config_dict_from_proto_list(run.config.update)

        ups = self._api.upsert_run(
            name=run.run_id,
            entity=run.entity or None,
            project=run.project or None,
            group=run.run_group or None,
            job_type=run.job_type or None,
            display_name=run.display_name or None,
            notes=run.notes or None,
            tags=run_tags or None,
            config=config_dict or None,
            sweep_name=run.sweep_id or None,
        )

        # Apply the upsert response unconditionally: the settings handed to
        # the file stream below depend on the resolved project/entity even
        # when nobody is waiting for a reply.
        storage_id = ups.get("id")
        if storage_id:
            data.run.storage_id = storage_id
        display_name = ups.get("displayName")
        if display_name:
            data.run.display_name = display_name
        project = ups.get("project") or {}
        project_name = project.get("name")
        if project_name:
            data.run.project = project_name
            self._project = project_name
        entity = project.get("entity") or {}
        entity_name = entity.get("name")
        if entity_name:
            data.run.entity = entity_name
            self._entity = entity_name

        # Only reply when the caller is waiting for a response.
        if data.control.req_resp:
            self._resp_q.put(data)

        if self._entity is not None:
            self._api_settings["entity"] = self._entity
        if self._project is not None:
            self._api_settings["project"] = self._project
        self._fs = file_stream.FileStreamApi(self._api,
                                             run.run_id,
                                             settings=self._api_settings)
        self._fs.start()
        self._pusher = FilePusher(self._api)
        self._run_id = run.run_id
        sentry_set_scope("internal", run.entity, run.project)
        logger.info("run started: %s", self._run_id)
Пример #4
0
    def __init__(self, config=None, settings=None):
        """Initialise run state, callbacks and the initial config.

        Args:
            config: Optional dict of initial config values. It is copied
                before the ``"_wandb"`` metadata is added, so the caller's
                dict is left untouched.
            settings: Settings object for the run. Despite the ``None``
                default it is required in practice: ``settings.run_id``,
                ``settings.save_code`` and ``settings.program_relpath`` are
                read unconditionally.
        """
        self._config = wandb_config.Config()
        self._config._set_callback(self._config_callback)
        self.summary = wandb_summary.Summary()
        self.summary._set_callback(self._summary_callback)
        self.history = wandb_history.History(self)
        self.history._set_callback(self._history_callback)

        _datatypes_set_callback(self._datatypes_callback)

        self._settings = settings
        self._wl = None
        self._backend = None
        self._reporter = None
        self._data = dict()

        self._entity = None
        self._project = None
        self._group = None
        self._job_type = None
        self._run_id = settings.run_id
        self._start_time = time.time()
        self._starting_step = 0
        self._name = None
        self._notes = None
        self._tags = None

        self._hooks = None
        self._redirect_cb = None
        self._out_redir = None
        self._err_redir = None
        self.stdout_redirector = None
        self.stderr_redirector = None
        self._save_stdout = None
        self._save_stderr = None
        self._stdout_slave_fd = None
        self._stderr_slave_fd = None
        self._exit_code = None
        self._exit_result = None
        self._final_summary = None

        # Pull info from settings
        self._init_from_settings(settings)

        # Initial scope setup for sentry. This might get changed when the
        # actual run comes back.
        sentry_set_scope("user", self._entity, self._project)

        # Returned from backend send_run_sync, set from wandb_init?
        self._run_obj = None

        # Created when the run "starts".
        self._run_status_checker = None

        # Work on copies of the incoming dict and of its "_wandb" entry so
        # the metadata written below never leaks back into the caller's
        # objects.
        config = dict(config) if config else dict()
        wandb_key = "_wandb"
        config[wandb_key] = dict(config.get(wandb_key) or {})
        config[wandb_key]["cli_version"] = wandb.__version__
        if settings.save_code and settings.program_relpath:
            config[wandb_key]["code_path"] = to_forward_slash_path(
                os.path.join("code", settings.program_relpath))
        self._config._update(config)
        self._atexit_cleanup_called = None
        self._use_redirect = True
Пример #5
0
    def handle_run(self, data):
        """Upsert the run described by ``data`` and start or refresh its state.

        On the first run message (``self._run`` is still ``None``) the resume
        status is checked first; a resume error is reported and nothing else
        happens. After a successful upsert the stored run is updated from the
        response, and on that first message the file stream, file pusher,
        directory watcher and ``tb_watcher.TBWatcher`` are created. Later
        messages only upsert and update the stored run.

        Args:
            data: Record carrying the run in ``data.run``. A result is put on
                the response queue only when ``data.control.req_resp`` is set.
        """
        run = data.run
        tags = run.tags[:]
        first_message = self._run is None

        # build config dict and write it to the run's files directory
        run_config = None
        if run.HasField("config"):
            run_config = _config_dict_from_proto_list(run.config.update)
            save_config_file_from_dict(
                os.path.join(self._settings.files_dir, CONFIG_FNAME),
                run_config)

        repo = GitRepo(remote=self._settings.git_remote)

        # Only check resume status on `wandb.init`
        resume_error = self._maybe_setup_resume(run) if first_message else None
        if resume_error is not None:
            if not data.control.req_resp:
                logger.error("Got error in async mode: %s",
                             resume_error.message)
                return
            failure = wandb_internal_pb2.Result(uuid=data.uuid)
            failure.run_result.run.CopyFrom(run)
            failure.run_result.error.CopyFrom(resume_error)
            self._resp_q.put(failure)
            return

        # TODO: we don't check inserted currently, ultimately we should make
        # the upsert know the resume state and fail transactionally
        upserted, _inserted = self._api.upsert_run(
            name=run.run_id,
            entity=run.entity or None,
            project=run.project or None,
            group=run.run_group or None,
            job_type=run.job_type or None,
            display_name=run.display_name or None,
            notes=run.notes or None,
            tags=tags or None,
            config=run_config or None,
            sweep_name=run.sweep_id or None,
            host=run.host or None,
            program_path=self._settings.program or None,
            repo=repo.remote_url,
            commit=repo.last_commit,
        )

        # We subtract the previous runs runtime when resuming
        start_time = run.start_time.ToSeconds() - self._offsets["runtime"]
        self._run = run
        self._run.starting_step = self._offsets["step"]
        self._run.start_time.FromSeconds(start_time)

        # Copy the values reported by the backend onto the stored run.
        storage_id = upserted.get("id")
        if storage_id:
            self._run.storage_id = storage_id
        shown_name = upserted.get("displayName")
        if shown_name:
            self._run.display_name = shown_name
        project_info = upserted.get("project") or {}
        project_name = project_info.get("name")
        if project_name:
            self._run.project = project_name
            self._project = project_name
        entity_info = project_info.get("entity") or {}
        entity_name = entity_info.get("name")
        if entity_name:
            self._run.entity = entity_name
            self._entity = entity_name

        if data.control.req_resp:
            result = wandb_internal_pb2.Result(uuid=data.uuid)
            result.run_result.run.CopyFrom(self._run)
            self._resp_q.put(result)

        if self._entity is not None:
            self._api_settings["entity"] = self._entity
        if self._project is not None:
            self._api_settings["project"] = self._project

        if not first_message:
            logger.info("updated run: %s", self._run.run_id)
            return

        # Only spin up our threads on the first run message
        self._fs = file_stream.FileStreamApi(
            self._api, run.run_id, start_time, settings=self._api_settings)
        # Ensure the streaming policies have the proper offsets
        self._fs.set_file_policy("wandb-summary.json",
                                 file_stream.SummaryFilePolicy())
        offset_policies = (
            ("wandb-history.jsonl", file_stream.JsonlFilePolicy, "history"),
            ("wandb-events.jsonl", file_stream.JsonlFilePolicy, "events"),
            ("output.log", file_stream.CRDedupeFilePolicy, "output"),
        )
        for filename, policy_cls, offset_key in offset_policies:
            policy = policy_cls(start_chunk_id=self._offsets[offset_key])
            self._fs.set_file_policy(filename, policy)
        self._fs.start()
        self._pusher = FilePusher(self._api)
        self._dir_watcher = DirWatcher(self._settings, self._api,
                                       self._pusher)
        self._tb_watcher = tb_watcher.TBWatcher(self._settings, sender=self)
        if self._run_meta:
            self._run_meta.write()
        sentry_set_scope("internal", run.entity, run.project)
        logger.info("run started: %s with start time %s", self._run.run_id,
                    start_time)