def load_path(self, artifact, manifest_entry, local=False):
    self.init_gcs()
    bucket, key = self._parse_uri(manifest_entry.ref)
    version = manifest_entry.extra.get('versionID')

    extra_args = {}
    obj = None
    # First attempt to get the generation specified, this will return None if versioning is not enabled
    if version is not None:
        obj = self._client.bucket(bucket).get_blob(key, generation=version)

    if obj is None:
        # Object versioning is disabled on the bucket, so just get
        # the latest version and make sure the MD5 matches.
        obj = self._client.bucket(bucket).get_blob(key)
        if obj is None:
            raise ValueError('Unable to download object %s with generation %s' %
                             (manifest_entry.ref, version))
        md5 = obj.md5_hash
        if md5 != manifest_entry.digest:
            raise ValueError('Digest mismatch for object %s: expected %s but found %s' %
                             (manifest_entry.ref, manifest_entry.digest, md5))

    if not local:
        return manifest_entry.ref

    path = '%s/%s' % (artifact.cache_dir, manifest_entry.path)
    # TODO: We only have etag for this file, so we can't compare to an md5 to skip
    # downloading. Switching to object caching (caching files by their digest instead
    # of file name), this would work. Or we can store a list of known etags for local
    # files.
    util.mkdir_exists_ok(os.path.dirname(path))
    obj.download_to_filename(path)
    return path
def transform(images, out_dir, fname):
    """
    Combines a list of images into a single sprite returning meta information
    """
    from PIL import Image as PILImage
    base = os.path.join(out_dir, "media", "images")
    width, height = images[0].image.size
    if len(images) > MAX_IMAGES:
        logging.warn(
            "The maximum number of images to store per step is %i." % MAX_IMAGES)
    sprite = PILImage.new(
        mode='RGB',
        size=(width * len(images), height),
        color=(0, 0, 0, 0))
    for i, image in enumerate(images[:MAX_IMAGES]):
        location = width * i
        sprite.paste(image.image, (location, 0))
    util.mkdir_exists_ok(base)
    sprite.save(os.path.join(base, fname), transparency=0)
    meta = {
        "width": width,
        "height": height,
        "count": len(images),
        "_type": "images"
    }
    captions = Image.captions(images[:MAX_IMAGES])
    if captions:
        meta["captions"] = captions
    return meta
def check_etag_obj_path(self, etag, size):
    path = os.path.join(self._cache_dir, "obj", "etag", etag[:2], etag[2:])
    opener = ArtifactsCache._cache_opener(path)
    if os.path.isfile(path) and os.path.getsize(path) == size:
        return path, True, opener
    util.mkdir_exists_ok(os.path.dirname(path))
    return path, False, opener
def seq_to_json(cls, seq, run, key, step):
    audio_list = list(seq)
    for audio in audio_list:
        if not audio.is_bound():
            audio.bind_to_run(run, key, step)

    sf = util.get_module(
        "soundfile",
        required="wandb.Audio requires the soundfile package. To get it, run: pip install soundfile")
    base_path = os.path.join(run.dir, "media", "audio")
    util.mkdir_exists_ok(base_path)
    meta = {
        "_type": "audio",
        "count": len(audio_list),
        'audio': [a.to_json(run) for a in audio_list],
    }
    sample_rates = cls.sample_rates(audio_list)
    if sample_rates:
        meta["sampleRates"] = sample_rates
    durations = cls.durations(audio_list)
    if durations:
        meta["durations"] = durations
    captions = cls.captions(audio_list)
    if captions:
        meta["captions"] = captions
    return meta
def test_save_live_glob_multi_write(
    mocked_run, mock_server, sender, start_backend, stop_backend,
):
    start_backend()
    sender.publish_files({"files": [("checkpoints/*", "live")]})
    mkdir_exists_ok(os.path.join(mocked_run.dir, "checkpoints"))
    test_file_1 = os.path.join(mocked_run.dir, "checkpoints", "test_1.txt")
    test_file_2 = os.path.join(mocked_run.dir, "checkpoints", "test_2.txt")
    with open(test_file_1, "w") as f:
        f.write("TEST TEST")
    time.sleep(1.5)
    with open(test_file_1, "w") as f:
        f.write("TEST TEST TEST TEST")
    # File system polling happens every second
    time.sleep(1.5)
    with open(test_file_2, "w") as f:
        f.write("TEST TEST TEST TEST")
    with open(test_file_1, "w") as f:
        f.write("TEST TEST TEST TEST TEST TEST")
    stop_backend()
    assert len(mock_server.ctx["storage?file=checkpoints/test_1.txt"]) == 3
    assert len(mock_server.ctx["storage?file=checkpoints/test_2.txt"]) == 1
def test_settings(test_dir, mocker, live_mock_server):
    """Settings object for tests"""
    # TODO: likely not the right thing to do, we shouldn't be setting this
    wandb._IS_INTERNAL_PROCESS = False
    wandb.wandb_sdk.wandb_run.EXIT_TIMEOUT = 15
    wandb.wandb_sdk.wandb_setup._WandbSetup.instance = None
    wandb_dir = os.path.join(test_dir, "wandb")
    mkdir_exists_ok(wandb_dir)
    # root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    settings = wandb.Settings(
        _start_datetime=datetime.datetime.now(),
        _start_time=time.time(),
        api_key=DUMMY_API_KEY,
        base_url=live_mock_server.base_url,
        console="off",
        host="test",
        project="test",
        root_dir=test_dir,
        run_id=wandb.util.generate_id(),
        save_code=False,
    )
    yield settings
    # Just in case someone forgets to join in tests ...well, please don't!
    if wandb.run is not None:
        wandb.run.finish()
def download(self, replace=False, root="."):
    """Downloads a file previously saved by a run from the wandb server.

    Args:
        replace (boolean): If `True`, download will overwrite a local file
            if it exists. Defaults to `False`.
        root (str): Local directory to save the file. Defaults to ".".

    Raises:
        `ValueError` if file already exists and replace=False
    """
    response = requests.get(self._attrs["url"], auth=("api", Api().api_key),
                            stream=True, timeout=5)
    response.raise_for_status()
    path = os.path.join(root, self._attrs["name"])
    if os.path.exists(path) and not replace:
        raise ValueError("File already exists, pass replace=True to overwrite")
    if "/" in path:
        dir = "/".join(path.split("/")[0:-1])
        util.mkdir_exists_ok(dir)
    with open(path, "wb") as file:
        for data in response.iter_content(chunk_size=1024):
            file.write(data)
    return open(path, "r")
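# A minimal usage sketch (not from the source) for the download() method above, going
# through the wandb public Api; the "entity/project/run_id" path is a hypothetical placeholder.
import wandb

api = wandb.Api()
run = api.run("entity/project/run_id")  # hypothetical run path
for f in run.files():
    # Download each file into the current directory, overwriting any local copy.
    f.download(root=".", replace=True)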
def check_md5_obj_path(self, b64_md5, size):
    hex_md5 = util.bytes_to_hex(base64.b64decode(b64_md5))
    path = os.path.join(self._cache_dir, "obj", "md5", hex_md5[:2], hex_md5[2:])
    if os.path.isfile(path) and os.path.getsize(path) == size:
        return path, True
    util.mkdir_exists_ok(os.path.dirname(path))
    return path, False
def test_save_live_glob_multi_write(mocked_run, mock_server, internal_sender,
                                    start_backend, stop_backend):
    start_backend()
    internal_sender.publish_files({"files": [("checkpoints/*", "live")]})
    mkdir_exists_ok(os.path.join(mocked_run.dir, "checkpoints"))
    test_file_1 = os.path.join(mocked_run.dir, "checkpoints", "test_1.txt")
    test_file_2 = os.path.join(mocked_run.dir, "checkpoints", "test_2.txt")
    # To debug this test, add some prints to the dir_watcher.py _on_file_* handlers
    print("Wrote file 1")
    with open(test_file_1, "w") as f:
        f.write("TEST TEST")
    time.sleep(2)
    print("Wrote file 1 2nd time")
    with open(test_file_1, "w") as f:
        f.write("TEST TEST TEST TEST")
    # File system polling happens every second
    time.sleep(1.5)
    print("Wrote file 2")
    with open(test_file_2, "w") as f:
        f.write("TEST TEST TEST TEST")
    print("Wrote file 1 3rd time")
    with open(test_file_1, "w") as f:
        f.write("TEST TEST TEST TEST TEST TEST")
    print("Stopping backend")
    stop_backend()
    print("Backend stopped")
    print("CTX:", [(k, v) for k, v in mock_server.ctx.items() if k.startswith("storage")])
    assert len(mock_server.ctx["storage?file=checkpoints/test_1.txt"]) == 3
    assert len(mock_server.ctx["storage?file=checkpoints/test_2.txt"]) == 1
def test_save_glob_multi_write(
    mocked_run, mock_server, sender, start_backend, stop_backend,
):
    start_backend()
    sender.publish_files({"files": [("checkpoints/*", "now")]})
    mkdir_exists_ok(os.path.join(mocked_run.dir, "checkpoints"))
    test_file_1 = os.path.join(mocked_run.dir, "checkpoints", "test_1.txt")
    test_file_2 = os.path.join(mocked_run.dir, "checkpoints", "test_2.txt")
    print("Wrote file 1")
    with open(test_file_1, "w") as f:
        f.write("TEST TEST")
    # File system polling happens every second
    time.sleep(1.5)
    print("Wrote file 2")
    with open(test_file_2, "w") as f:
        f.write("TEST TEST TEST TEST")
    time.sleep(1.5)
    print("Stopping backend")
    stop_backend()
    print("Backend stopped")
    print("CTX", [(k, v) for k, v in mock_server.ctx.items() if k.startswith("storage")])
    assert len(mock_server.ctx["storage?file=checkpoints/test_1.txt"]) == 1
    assert len(mock_server.ctx["storage?file=checkpoints/test_2.txt"]) == 1
def transform(audio_list, out_dir, key, step):
    if len(audio_list) > Audio.MAX_AUDIO_COUNT:
        logging.warn(
            "The maximum number of audio files to store per step is %i." % Audio.MAX_AUDIO_COUNT)
    sf = util.get_module(
        "soundfile",
        required="wandb.Audio requires the soundfile package. To get it, run: pip install soundfile")
    base_path = os.path.join(out_dir, "media", "audio")
    util.mkdir_exists_ok(base_path)
    for i, audio in enumerate(audio_list[:Audio.MAX_AUDIO_COUNT]):
        sf.write(
            os.path.join(base_path, "{}_{}_{}.wav".format(key, step, i)),
            audio.audio_data, audio.sample_rate)
    meta = {
        "_type": "audio",
        "count": min(len(audio_list), Audio.MAX_AUDIO_COUNT)
    }
    sample_rates = Audio.sample_rates(audio_list[:Audio.MAX_AUDIO_COUNT])
    if sample_rates:
        meta["sampleRates"] = sample_rates
    durations = Audio.durations(audio_list[:Audio.MAX_AUDIO_COUNT])
    if durations:
        meta["durations"] = durations
    captions = Audio.captions(audio_list[:Audio.MAX_AUDIO_COUNT])
    if captions:
        meta["captions"] = captions
    return meta
def test_settings(test_dir, mocker):
    """Settings object for tests"""
    # TODO: likely not the right thing to do, we shouldn't be setting this
    wandb._IS_INTERNAL_PROCESS = False
    wandb.wandb_sdk.wandb_run.EXIT_TIMEOUT = 15
    wandb.wandb_sdk.wandb_setup._WandbSetup.instance = None
    wandb_dir = os.path.join(os.getcwd(), "wandb")
    mkdir_exists_ok(wandb_dir)
    # root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    # TODO: consider making a debuggable directory that stays around...
    settings = wandb.Settings(
        _start_time=time.time(),
        base_url="http://localhost",
        root_dir=os.getcwd(),
        save_code=True,
        project="test",
        console="off",
        host="test",
        api_key=DUMMY_API_KEY,
        run_id=wandb.util.generate_id(),
        _start_datetime=datetime.datetime.now(),
    )
    settings.setdefaults()
    yield settings
    # Just in case someone forgets to join in tests
    if wandb.run is not None:
        wandb.run.join()
def local_settings(mocker):
    """Place global settings in an isolated dir"""
    with CliRunner().isolated_filesystem():
        cfg_path = os.path.join(os.getcwd(), ".config", "wandb", "settings")
        mkdir_exists_ok(os.path.join(".config", "wandb"))
        mocker.patch("wandb.old.settings.Settings._global_path", return_value=cfg_path)
        yield
def check_etag_obj_path(self, etag: str, size: int) -> Tuple[str, bool, Callable]:
    path = os.path.join(self._cache_dir, "obj", "etag", etag[:2], etag[2:])
    opener = self._cache_opener(path)
    if os.path.isfile(path) and os.path.getsize(path) == size:
        return path, True, opener
    util.mkdir_exists_ok(os.path.dirname(path))
    return path, False, opener
def termlog(string='', newline=True, repeat=True):
    """Log to standard error with formatting.

    Args:
        string (str, optional): The string to print
        newline (bool, optional): Print a newline at the end of the string
        repeat (bool, optional): If set to False only prints the string once per process
    """
    if string:
        line = '\n'.join(
            ['{}: {}'.format(LOG_STRING, s) for s in string.split('\n')])
    else:
        line = ''
    if not repeat and line in PRINTED_MESSAGES:
        return
    # Repeated line tracking limited to 1k messages
    if len(PRINTED_MESSAGES) < 1000:
        PRINTED_MESSAGES.add(line)
    if os.getenv(env.SILENT):
        from wandb import util
        util.mkdir_exists_ok(os.path.dirname(util.get_log_file_path()))
        with open(util.get_log_file_path(), 'w') as log:
            click.echo(line, file=log, nl=newline)
    else:
        click.echo(line, file=sys.stderr, nl=newline)
def __init__(self, cache_dir):
    self._cache_dir = cache_dir
    util.mkdir_exists_ok(self._cache_dir)
    self._md5_obj_dir = os.path.join(self._cache_dir, "obj", "md5")
    self._etag_obj_dir = os.path.join(self._cache_dir, "obj", "etag")
    self._artifacts_by_id = {}
    self._random = random.Random()
    self._random.seed()
def check_md5_obj_path(self, b64_md5: str, size: int) -> Tuple[str, bool, Callable]:
    hex_md5 = util.bytes_to_hex(base64.b64decode(b64_md5))
    path = os.path.join(self._cache_dir, "obj", "md5", hex_md5[:2], hex_md5[2:])
    opener = self._cache_opener(path)
    if os.path.isfile(path) and os.path.getsize(path) == size:
        return path, True, opener
    util.mkdir_exists_ok(os.path.dirname(path))
    return path, False, opener
def new_file(self, name, mode="w"):
    self._ensure_can_add()
    path = os.path.join(self._artifact_dir.name, name.lstrip("/"))
    if os.path.exists(path):
        raise ValueError('File with name "%s" already exists' % name)
    util.mkdir_exists_ok(os.path.dirname(path))
    self._added_new = True
    return open(path, mode)
def test_log_artifact_simple(runner, wandb_init_run):
    util.mkdir_exists_ok("artsy")
    open("artsy/file1.txt", "w").write("hello")
    open("artsy/file2.txt", "w").write("goodbye")
    with pytest.raises(ValueError):
        wandb.log_artifact("artsy")
    art = wandb.log_artifact("artsy", type="dataset")
    assert art.name == "run-" + wandb_init_run.id + "-artsy"
def test_save_now_relative_path(mocked_run, mock_server, sender, sm, process_q):
    sender.send_files({"files": [("foo/test.txt", "now")]})
    sm.send(process_q.get())
    test_file = os.path.join(mocked_run.dir, "foo", "test.txt")
    mkdir_exists_ok(os.path.dirname(test_file))
    with open(test_file, "w") as f:
        f.write("TEST TEST")
    sm.finish()
    print("DAMN DUDE", mock_server.ctx)
    assert len(mock_server.ctx["storage?file=foo/test.txt"]) == 1
def test_dir(request):
    orig_dir = os.getcwd()
    root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    test_dir = os.path.join(root, "tests", "logs", request.node.name)
    if os.path.exists(test_dir):
        shutil.rmtree(test_dir)
    mkdir_exists_ok(test_dir)
    os.chdir(test_dir)
    # Yield the per-test directory so dependent fixtures can build paths off it
    yield test_dir
    os.chdir(orig_dir)
def test_save_now_relative_path(
    mocked_run, mock_server, sender, start_backend, stop_backend,
):
    start_backend()
    sender.publish_files({"files": [("foo/test.txt", "now")]})
    test_file = os.path.join(mocked_run.dir, "foo", "test.txt")
    mkdir_exists_ok(os.path.dirname(test_file))
    with open(test_file, "w") as f:
        f.write("TEST TEST")
    stop_backend()
    print("DAMN DUDE", mock_server.ctx)
    assert len(mock_server.ctx["storage?file=foo/test.txt"]) == 1
def git_repo(runner):
    with runner.isolated_filesystem():
        r = git.Repo.init(".")
        mkdir_exists_ok("wandb")
        # Because the forked process doesn't use my monkey patch above
        with open("wandb/settings", "w") as f:
            f.write("[default]\nproject: test")
        open("README", "wb").close()
        r.index.add(["README"])
        r.index.commit("Initial commit")
        yield GitRepo(lazy=False)
def test_git_untracked_notebook_env_subdir(monkeypatch, git_repo, mocker):
    mocker.patch('wandb._get_python_type', lambda: "jupyter")
    util.mkdir_exists_ok("sub")
    with open("sub/test.ipynb", "w") as f:
        f.write("{}")
    os.environ[env.NOTEBOOK_NAME] = "sub/test.ipynb"
    meta = Meta(InternalApi())
    assert meta.data["program"] == "sub/test.ipynb"
    assert meta.data["codePath"] == "sub/test.ipynb"
    assert os.path.exists("code/sub/test.ipynb")
    del os.environ[env.NOTEBOOK_NAME]
def seq_to_json(cls, html_list, run, key, step):
    base_path = os.path.join(run.dir, cls.get_media_subdir())
    util.mkdir_exists_ok(base_path)
    for i, h in enumerate(html_list):
        if not h.is_bound():
            h.bind_to_run(run, key, step, id_=i)
    meta = {
        "_type": "html",
        "count": len(html_list),
        'html': [h.to_json(run) for h in html_list]
    }
    return meta
def new_file(self, name: str, mode: str = "w"):
    self._ensure_can_add()
    path = os.path.join(self._artifact_dir.name, name.lstrip("/"))
    if os.path.exists(path):
        raise ValueError('File with name "%s" already exists at "%s"' % (name, path))
    util.mkdir_exists_ok(os.path.dirname(path))
    with util.fsync_open(path, mode) as f:
        yield f
    self.add_file(path, name=name)
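# A hedged usage sketch (not from the source) for new_file() above, assuming the method is
# exposed as a context manager on wandb.Artifact as in the public API; the artifact name,
# type, and file name below are illustrative only.
import wandb

artifact = wandb.Artifact("my-dataset", type="dataset")  # hypothetical names
with artifact.new_file("hello.txt") as f:
    # The file is created under the artifact's staging directory and added to the
    # artifact when the context exits.
    f.write("hello world")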
def _setup_code_program(self): logger.debug("save program starting") program = os.path.join(self.data["root"], os.path.relpath(os.getcwd(), start=self.data["root"]), self.data["program"]) logger.debug("save program starting: {}".format(program)) if os.path.exists(program): relative_path = os.path.relpath(program, start=self.data["root"]) util.mkdir_exists_ok(os.path.join(self.out_dir, "code", os.path.dirname(relative_path))) saved_program = os.path.join(self.out_dir, "code", relative_path) logger.debug("save program saved: {}".format(saved_program)) if not os.path.exists(saved_program): logger.debug("save program") copyfile(program, saved_program) self.data["codePath"] = relative_path
def seq_to_json(cls, videos, run, key, step):
    base_path = os.path.join(run.dir, cls.get_media_subdir())
    util.mkdir_exists_ok(base_path)
    for i, v in enumerate(videos):
        if not v.is_bound():
            v.bind_to_run(run, key, step, id_=i)
    meta = {
        "_type": "videos",
        "count": len(videos),
        'videos': [v.to_json(run) for v in videos],
        "captions": Video.captions(videos)
    }
    return meta
def _setup_code_program(self): logger.debug("scan for untracked program") program = os.path.join(self.data["root"], self.data["program"]) if os.path.exists(program) and self._api.git.is_untracked( self.data["program"]): util.mkdir_exists_ok( os.path.join(self.out_dir, "code", os.path.dirname(self.data["program"]))) saved_program = os.path.join(self.out_dir, "code", self.data["program"]) if not os.path.exists(saved_program): logger.debug("save untracked program") copyfile(program, saved_program) self.data["codeSaved"] = True
def save(glob_str, base_path=None, policy="live"):
    """Ensure all files matching *glob_str* are synced to wandb with the policy specified.

    base_path: the base path to run the glob relative to
    policy:
        live: upload the file as it changes, overwriting the previous version
        end: only upload file when the run ends
    """
    global _saved_files
    if run is None:
        raise ValueError("You must call `wandb.init` before calling save")
    if policy not in ("live", "end"):
        raise ValueError(
            'Only "live" and "end" policies are currently supported.')
    if isinstance(glob_str, bytes):
        glob_str = glob_str.decode('utf-8')
    if not isinstance(glob_str, string_types):
        raise ValueError("Must call wandb.save(glob_str) with glob_str a str")

    if base_path is None:
        base_path = os.path.dirname(glob_str)
    wandb_glob_str = os.path.relpath(glob_str, base_path)
    if "../" in wandb_glob_str:
        raise ValueError("globs can't walk above base_path")
    if (glob_str, base_path, policy) in _saved_files:
        return []
    if glob_str.startswith("gs://") or glob_str.startswith("s3://"):
        termlog("%s is a cloud storage url, can't save file to wandb." % glob_str)
        return []
    run.send_message(
        {"save_policy": {"glob": wandb_glob_str, "policy": policy}})
    files = []
    for path in glob.glob(glob_str):
        file_name = os.path.relpath(path, base_path)
        abs_path = os.path.abspath(path)
        wandb_path = os.path.join(run.dir, file_name)
        util.mkdir_exists_ok(os.path.dirname(wandb_path))
        # We overwrite existing symlinks because namespaces can change in Tensorboard
        if os.path.islink(wandb_path) and abs_path != os.readlink(wandb_path):
            os.remove(wandb_path)
            os.symlink(abs_path, wandb_path)
        elif not os.path.exists(wandb_path):
            os.symlink(abs_path, wandb_path)
        files.append(wandb_path)
    _saved_files.add((glob_str, base_path, policy))
    return files
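# A minimal usage sketch (not from the source) for save() above, assuming a script that has
# already called wandb.init(); the project name and glob pattern are illustrative only.
import wandb

wandb.init(project="demo")  # hypothetical project
# "live" re-uploads matching files each time they change; "end" uploads once when the run ends.
wandb.save("checkpoints/*.ckpt", policy="live")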