def main(config):
    """Calibrate the camera, then undistort the configured target.

    All settings are read from the ``"undistort"`` section of ``config``:
    ``board``, ``framerate``, ``m_corners``, ``n_corners``, ``target``,
    ``do_debug`` and ``do_crop``.

    If ``target`` is a file it is undistorted directly.  If it is a
    directory, every ``*.mp4`` found recursively below it is undistorted,
    except files whose absolute path contains ``"checkerboards"`` or
    ``"undistorted"``.  A target that is neither a file nor a directory
    is silently ignored.

    Raises:
        ValueError: ``target`` is a directory without any eligible video.
    """
    settings = config["undistort"]
    board = expanduser(settings["board"])
    framerate = int(settings["framerate"])
    m_corners = int(settings["m_corners"])
    n_corners = int(settings["n_corners"])
    target = expanduser(settings["target"])
    do_debug = settings["do_debug"]
    do_crop = settings["do_crop"]

    calibration = calibrate_checkerboard(
        board, m_corners, n_corners, framerate=framerate, do_debug=do_debug
    )
    cam_mtx = calibration["cam_mtx"]
    dist = calibration["dist"]

    target_path = Path(target)
    if target_path.is_file():
        undistort(target, cam_mtx, dist, framerate, do_crop=do_crop)
        return
    if not target_path.is_dir():
        return

    # Skip anything that looks like an output of an earlier run.
    vids = []
    for found in target_path.rglob("*.mp4"):
        candidate = str(found.absolute())
        if "checkerboards" in candidate or "undistorted" in candidate:
            continue
        vids.append(candidate)

    if not vids:
        raise ValueError("No '.mp4' videos were found.")

    for vid in vids:
        undistort(vid, cam_mtx, dist, framerate, do_crop=do_crop)
def load_scripts(path):
    """Execute a script file, or every ``*.py`` script inside a directory.

    ``path`` may be a ``pathlib.Path`` or anything convertible with
    ``str``; non-``Path`` values get ``~`` expanded.

    For a directory, the ``*.py`` files directly inside it are loaded in
    order of their second-to-last suffix (so ``b.50.py`` runs before
    ``a.80.py``), ties broken by path.  For a file, the source is compiled
    and executed in ``env.Env.dict`` with ``env.Env['__FILE__']`` set to the
    absolute script path for the duration of the execution.  A script may
    raise ``NotImplementedError`` to signal that it wants to be ignored.
    The path is appended to ``SCRIPTS`` afterwards in either case.

    Raises:
        Exception: ``path`` is neither a file nor a directory.
    """
    if not isinstance(path, pathlib.Path):
        path = pathlib.Path(os.path.expanduser(str(path)))

    if path.is_dir():
        # sort b.50.py before a.80.py
        def priority(script):
            suffixes = script.suffixes
            return [suffixes[-2][1:] if len(suffixes) > 1 else "", script]

        for script in sorted(path.glob('*.py'), key=priority):
            load_scripts(script)
    elif path.is_file():
        with path.open("rt") as f:
            source = f.read()
        code = compile(source, str(path), "exec")
        env.Env['__FILE__'] = str(path.absolute())
        try:
            exec(code, env.Env.dict)
        except NotImplementedError:
            # Script wanted to be ignored
            pass
        finally:
            # Always drop the marker, also when the script bailed out or
            # failed, so it cannot leak into the next script's environment.
            del env.Env['__FILE__']
        SCRIPTS.append(path)
    else:
        raise Exception("neither file nor dir in load_Scripts", path)
def validate_build_directory(parser: argparse.ArgumentParser, arg: str) -> str:
    """Return the absolute POSIX form of ``arg`` if that path exists.

    Args:
        parser: Unused; kept so the function keeps its existing signature.
        arg: Path given on the command line.

    Returns:
        The absolute path with forward slashes.

    Raises:
        argparse.ArgumentError: ``arg`` does not exist.
    """
    path: pathlib.Path = pathlib.Path(arg)
    if path.exists():
        return str(path.absolute().as_posix())
    # ArgumentError takes (argument, message); passing only the message
    # raised TypeError instead of the intended error.
    raise argparse.ArgumentError(
        None, "FATAL: path {} not exists".format(arg))
def addAllFiles(targetDir, sourceDir):
    """Mirror the tree below ``sourceDir`` into ``targetDir``.

    Every entry found recursively under ``sourceDir`` is visited.  For a
    directory, ``ensureFolder`` is called on the matching target path.  For
    anything else, ``ensureFolder`` is called on the target's parent and,
    if the target does not exist yet, ``makeHardLink`` and
    ``addToLinkManifest`` are called with (target, absolute source).

    Both arguments are used with string concatenation, so they are
    presumably ``str`` paths without a trailing separator -- TODO confirm.
    """
    #Get everything recursively.
    rootpath = Path(sourceDir)
    paths = list(rootpath.glob("**/*"))
    print(targetDir)
    print(sourceDir)
    for path in paths:
        # Path of this entry relative to the source root.
        rel = str(path.relative_to(sourceDir))
        if(path.is_dir()):
            ensureFolder(targetDir+slash+rel)
        else:
            # Everything before the last separator is the containing folder.
            ensureFolder((targetDir+slash+rel).rpartition(slash)[0])
            # NOTE(review): nesting reconstructed from a whitespace-mangled
            # source; confirm both calls belong under this existence check.
            if not (Path(targetDir+slash+rel).exists()):
                #in dst, src order
                makeHardLink(targetDir+slash+rel, str(path.absolute()))
                addToLinkManifest(targetDir+slash+rel, str(path.absolute()))
def save(self):
    """Write the playlist to ``<cwd>/data/<self.name>.csv``.

    One row per entry of ``self.objs`` with the columns artist, album and
    title.  The ``data`` directory must already exist.
    """
    csv_name = "{}.csv".format(self.name)
    path = Path.cwd() / 'data' / csv_name
    # newline='' is required by the csv module; without it, platforms that
    # translate newlines end up with an extra blank line after every row.
    with open(path.absolute(), mode='w', newline='') as csv_file:
        playlist_csv = csv.writer(csv_file, delimiter=',')
        playlist_csv.writerows(
            [v.artist, v.album, v.title] for v in self.objs)
def load_dataframe(uri_to_file: str, encoding: str):
    """Read a semicolon-separated file into a DataFrame.

    Quoting is disabled (``csv.QUOTE_NONE``).  If pandas reports the file
    as empty, the error is printed and ``None`` is returned.
    """
    source = Path(uri_to_file).absolute()
    try:
        frame = pd.read_csv(
            source, sep=";", encoding=encoding, quoting=csv.QUOTE_NONE)
    except pd.errors.EmptyDataError as err:
        print(err)
        return None
    return frame
def HandleFile(path):
    """Load a UTF-8 JSON file and pass each top-level item to HandleComponent.

    ``path`` must offer ``absolute()`` (e.g. a ``pathlib.Path``).  The
    absolute path is printed before the file is read.
    """
    print(f"path: {path.absolute()}")
    with open(path.absolute(), mode="r", encoding="utf-8") as inputFile:
        components = json.load(inputFile)
    for entry in components:
        HandleComponent(entry)
def find_dandiset_and_subpath(path: Path) -> tuple[Dandiset, Path]:
    """
    Locate the Dandiset that contains ``path``.

    ``path`` is made absolute and handed to ``Dandiset.find``.  The result
    is the Dandiset together with the absolute path expressed relative to
    the Dandiset's ``path``.

    Raises ``ValueError`` when ``Dandiset.find`` returns ``None``.
    """
    absolute_path = path.absolute()
    dandiset = Dandiset.find(absolute_path)
    if dandiset is not None:
        return (dandiset, absolute_path.relative_to(dandiset.path))
    raise ValueError(f"{absolute_path}: not a Dandiset")
def validate_mount_point(parser: argparse.ArgumentParser, arg: str) -> str:
    """Return the absolute POSIX form of ``arg``, creating it if missing.

    Args:
        parser: Unused; kept so the function keeps its existing signature.
        arg: Mount point given on the command line.

    Returns:
        The absolute path with forward slashes.
    """
    path: pathlib.Path = pathlib.Path(arg)
    abs_path_str: str = str(path.absolute().as_posix())
    if not path.exists():
        # parents=True: a mount point nested under not-yet-existing
        # directories used to fail with FileNotFoundError.
        # exist_ok=True: tolerate another process creating it meanwhile.
        path.mkdir(parents=True, exist_ok=True)
        logging.info(
            "Creating {}, which does not exists.".format(abs_path_str))
    return abs_path_str
def InitializeMunicipalities(path):
    """Fill ``municipalityNumberByName`` from a UTF-8 JSON file.

    The file must contain a ``"countyList"``; each county has a
    ``"municipalityList"`` whose items carry ``"name"`` and
    ``"municipalityNumber"``.  Each name is mapped to its number.
    """
    with open(path.absolute(), mode="r", encoding="utf-8") as inputFile:
        payload = json.load(inputFile)

    for county in payload["countyList"]:
        for entry in county["municipalityList"]:
            municipalityNumberByName[entry["name"]] = entry["municipalityNumber"]
def get_contents(path):
    """Return the text of ``path`` with whitespace normalised.

    A timestamped "Loading <name>" line is printed first.  Trailing
    whitespace is removed from every line, remaining tabs become spaces,
    the lines are re-joined with newlines and the whole text is stripped.
    """
    print(now_time() + "Loading " + path.name)
    with open(path.absolute()) as f:
        raw_lines = f.readlines()

    # Strip first, then replace: trailing tabs vanish, interior tabs
    # become spaces.
    cleaned = (line.rstrip().replace('\t', ' ') for line in raw_lines)
    return '\n'.join(cleaned).strip()
def __call__(self, ctx) -> str:
    """Resolve ``self._path`` to a directory and return it as a string.

    A relative ``self._path`` is interpreted relative to ``ctx._pwd`` and
    made absolute.  A missing directory is created together with its
    parents.

    Raises:
        ConfigurationError: the path exists but is not a directory.
    """
    from pathlib import Path

    target = Path(self._path)
    if not target.is_absolute():
        target = (Path(ctx._pwd) / target).absolute()

    if target.exists():
        if not target.is_dir():
            raise ConfigurationError("'%s' is not a directory" % self._path)
    else:
        target.mkdir(parents=True, exist_ok=True)
    return str(target)
def get_data_path():
    '''Return the project's data directory as a POSIX string ending in "/".

    The decision is based on the resolved current working directory:
    if its path contains "src", the sibling "data" of the working directory
    is used; if it contains "data", the working directory itself is used;
    otherwise "data" below the working directory is used.
    '''
    from pathlib import Path

    here = Path('.').resolve()
    location = here.absolute().as_posix()

    if 'src' in location:
        data_dir = here.parent / 'data'
    elif 'data' in location:
        data_dir = here
    else:
        data_dir = here / 'data'

    return data_dir.absolute().as_posix() + '/'
def __init__(self, name: str, path: Path, count: int, force=True):
    """Set up the token state stored under ``path`` and start watching it.

    Arguments:
        name {str} -- Stored as ``self.name``
        path {Path} -- Directory holding the token files; created
            (with parents) if missing.  "token.info" and "token.lock"
            live directly inside it.
        count {int} -- Number of tokens (overrides previous definitions)
        force -- If the token has already been created, force to write
            the maximum number of tokens
    """
    super().__init__()

    self.path = path
    self.path.mkdir(exist_ok=True, parents=True)

    self.cache: Dict[str, TokenFile] = {}

    self.infopath = path / "token.info"

    # ipc_lock is a fasteners inter-process lock on "token.lock";
    # lock is a plain threading lock.
    self.ipc_lock = fasteners.InterProcessLock(path / "token.lock")
    self.lock = threading.Lock()

    self.name = name

    # Set the new number of tokens
    with self.lock, self.ipc_lock:
        # Get the maximum number of tokens
        # Written only when forced or when no "token.info" exists yet.
        if force or not self.infopath.is_file():
            self.total = count
            self.infopath.write_text(str(count))

        # NOTE(review): nesting reconstructed from a whitespace-mangled
        # source; confirm these two statements run under the locks but
        # outside the ``if``.
        self.timestamp = os.path.getmtime(self.path)
        self._update()

    # Watched path
    self.watchedpath = str(path.absolute())
    self.watcher = ipcom().fswatch(self, self.path, recursive=True)
    logger.info("Watching %s", self.watchedpath)
def upload(
    paths,
    existing="refresh",
    validation="require",
    dandiset_path=None,
    dandi_instance="dandi",
    allow_any_path=False,
    upload_dandiset_metadata=False,
    devel_debug=False,
    jobs=None,
    jobs_per_file=None,
    sync=False,
):
    """Upload files of a local Dandiset to the draft version on the server.

    Parameters
    ----------
    paths:
        Files or directories to upload.  When empty, the whole Dandiset
        (``dandiset.path``) is used.
    existing: str
        What to do when an asset already exists at the same path:
        ``"error"`` raises, ``"skip"`` skips, ``"overwrite"`` re-uploads
        unless the etag matches, ``"refresh"`` re-uploads unless the etag
        matches or the remote modification time is not older than the local
        one, ``"force"`` always re-uploads.
    validation: str
        ``"skip"`` disables pre-upload validation; ``"require"`` skips
        files that have validation errors.
    dandiset_path:
        Passed to ``Dandiset.find`` to locate the Dandiset.
    dandi_instance: str
        Name passed to ``get_instance``; the instance must have an API URL.
    allow_any_path: bool
        Collect files with ``find_files(".*", ...)`` instead of
        ``find_dandi_files``, do not restrict file names, and fall back to
        ``get_default_metadata`` when metadata extraction fails.
    upload_dandiset_metadata: bool
        Push the Dandiset metadata when the metadata file is among the
        paths; otherwise that file is skipped.
    devel_debug: bool
        Process files serially, print every record and re-raise errors.
    jobs:
        ``max_workers`` of the progress table.
    jobs_per_file:
        ``jobs`` argument of ``client.iter_upload``.
    sync: bool
        After uploading, offer to delete server assets that lie under the
        given paths but no longer exist locally.

    Raises
    ------
    RuntimeError
        No Dandiset was found.
    ValueError
        The Dandiset identifier is malformed, or a path lies outside the
        Dandiset.
    NotImplementedError
        A path is neither ``.nwb`` nor the Dandiset metadata file and
        ``allow_any_path`` is false.
    """
    from .dandiapi import DandiAPIClient
    from .dandiset import APIDandiset, Dandiset
    from .support.digests import get_digest

    dandiset = Dandiset.find(dandiset_path)
    if not dandiset:
        raise RuntimeError(
            f"Found no {dandiset_metadata_file} anywhere. "
            "Use 'dandi register', 'download', or 'organize' first"
        )

    instance = get_instance(dandi_instance)
    assert instance.api is not None
    api_url = instance.api

    client = DandiAPIClient(api_url)
    client.dandi_authenticate()

    dandiset = APIDandiset(dandiset.path)  # "cast" to a new API based dandiset

    ds_identifier = dandiset.identifier

    if not re.match(dandiset_identifier_regex, str(ds_identifier)):
        raise ValueError(
            f"Dandiset identifier {ds_identifier} does not follow expected "
            f"convention {dandiset_identifier_regex!r}. Use "
            f"'dandi register' to get a legit identifier"
        )

    from .metadata import get_default_metadata, nwb2asset
    from .pynwb_utils import ignore_benign_pynwb_warnings
    from .support.pyout import naturalsize
    from .utils import find_dandi_files, find_files, path_is_subpath
    from .validate import validate_file

    ignore_benign_pynwb_warnings()  # so validate doesn't whine

    #
    # Treat paths
    #
    if not paths:
        paths = [dandiset.path]
    original_paths = paths

    # Expand and validate all paths -- they should reside within dandiset
    paths = find_files(".*", paths) if allow_any_path else find_dandi_files(paths)
    paths = list(map(Path, paths))
    npaths = len(paths)
    lgr.info(f"Found {npaths} files to consider")
    for path in paths:
        if not (
            allow_any_path
            or path.name == dandiset_metadata_file
            or path.name.endswith(".nwb")
        ):
            raise NotImplementedError(
                f"ATM only .nwb and dandiset.yaml should be in the paths to upload. Got {path}"
            )
        if not path_is_subpath(str(path.absolute()), dandiset.path):
            raise ValueError(f"{path} is not under {dandiset.path}")

    # We will keep a shared set of "being processed" paths so
    # we could limit the number of them until
    #   https://github.com/pyout/pyout/issues/87
    # properly addressed
    process_paths = set()
    from collections import defaultdict

    uploaded_paths = defaultdict(lambda: {"size": 0, "errors": []})

    def skip_file(msg):
        """Build the record that marks a file as skipped with ``msg``."""
        return {"status": "skipped", "message": str(msg)}

    # TODO: we might want to always yield a full record so no field is not
    # provided to pyout to cause it to halt
    def process_path(path, relpath):
        """
        Validate, digest and upload one file, reporting progress.

        Parameters
        ----------
        path: Path
          Non Pure (OS specific) Path
        relpath:
          For location on server.  Will be cast to PurePosixPath

        Yields
        ------
        dict
          Records for pyout
        """
        # Ensure consistent types
        path = Path(path)
        relpath = PurePosixPath(relpath)
        try:
            try:
                path_stat = path.stat()
                yield {"size": path_stat.st_size}
            except FileNotFoundError:
                yield skip_file("ERROR: File not found")
                return
            except Exception as exc:
                # without limiting [:50] it might cause some pyout indigestion
                yield skip_file("ERROR: %s" % str(exc)[:50])
                return

            #
            # Validate first, so we do not bother server at all if not kosher
            #
            # TODO: enable back validation of dandiset.yaml
            if path.name != dandiset_metadata_file and validation != "skip":
                yield {"status": "pre-validating"}
                validation_errors = validate_file(path)
                yield {"errors": len(validation_errors)}
                # TODO: split for dandi, pynwb errors
                if validation_errors:
                    if validation == "require":
                        yield skip_file("failed validation")
                        return
                else:
                    yield {"status": "validated"}
            else:
                # yielding empty causes pyout to get stuck or crash
                # https://github.com/pyout/pyout/issues/91
                # yield {"errors": '',}
                pass

            #
            # Special handling for dandiset.yaml
            # Yarik hates it but that is life for now. TODO
            #
            if path.name == dandiset_metadata_file:
                # TODO This is a temporary measure to avoid breaking web UI
                # dandiset metadata schema assumptions. All edits should happen
                # online.
                if upload_dandiset_metadata:
                    yield {"status": "updating metadata"}
                    client.set_dandiset_metadata(
                        dandiset.identifier, metadata=dandiset.metadata
                    )
                    yield {"status": "updated metadata"}
                else:
                    yield skip_file("should be edited online")
                return

            #
            # Compute checksums
            #
            yield {"status": "digesting"}
            try:
                file_etag = get_digest(path, digest="dandi-etag")
            except Exception as exc:
                yield skip_file("failed to compute digest: %s" % str(exc))
                return

            extant = client.get_asset_bypath(ds_identifier, "draft", str(relpath))
            if extant is not None:
                # The endpoint used to search by paths doesn't include asset
                # metadata, so we need to make another API call:
                metadata = client.get_asset(ds_identifier, "draft", extant["asset_id"])
                local_mtime = ensure_datetime(path_stat.st_mtime)
                remote_mtime_str = metadata.get("blobDateModified")
                d = metadata.get("digest", {})
                if "dandi:dandi-etag" in d:
                    extant_etag = d["dandi:dandi-etag"]
                else:
                    # TODO: Should this error instead?
                    extant_etag = None
                if remote_mtime_str is not None:
                    remote_mtime = ensure_datetime(remote_mtime_str)
                    remote_file_status = (
                        "same"
                        if extant_etag == file_etag and remote_mtime == local_mtime
                        else (
                            "newer"
                            if remote_mtime > local_mtime
                            else ("older" if remote_mtime < local_mtime else "diff")
                        )
                    )
                else:
                    remote_mtime = None
                    remote_file_status = "no mtime"
                exists_msg = f"exists ({remote_file_status})"
                if existing == "error":
                    # as promised -- not gentle at all!
                    raise FileExistsError(exists_msg)
                if existing == "skip":
                    yield skip_file(exists_msg)
                    return
                # Logic below only for overwrite and reupload
                if existing == "overwrite":
                    if extant_etag == file_etag:
                        yield skip_file(exists_msg)
                        return
                elif existing == "refresh":
                    if extant_etag == file_etag:
                        yield skip_file("file exists")
                        return
                    elif remote_mtime is not None and remote_mtime >= local_mtime:
                        yield skip_file(exists_msg)
                        return
                elif existing == "force":
                    pass
                else:
                    raise ValueError(f"invalid value for 'existing': {existing!r}")

                yield {"message": f"{exists_msg} - reuploading"}

            #
            # Extract metadata - delayed since takes time, but is done before
            # actual upload, so we could skip if this fails
            #
            # Extract metadata before actual upload and skip if fails
            # TODO: allow for for non-nwb files to skip this step
            # ad-hoc for dandiset.yaml for now
            yield {"status": "extracting metadata"}
            try:
                asset_metadata = nwb2asset(
                    path, digest=file_etag, digest_type="dandi_etag"
                )
            except Exception as exc:
                lgr.exception("Failed to extract metadata from %s", path)
                if allow_any_path:
                    yield {"status": "failed to extract metadata"}
                    asset_metadata = get_default_metadata(
                        path, digest=file_etag, digest_type="dandi_etag"
                    )
                else:
                    yield skip_file("failed to extract metadata: %s" % str(exc))
                    return
            metadata = asset_metadata.json_dict()
            metadata["path"] = str(relpath)

            #
            # Upload file
            #
            yield {"status": "uploading"}
            validating = False
            for r in client.iter_upload(
                ds_identifier, "draft", metadata, str(path), jobs=jobs_per_file
            ):
                if r["status"] == "uploading":
                    uploaded_paths[str(path)]["size"] = r.pop("current")
                    yield r
                elif r["status"] == "post-validating":
                    # Only yield the first "post-validating" status
                    if not validating:
                        yield r
                        validating = True
                else:
                    yield r
            yield {"status": "done"}

        except Exception as exc:
            if devel_debug:
                raise
            # Custom formatting for some exceptions we know to extract
            # user-meaningful message
            message = str(exc)
            uploaded_paths[str(path)]["errors"].append(message)
            yield {"status": "ERROR", "message": message}
        finally:
            process_paths.remove(str(path))

    # We will again use pyout to provide a neat table summarizing our progress
    # with upload etc
    from .support import pyout as pyouts

    # for the upload speeds we need to provide a custom aggregate
    t0 = time.time()

    def upload_agg(*ignored):
        """Return the overall upload speed so far, or "" if nothing was sent."""
        dt = time.time() - t0
        # to help avoiding dict length changes during upload
        # might be not a proper solution
        # see https://github.com/dandi/dandi-cli/issues/502 for more info
        uploaded_recs = list(uploaded_paths.values())
        total = sum(v["size"] for v in uploaded_recs)
        if not total:
            return ""
        speed = total / dt if dt else 0
        return "%s/s" % naturalsize(speed)

    pyout_style = pyouts.get_style(hide_if_missing=False)
    pyout_style["upload"]["aggregate"] = upload_agg

    rec_fields = ["path", "size", "errors", "upload", "status", "message"]
    out = pyouts.LogSafeTabular(style=pyout_style, columns=rec_fields, max_workers=jobs)

    with out, client.session():
        for path in paths:
            while len(process_paths) >= 10:
                lgr.log(2, "Sleep waiting for some paths to finish processing")
                time.sleep(0.5)

            rec = {"path": str(path)}
            process_paths.add(str(path))

            try:
                relpath = path.absolute().relative_to(dandiset.path)

                rec["path"] = str(relpath)
                if devel_debug:
                    # DEBUG: do serially
                    for v in process_path(path, relpath):
                        print(str(v), flush=True)
                else:
                    rec[tuple(rec_fields[1:])] = process_path(path, relpath)
            except ValueError as exc:
                if "does not start with" in str(exc):
                    # if top_path is not the top path for the path
                    # Provide more concise specific message without path details
                    rec.update(skip_file("must be a child of top path"))
                else:
                    rec.update(skip_file(exc))
            out(rec)

    if sync:
        relpaths = []
        for p in original_paths:
            rp = os.path.relpath(p, dandiset.path)
            relpaths.append("" if rp == "." else rp)
        # os.path.commonprefix takes ONE list argument.  Folding it with
        # reduce() called it with two positional arguments and raised
        # TypeError as soon as more than one path was given.
        path_prefix = os.path.commonprefix(relpaths)
        to_delete = []
        for asset in client.get_dandiset_assets(
            ds_identifier, "draft", path=path_prefix
        ):
            if (
                any(p == "" or path_is_subpath(asset["path"], p) for p in relpaths)
                and not Path(dandiset.path, asset["path"]).exists()
            ):
                to_delete.append(asset["asset_id"])
        if to_delete and click.confirm(
            f"Delete {pluralize(len(to_delete), 'asset')} on server?"
        ):
            for asset_id in to_delete:
                client.delete_asset(ds_identifier, "draft", asset_id)
def _new_upload(
    api_url,
    dandiset,
    paths,
    existing,
    validation,
    dandiset_path,
    allow_any_path,
    upload_dandiset_metadata,
    devel_debug,
):
    """Upload files of a Dandiset to the draft version through the API server.

    Parameters
    ----------
    api_url:
        URL handed to ``DandiAPIClient``.
    dandiset:
        Located Dandiset; re-opened as ``APIDandiset`` from its ``path``.
    paths:
        Files or directories to upload; the whole Dandiset when empty.
    existing: str
        Applies only when an asset with the same path AND the same sha256
        exists: ``"error"`` raises, ``"skip"`` and ``"overwrite"`` skip the
        file, ``"refresh"`` and ``"force"`` upload anyway.
    validation: str
        ``"skip"`` disables validation; ``"require"`` skips files with
        validation errors.
    dandiset_path:
        Not used by this function.
    allow_any_path: bool
        Collect files with ``find_files(".*", ...)``, do not restrict file
        names, and upload with minimal metadata when extraction fails.
    upload_dandiset_metadata: bool
        Push the Dandiset metadata when the metadata file is among the
        paths; otherwise that file is skipped.
    devel_debug: bool
        Process files serially, print every record and re-raise errors.

    Raises
    ------
    ValueError
        The Dandiset identifier is malformed, or a path lies outside the
        Dandiset.
    NotImplementedError
        A path is neither ``.nwb`` nor the Dandiset metadata file and
        ``allow_any_path`` is false.
    """
    from .dandiapi import DandiAPIClient
    from .dandiset import APIDandiset
    from .support.digests import Digester

    client = DandiAPIClient(api_url)
    client.dandi_authenticate()

    dandiset = APIDandiset(dandiset.path)  # "cast" to a new API based dandiset

    ds_identifier = dandiset.identifier
    # this is a path not a girder id

    if not re.match(dandiset_identifier_regex, str(ds_identifier)):
        raise ValueError(
            f"Dandiset identifier {ds_identifier} does not follow expected "
            f"convention {dandiset_identifier_regex!r}. Use "
            f"'dandi register' to get a legit identifier")

    from .metadata import nwb2asset
    from .pynwb_utils import ignore_benign_pynwb_warnings
    from .support.pyout import naturalsize
    from .utils import find_dandi_files, find_files, path_is_subpath
    from .validate import validate_file

    ignore_benign_pynwb_warnings()  # so validate doesn't whine

    #
    # Treat paths
    #
    if not paths:
        paths = [dandiset.path]

    # Expand and validate all paths -- they should reside within dandiset
    paths = find_files(".*", paths) if allow_any_path else find_dandi_files(paths)
    paths = list(map(Path, paths))
    npaths = len(paths)
    lgr.info(f"Found {npaths} files to consider")
    for path in paths:
        if not (allow_any_path or path.name == dandiset_metadata_file
                or path.name.endswith(".nwb")):
            raise NotImplementedError(
                f"ATM only .nwb and dandiset.yaml should be in the paths to upload. Got {path}"
            )
        if not path_is_subpath(str(path.absolute()), dandiset.path):
            raise ValueError(f"{path} is not under {dandiset.path}")

    # We will keep a shared set of "being processed" paths so
    # we could limit the number of them until
    #   https://github.com/pyout/pyout/issues/87
    # properly addressed
    process_paths = set()
    from collections import defaultdict

    uploaded_paths = defaultdict(lambda: {"size": 0, "errors": []})

    def skip_file(msg):
        """Build the record that marks a file as skipped with ``msg``."""
        return {"status": "skipped", "message": str(msg)}

    # TODO: we might want to always yield a full record so no field is not
    # provided to pyout to cause it to halt
    def process_path(path, relpath):
        """
        Digest, validate and upload one file, reporting progress.

        Parameters
        ----------
        path: Path
          Non Pure (OS specific) Path
        relpath:
          For location on server.  Will be cast to PurePosixPath

        Yields
        ------
        dict
          Records for pyout
        """
        # Ensure consistent types
        path = Path(path)
        relpath = PurePosixPath(relpath)
        try:
            try:
                path_stat = path.stat()
                yield {"size": path_stat.st_size}
            except FileNotFoundError:
                yield skip_file("ERROR: File not found")
                return
            except Exception as exc:
                # without limiting [:50] it might cause some pyout indigestion
                yield skip_file("ERROR: %s" % str(exc)[:50])
                return

            #
            # Compute checksums and possible other digests (e.g. for s3, ipfs - TODO)
            #
            yield {"status": "digesting"}
            try:
                # TODO: in theory we could also cache the result, but since it is
                # critical to get correct checksums, safer to just do it all the time.
                # Should typically be faster than upload itself ;-)
                digester = Digester(["sha256"])
                sha256_digest = digester(path)["sha256"]
            except Exception as exc:
                yield skip_file("failed to compute digests: %s" % str(exc))
                return

            extant = client.get_asset_bypath(ds_identifier, "draft", relpath)
            if extant is not None and extant["sha256"] == sha256_digest:
                if existing == "error":
                    # as promised -- not gentle at all!
                    raise FileExistsError("file exists")
                if existing == "skip":
                    yield skip_file("file exists")
                    return
                # Logic below only for overwrite and reupload
                if existing == "overwrite":
                    if extant["sha256"] == sha256_digest:
                        yield skip_file("file exists")
                        return
                elif existing == "refresh":
                    pass
                elif existing == "force":
                    pass
                else:
                    # Say which value was rejected; the bare word "existing"
                    # ended up verbatim in the user-facing table.
                    raise ValueError(f"invalid value for 'existing': {existing!r}")

            #
            # Validate first, so we do not bother server at all if not kosher
            #
            # TODO: enable back validation of dandiset.yaml
            if path.name != dandiset_metadata_file and validation != "skip":
                yield {"status": "validating"}
                validation_errors = validate_file(path)
                yield {"errors": len(validation_errors)}
                # TODO: split for dandi, pynwb errors
                if validation_errors:
                    if validation == "require":
                        yield skip_file("failed validation")
                        return
                else:
                    yield {"status": "validated"}
            else:
                # yielding empty causes pyout to get stuck or crash
                # https://github.com/pyout/pyout/issues/91
                # yield {"errors": '',}
                pass

            #
            # Special handling for dandiset.yaml
            # Yarik hates it but that is life for now. TODO
            #
            if path.name == dandiset_metadata_file:
                # TODO This is a temporary measure to avoid breaking web UI
                # dandiset metadata schema assumptions. All edits should happen
                # online.
                if upload_dandiset_metadata:
                    yield {"status": "updating metadata"}
                    client.set_dandiset_metadata(dandiset.identifier,
                                                 metadata=dandiset.metadata)
                    yield {"status": "updated metadata"}
                else:
                    yield skip_file("should be edited online")
                return

            #
            # Extract metadata - delayed since takes time, but is done before
            # actual upload, so we could skip if this fails
            #
            # Extract metadata before actual upload and skip if fails
            # TODO: allow for for non-nwb files to skip this step
            # ad-hoc for dandiset.yaml for now
            yield {"status": "extracting metadata"}
            try:
                asset_metadata = nwb2asset(path,
                                           digest=sha256_digest,
                                           digest_type="SHA256")
            except Exception as exc:
                # Keep the traceback in the log; the table only has room
                # for a short status.
                lgr.exception("Failed to extract metadata from %s", path)
                if allow_any_path:
                    yield {"status": "failed to extract metadata"}
                    metadata = {
                        "contentSize": os.path.getsize(path),
                        "digest": sha256_digest,
                        "digest_type": "SHA256",
                        # "encodingFormat": # TODO
                    }
                else:
                    yield skip_file("failed to extract metadata: %s" % str(exc))
                    return
            else:
                # We need to convert to a `dict` this way instead of with
                # `.dict()` so that enums will be converted to strings.
                metadata = json.loads(
                    asset_metadata.json(exclude_unset=True, exclude_none=True))

            #
            # Upload file
            #
            yield {"status": "uploading"}
            for r in client.iter_upload(ds_identifier, "draft", str(relpath),
                                        metadata, str(path)):
                if r["status"] == "uploading":
                    uploaded_paths[str(path)]["size"] = r["current"]
                yield r
            yield {"status": "done"}

        except Exception as exc:
            if devel_debug:
                raise
            # Custom formatting for some exceptions we know to extract
            # user-meaningful message
            message = str(exc)
            uploaded_paths[str(path)]["errors"].append(message)
            yield {"status": "ERROR", "message": message}
        finally:
            process_paths.remove(str(path))

    # We will again use pyout to provide a neat table summarizing our progress
    # with upload etc
    import pyout

    from .support import pyout as pyouts

    # for the upload speeds we need to provide a custom aggregate
    t0 = time.time()

    def upload_agg(*ignored):
        """Return the overall upload speed so far, or "" if nothing was sent."""
        dt = time.time() - t0
        total = sum(v["size"] for v in uploaded_paths.values())
        if not total:
            return ""
        speed = total / dt if dt else 0
        return "%s/s" % naturalsize(speed)

    pyout_style = pyouts.get_style(hide_if_missing=False)
    pyout_style["upload"]["aggregate"] = upload_agg

    rec_fields = ["path", "size", "errors", "upload", "status", "message"]
    out = pyout.Tabular(style=pyout_style, columns=rec_fields)

    with out, client.session():
        for path in paths:
            while len(process_paths) >= 10:
                lgr.log(2, "Sleep waiting for some paths to finish processing")
                time.sleep(0.5)

            rec = {"path": str(path)}
            process_paths.add(str(path))

            try:
                relpath = path.absolute().relative_to(dandiset.path)

                rec["path"] = str(relpath)
                if devel_debug:
                    # DEBUG: do serially
                    for v in process_path(path, relpath):
                        print(str(v), flush=True)
                else:
                    rec[tuple(rec_fields[1:])] = process_path(path, relpath)
            except ValueError as exc:
                if "does not start with" in str(exc):
                    # if top_path is not the top path for the path
                    # Provide more concise specific message without path details
                    rec.update(skip_file("must be a child of top path"))
                else:
                    rec.update(skip_file(exc))
            out(rec)
def _scan(self):
    """Collect every ``*.jpg`` below the root folder into ``self.local_images``.

    Files are found recursively, visited in sorted path order and wrapped
    in ``ProcessedImage``.

    Raises:
        RuntimeError: ``self.root_folder`` does not exist.
    """
    root = self.root_folder
    if not os.path.exists(root):
        raise RuntimeError('%s Folder (%s) does not exist' % (self.CLASS_TAG, root))

    jpg_files = sorted(Path(root).rglob('*.jpg'))
    for jpg_file in jpg_files:
        location = os.path.join(self.root_folder, str(jpg_file.absolute()))
        self.local_images.append(ProcessedImage(location))
def get_free_file(path, bytes=True, max_files=100, force_suffix=False, start_suffix=None):
    """
    Open a not-yet-existing file for writing and return its handle and name.

    The file is opened in exclusive-creation mode, so finding a free name
    and claiming it is one step; no other process can grab the name in
    between.  'path' itself is tried first (unless 'force_suffix' is set),
    then 'path' with '_<number>' inserted before the extension, for up to
    'max_files' consecutive numbers.  Missing parent directories are
    created.

    Parameters
    ----------
    path: str | Path
        Desired file name, absolute or relative to the current directory.
    bytes: bool (Default: True)
        True opens the file for binary output ('xb', suitable for
        `numpy.save`), False for text output ('x').
    max_files: int
        Number of suffixed names to try before giving up.
    force_suffix: bool (default False)
        Always use a suffixed name; never try 'path' itself.
    start_suffix: int
        First number to append.  Defaults to 1 when 'force_suffix' is
        set, otherwise to 2.

    Returns
    -------
    filehandle
        Write-only handle of the newly created file.
    pathname: str
        Full name of the file that was opened.

    Raises
    ------
    IOError
        None of the candidate names could be opened.
    """
    # Get a full path
    # TODO: is cwd always what we want here ?
    if isinstance(path, Path):
        pathname = str(path.absolute())
    elif path[0] == '/':
        # path is already a full path name
        pathname = path
    else:
        pathname = os.path.abspath(path)

    if start_suffix is None:
        start_suffix = 1 if force_suffix else 2

    mode = 'xb' if bytes else 'x'

    # Make sure the directory exists
    os.makedirs(os.path.dirname(pathname), exist_ok=True)

    def candidate_names():
        # The plain name comes first unless a suffix is mandatory.
        if not force_suffix:
            yield pathname
        stem, extension = os.path.splitext(pathname)
        for number in range(start_suffix, max_files + start_suffix):
            yield stem + "_" + str(number) + extension

    for candidate in candidate_names():
        try:
            handle = open(candidate, mode=mode)
        except IOError:
            continue
        return handle, candidate

    raise IOError("Number of files with the name '{}' has exceeded limit."
                  .format(path))
def calibrate_checkerboard(board_vid,
                           m_corners,
                           n_corners,
                           framerate=30,
                           do_debug=True):
    """
    Finds internal corners of checkerboards to generate the camera matrix.

    Parameters:
    -----------
    board_vid (str): Path to .mp4 checkerboard video or path to a folder
        containing checkerboard .jpgs.
    m_corners (int): Number of internal corners along the rows of the
        checkerboard
    n_corners (int): Number of internal corners along the columns of the
        checkerboard
    framerate (int): Framerate with which `board_vid` was recorded
    do_debug (bool): If True, asks for confirmation, deletes previous
        outputs, shows a live feed of the labelled checkerboards, and saves
        a directory of the labelled checkerboard .jpgs. Default is True.

    Returns:
    --------
    A dictionary of length 6 that consists of ret, cam_mtx, dist, r_vecs,
    t_vecs from cv2.calibrateCamera() and the mean reprojection error.

    Saves this dictionary as a pickle file called 'cam_calib_results.pkl'
    next to `board_vid`. If this file and 'checkerboards.mp4' already exist,
    the pickle file is read and its dictionary returned instead. In
    addition, saves a video of the labelled checkerboards.

    Raises:
    -------
    ValueError: `board_vid` is a folder without .jpg images, or no
        checkerboard was detected at all.
    """

    board_vid = expanduser(board_vid)

    assert (basename(board_vid) != "checkerboards.mp4"
            ), "Rename 'checkerboards.mp4' to something else!"
    if Path(board_vid).is_file():
        assert (splitext(board_vid)[1] == ".mp4"
                ), "`board_vid` must be an '.mp4' file!"

    output_vid = path.join(dirname(board_vid), "checkerboards.mp4")
    pkl_file = path.join(dirname(board_vid), "cam_calib_results.pkl")

    if do_debug:

        proceed_debug = ask_yes_no(
            f"Debug mode is on, which means the script will actually delete things. "
            f"Previous {basename(output_vid)} and {basename(pkl_file)} outputs will be deleted. "
            f"Continue?"
        )

        if proceed_debug:
            boards_dir = path.join(dirname(board_vid), "checkerboards")
        else:
            exit("Quitting ...")

        # Start from an empty folder.  The folder must be recreated after
        # removal, otherwise cv2.imwrite has nowhere to write the frames.
        if Path(boards_dir).is_dir():
            rmtree(boards_dir)
        mkdir(boards_dir)

        if Path(output_vid).is_file():
            Path(output_vid).unlink()
        if Path(pkl_file).is_file():
            Path(pkl_file).unlink()

    if Path(output_vid).is_file() and Path(pkl_file).is_file():

        print(
            f"{basename(output_vid)} already exists at {dirname(output_vid)}")
        print(f"Reading {basename(pkl_file)} from {dirname(pkl_file)} ...")
        # NOTE: unpickling executes arbitrary code; only load result files
        # that this function wrote itself.
        with open(pkl_file, "rb") as pkl_handle:
            cam_calib_results = pickle.load(pkl_handle)

        msg = f"camera matrix: \n{cam_calib_results['cam_mtx']}\n\ndistortion coefficients: \n{cam_calib_results['dist']}\n\nmean reprojection error: \n{cam_calib_results['mean_reproj_error']}\n"
        print(msg)

        return cam_calib_results

    elif not Path(output_vid).is_file() and not Path(pkl_file).is_file():

        # Define the codec:
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")

        # Set up corner-finding:
        # -----------------------

        # Termination criteria:
        criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30,
                    0.001)

        # Prepare object points like (0,0,0), (1,0,0), (2,0,0) ....,(6,5,0):
        obj_p = np.zeros((n_corners * m_corners, 3), np.float32)
        obj_p[:, :2] = np.mgrid[0:m_corners, 0:n_corners].T.reshape(-1, 2)

        # Arrays to store object points and image points from all the images:
        obj_points = []  # 3d point in real world space
        img_points = []  # 2d points in image plane

        # ------------------------

        i = 0  # number of checkerboards found so far

        if Path(board_vid).is_file():

            cap = cv2.VideoCapture(board_vid)

            out = cv2.VideoWriter(filename=output_vid,
                                  apiPreference=0,
                                  fourcc=fourcc,
                                  fps=int(framerate),
                                  frameSize=(int(cap.get(3)),
                                             int(cap.get(4))),
                                  params=None)

            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            pbar = trange(frame_count)

            for f, _ in enumerate(pbar):

                grabbed, frame = cap.read()
                if not grabbed:
                    # The reported frame count can exceed what is decodable.
                    break

                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

                # Find the checkerboard corners:
                ret, corners = cv2.findChessboardCorners(
                    gray, (m_corners, n_corners), None)

                # If found, add object points, image points (after refining them):
                if ret:

                    obj_points.append(obj_p)

                    # This method increases the accuracy of the identified corners:
                    better_corners = cv2.cornerSubPix(gray, corners, (11, 11),
                                                      (-1, -1), criteria)
                    img_points.append(better_corners)

                    # Draw and display the corners:
                    img = cv2.drawChessboardCorners(frame,
                                                    (m_corners, n_corners),
                                                    better_corners, ret)

                    # Save to video:
                    out.write(img)

                    if do_debug:
                        cv2.imwrite(
                            path.join(boards_dir, f"frame_{i:08d}.jpg"), img)
                        cv2.imshow("checkerboard detected ...", img)
                        if cv2.waitKey(1) & 0xFF == ord("q"):
                            break

                    pbar.set_description(
                        f"Found {i+1} checkerboards in {f+1}/{frame_count} frames"
                    )

                    cv2.waitKey(1)
                    i += 1

            cap.release()

        elif Path(board_vid).is_dir():

            jpgs = [
                str(jpg_path.absolute())
                for jpg_path in Path(board_vid).rglob("*.jpg")
            ]

            if not jpgs:
                raise ValueError("No '.jpg' images were found.")

            jpg_shape = get_img_shape(jpgs[0])  # from first image

            out = cv2.VideoWriter(filename=output_vid,
                                  apiPreference=0,
                                  fourcc=fourcc,
                                  fps=int(framerate),
                                  frameSize=(int(jpg_shape[1]),
                                             int(jpg_shape[0])),
                                  params=None)

            pbar = tqdm(jpgs)

            for f, jpg in enumerate(pbar):

                img = cv2.imread(jpg)
                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

                # Find the checkerboard corners:
                ret, corners = cv2.findChessboardCorners(
                    gray, (m_corners, n_corners), None)

                # If found, add object points, image points (after refining them):
                if ret:

                    obj_points.append(obj_p)

                    # This method increases the accuracy of the identified corners:
                    better_corners = cv2.cornerSubPix(gray, corners, (11, 11),
                                                      (-1, -1), criteria)
                    img_points.append(better_corners)

                    # Draw and display the corners:
                    img = cv2.drawChessboardCorners(img,
                                                    (m_corners, n_corners),
                                                    better_corners, ret)

                    # Save to video:
                    out.write(img)

                    if do_debug:
                        cv2.imwrite(path.join(boards_dir, basename(jpg)), img)
                        cv2.imshow("checkerboard detected ...", img)
                        if cv2.waitKey(1) & 0xFF == ord("q"):
                            break

                    pbar.set_description(
                        f"Found {i+1} checkerboards in {f+1}/{len(jpgs)} frames"
                    )

                    cv2.waitKey(1)
                    i += 1

        # Actually call release(); a bare attribute access never finalised
        # the output video.
        out.release()
        cv2.destroyAllWindows()

        if not obj_points:
            # Nothing to calibrate with; fail with a clear message instead
            # of an obscure error from cv2.calibrateCamera.
            raise ValueError("No checkerboards were detected.")

        # Calibrate:
        print(
            "Computing camera matrix from calibration data. If many checkerboards were found, will take a long while ..."
        )
        ret, cam_mtx, dist, r_vecs, t_vecs = cv2.calibrateCamera(
            obj_points, img_points, gray.shape[::-1], None, None)

        # Get re-projection error:
        total_reproj_error = 0
        for obj_point, img_point, r_vec, t_vec in zip(obj_points, img_points,
                                                      r_vecs, t_vecs):

            img_points_2, _ = cv2.projectPoints(obj_point, r_vec, t_vec,
                                                cam_mtx, dist)
            error = cv2.norm(img_point, img_points_2,
                             cv2.NORM_L2) / len(img_points_2)
            total_reproj_error += np.abs(error)

        mean_reproj_error = total_reproj_error / len(obj_points)

        # Output:
        msg = f"\ncamera matrix: \n{cam_mtx}\n\ndistortion coefficients: \n{dist}\n\nmean reprojection error: \n{mean_reproj_error}\n"
        print(msg)

        cam_calib_results = {
            "ret": ret,
            "cam_mtx": cam_mtx,
            "dist": dist,
            "r_vecs": r_vecs,
            "t_vecs": t_vecs,
            "mean_reproj_error": mean_reproj_error
        }

        with open(pkl_file, "wb") as pkl_handle:
            pickle.dump(cam_calib_results, pkl_handle)

        return cam_calib_results

    else:
        exit(
            f"Only one of {basename(output_vid)} or {basename(pkl_file)} exists at {dirname(board_vid)}. \nPlease delete whichever one exists and re-run."
        )
def upload(
    paths,
    existing="refresh",
    validation="require",
    dandiset_path=None,
    girder_collection=collection_drafts,
    girder_top_folder=None,
    dandi_instance="dandi",
    fake_data=False,  # TODO: not implemented, prune?
    allow_any_path=False,
    upload_dandiset_metadata=False,
    devel_debug=False,
):
    """Upload files of a dandiset to a DANDI instance.

    When the selected instance has no Girder backend
    (``instance.girder is None``) the call is delegated to ``_new_upload``.
    Otherwise every path is processed by the nested ``process_path``
    generator, whose records are rendered in a ``pyout`` table.

    Parameters
    ----------
    paths:
      Paths to upload.  If empty, the dandiset's own path is used.
    existing: str
      Policy for an item which already has a file on the server:
      "error" raises FileExistsError (reported as an ERROR record unless
      `devel_debug`), "skip" skips it, "overwrite" reuploads unless the
      remote status is "same", "refresh" reuploads only if the remote is
      "older", "force" always reuploads.
    validation: str
      "skip" disables validation; "require" skips files which have
      validation errors; any other value only reports the error count.
    dandiset_path:
      Passed to ``Dandiset.find`` to locate the dandiset.
    girder_collection:
      Girder collection to upload into; falls back to `collection_drafts`
      if falsy.
    girder_top_folder:
      Top folder within the collection; defaults to the dandiset identifier.
    dandi_instance: str
      Name handed to ``get_instance``.
    fake_data: bool
      Not used by this function.
    allow_any_path: bool
      Consider all files (not only .nwb and the dandiset metadata file) and
      tolerate metadata extraction failures.
    upload_dandiset_metadata: bool
      Forwarded to ``_new_upload``; not supported for Girder.
    devel_debug: bool
      Process paths serially, print the records and re-raise exceptions.

    Raises
    ------
    RuntimeError
      No dandiset was found.
    NotImplementedError
      `upload_dandiset_metadata` was requested for a Girder instance, or a
      path of an unsupported kind was given while not `allow_any_path`.
    ValueError
      Missing/invalid dandiset identifier, invalid top folder, top folder
      absent on the server, or a path outside of the dandiset.
    SystemExit
      Via ``sys.exit(1)`` if the collection could not be ensured and not
      `devel_debug`.
    """
    from .dandiset import Dandiset
    from . import girder
    from .support.digests import Digester

    dandiset = Dandiset.find(dandiset_path)
    if not dandiset:
        raise RuntimeError(
            f"Found no {dandiset_metadata_file} anywhere. "
            "Use 'dandi register', 'download', or 'organize' first")

    # Should no longer be needed
    # dandiset_path = Path(dandiset_path).resolve()

    instance = get_instance(dandi_instance)
    if instance.girder is None:
        assert instance.api is not None
        return _new_upload(
            instance.api,
            dandiset,
            paths,
            existing,
            validation,
            dandiset_path,
            allow_any_path,
            upload_dandiset_metadata,
            devel_debug,
        )

    if upload_dandiset_metadata:
        raise NotImplementedError(
            "Upload of dandiset metadata to Girder based server is not supported."
        )

    client = girder.get_client(instance.girder)

    # Girder side details:
    if not girder_collection:
        girder_collection = collection_drafts

    if not girder_top_folder:
        # We upload to staging/dandiset_id
        ds_identifier = dandiset.identifier
        if not ds_identifier:
            raise ValueError(
                "No 'identifier' set for the dandiset yet. Use 'dandi register'"
            )
        if not re.match(dandiset_identifier_regex, ds_identifier):
            raise ValueError(
                f"Dandiset identifier {ds_identifier} does not follow expected "
                f"convention {dandiset_identifier_regex!r}. Use "
                f"'dandi register' to get a legit identifier")
        # this is a path not a girder id
        girder_top_folder = ds_identifier
    girder_top_folder = PurePosixPath(girder_top_folder)

    if str(girder_top_folder) in (".", "..", "", "/"):
        raise ValueError(
            f"Got folder {girder_top_folder}, but files cannot be uploaded "
            f"into a collection directly.")

    import multiprocessing
    from .metadata import get_metadata
    from .pynwb_utils import get_object_id, ignore_benign_pynwb_warnings
    from .support.generatorify import generator_from_callback
    from .support.pyout import naturalsize
    from .utils import find_dandi_files, find_files, path_is_subpath
    from .validate import validate_file

    ignore_benign_pynwb_warnings()  # so validate doesn't whine

    try:
        collection_rec = girder.ensure_collection(client, girder_collection)
    except girder.gcl.HttpError as exc:
        if devel_debug:
            raise
        # provide a bit less intimidating error reporting
        lgr.error(
            "Failed to assure presence of the %s collection: %s",
            girder_collection,
            (girder.get_HttpError_response(exc) or {}).get("message", str(exc)),
        )
        sys.exit(1)

    lgr.debug("Working with collection %s", collection_rec)

    try:
        girder.lookup(client, girder_collection, path=girder_top_folder)
    except girder.GirderNotFound:
        raise ValueError(
            f"There is no {girder_top_folder} in {girder_collection}. "
            f"Did you use 'dandi register'?")

    #
    # Treat paths
    #
    if not paths:
        paths = [dandiset.path]

    # Expand and validate all paths -- they should reside within dandiset
    paths = find_files(".*", paths) if allow_any_path else find_dandi_files(paths)
    paths = list(map(Path, paths))
    npaths = len(paths)
    lgr.info(f"Found {npaths} files to consider")
    for path in paths:
        if not (allow_any_path or path.name == dandiset_metadata_file
                or path.name.endswith(".nwb")):
            raise NotImplementedError(
                f"ATM only .nwb and dandiset.yaml should be in the paths to upload. Got {path}"
            )
        if not path_is_subpath(str(path.absolute()), dandiset.path):
            raise ValueError(f"{path} is not under {dandiset.path}")

    # We will keep a shared set of "being processed" paths so
    # we could limit the number of them until
    #   https://github.com/pyout/pyout/issues/87
    # properly addressed
    process_paths = set()
    from collections import defaultdict

    uploaded_paths = defaultdict(lambda: {"size": 0, "errors": []})

    def skip_file(msg):
        """Return a pyout record marking the file as skipped with `msg`."""
        return {"status": "skipped", "message": str(msg)}

    lock = multiprocessing.Lock()

    # TODO: we might want to always yield a full record so no field is not
    # provided to pyout to cause it to halt
    def process_path(path, relpath):
        """Validate, upload and annotate a single file, yielding progress.

        Parameters
        ----------
        path: Path
          Non Pure (OS specific) Path
        relpath:
          For location on Girder.  Will be cast to PurePosixPath

        Yields
        ------
        dict
          Records for pyout
        """
        # Ensure consistent types
        path = Path(path)
        relpath = PurePosixPath(relpath)
        try:
            try:
                path_stat = path.stat()
                yield {"size": path_stat.st_size}
            except FileNotFoundError:
                yield skip_file("ERROR: File not found")
                return
            except Exception as exc:
                # without limiting [:50] it might cause some pyout indigestion
                yield skip_file("ERROR: %s" % str(exc)[:50])
                return

            yield {"status": "checking girder"}

            girder_folder = girder_top_folder / relpath.parent

            # we will add some fields which would help us with deciding to
            # reupload or not
            file_metadata_ = {
                "uploaded_size": path_stat.st_size,
                "uploaded_mtime": ensure_strtime(path_stat.st_mtime),
                # "uploaded_date": None,  # to be filled out upon upload completion
            }

            # A girder delete API target to .delete before uploading a file
            # (e.g. if decided to reupload)
            delete_before_upload = None

            def ensure_item():
                """This function might need to be called twice, e.g. if we
                are to reupload the entire item.

                ATM new versions of the files would create new items since
                the policy is one File per Item
                """
                # Acquire outside of try/finally and remember the outcome:
                # releasing a lock we failed to obtain would either raise
                # or release it on behalf of whoever holds it.
                acquired = lock.acquire(timeout=60)
                if not acquired:
                    lgr.warning(
                        "Timed out waiting for the lock after 60 seconds; "
                        "proceeding without it")
                try:
                    # TODO: we need to make this all thread safe all the way
                    #       until uploading the file since multiple threads would
                    #       create multiple
                    # ATM it even fails with  No such folder: 5e33658d6eb14e0bf49e97d5",
                    # so will first upload one file and then the rest... not sure why
                    # locking doesn't work
                    folder_rec = girder.ensure_folder(
                        client, collection_rec, girder_collection, girder_folder)

                    # Get (if already exists) or create an item
                    item_rec = client.createItem(
                        folder_rec["_id"], name=relpath.name, reuseExisting=True)
                finally:
                    if acquired:
                        lock.release()
                return item_rec

            def ensure_folder():
                """Ensure the remote folder for this file exists; return its record."""
                # Same acquire/release discipline as in ensure_item
                acquired = lock.acquire(timeout=60)
                if not acquired:
                    lgr.warning(
                        "Timed out waiting for the lock after 60 seconds; "
                        "proceeding without it")
                try:
                    folder_rec = girder.ensure_folder(
                        client, collection_rec, girder_collection, girder_folder)
                finally:
                    if acquired:
                        lock.release()
                return folder_rec

            #
            # 1. Validate first, so we do not bother girder at all if not kosher
            #
            # TODO: enable back validation of dandiset.yaml
            if path.name != dandiset_metadata_file and validation != "skip":
                yield {"status": "validating"}
                validation_errors = validate_file(path)
                yield {"errors": len(validation_errors)}
                # TODO: split for dandi, pynwb errors
                if validation_errors:
                    if validation == "require":
                        yield skip_file("failed validation")
                        return
                else:
                    yield {"status": "validated"}
            else:
                # yielding empty causes pyout to get stuck or crash
                # https://github.com/pyout/pyout/issues/91
                # yield {"errors": '',}
                pass

            #
            # Special handling for dandiset.yaml
            # Yarik hates it but that is life for now. TODO
            #
            if path.name == dandiset_metadata_file:
                # TODO This is a temporary measure to avoid breaking web UI
                # dandiset metadata schema assumptions.  All edits should happen
                # online.
                yield skip_file("should be edited online")
                return
                # NOTE: everything below in this branch is unreachable while
                # the temporary `return` above is in place.
                # We need to upload its content as metadata for the entire
                # folder.
                folder_rec = ensure_folder()
                remote_metadata = folder_rec["meta"]
                if remote_metadata.get("dandiset", {}) == dandiset.metadata:
                    yield skip_file("exists (same)")
                else:
                    remote_metadata["dandiset"] = dandiset.metadata
                    yield {"status": "uploading dandiset metadata"}
                    client.addMetadataToFolder(folder_rec["_id"], remote_metadata)
                    yield {"status": "done"}
                # Interrupt -- no file to upload
                return

            #
            # 2. Ensure having an item
            #
            item_rec = ensure_item()

            #
            # 3. Analyze possibly present on the remote files in the item
            #
            file_recs = list(client.listFile(item_rec["_id"]))

            # get metadata and if we have all indications that it is
            # probably the same -- we just skip
            stat_fields = [
                # Care only about mtime, ignore ctime which could change
                "uploaded_mtime",
                "uploaded_size",
            ]
            assert sorted(file_metadata_) == stat_fields
            item_file_metadata_ = {
                k: item_rec.get("meta", {}).get(k, None) for k in stat_fields
            }
            lgr.debug(
                "Files meta: local file: %s remote file: %s",
                file_metadata_,
                item_file_metadata_,
            )

            if item_file_metadata_["uploaded_mtime"]:
                local_mtime = ensure_datetime(file_metadata_["uploaded_mtime"])
                remote_mtime = ensure_datetime(
                    item_file_metadata_.get("uploaded_mtime"))
                remote_file_status = (
                    "same" if (file_metadata_ == item_file_metadata_) else
                    ("newer" if remote_mtime > local_mtime else
                     ("older" if remote_mtime < local_mtime else "diff")))
            else:
                remote_file_status = "no mtime"
            exists_msg = f"exists ({remote_file_status})"

            if file_recs:  # there is a file already
                if len(file_recs) > 1:
                    lgr.debug(
                        f"Item {item_rec} contains multiple files: {file_recs}"
                    )
                if existing == "error":
                    # as promised -- not gentle at all!
                    raise FileExistsError(exists_msg)
                if existing == "skip":
                    yield skip_file(exists_msg)
                    return
                # Logic below only for overwrite and reupload
                if existing == "overwrite":
                    if remote_file_status == "same":
                        yield skip_file(exists_msg)
                        return
                elif existing == "refresh":
                    if not remote_file_status == "older":
                        yield skip_file(exists_msg)
                        return
                elif existing == "force":
                    pass
                else:
                    raise ValueError("existing")

                delete_before_upload = f'/item/{item_rec["_id"]}'

                yield {"message": exists_msg + " - reuploading"}

            #
            # 4. Extract metadata - delayed since takes time, but is done
            #    before actual upload, so we could skip if this fails
            #
            # Extract metadata before actual upload and skip if fails
            # TODO: allow for for non-nwb files to skip this step
            # ad-hoc for dandiset.yaml for now
            if path.name != dandiset_metadata_file:
                yield {"status": "extracting metadata"}
                try:
                    metadata = get_metadata(path)
                except Exception as exc:
                    if allow_any_path:
                        yield {"status": "failed to extract metadata"}
                        metadata = {}
                    else:
                        yield skip_file("failed to extract metadata: %s" % str(exc))
                        if not file_recs:
                            # remove empty item
                            yield {"status": "deleting empty item"}
                            client.delete(f'/item/{item_rec["_id"]}')
                            yield {"status": "deleted empty item"}
                        return

            #
            # ?. Compute checksums and possible other digests (e.g. for s3, ipfs - TODO)
            #
            yield {"status": "digesting"}
            try:
                # TODO: in theory we could also cache the result, but since it is
                # critical to get correct checksums, safer to just do it all the time.
                # Should typically be faster than upload itself ;-)
                digester = Digester(metadata_digests)
                file_metadata_.update(digester(path))
            except Exception as exc:
                yield skip_file("failed to compute digests: %s" % str(exc))
                return

            #
            # 5. Upload file
            #
            # TODO: we could potentially keep new item "hidden" until we are
            #  done with upload, and only then remove old one and replace with
            #  a new one (rename from "hidden" name).
            if delete_before_upload:
                yield {"status": "deleting old"}
                client.delete(delete_before_upload)
                yield {"status": "old deleted"}
                # create a a new item
                item_rec = ensure_item()

            yield {"status": "uploading"}
            # Upload file to an item
            # XXX TODO progress reporting back to pyout is actually tricky
            #     if possible to implement via callback since
            #     callback would need to yield somehow from the context here.
            #     yoh doesn't see how that could be done yet. In the worst
            #     case we would copy uploadFileToItem and _uploadContents
            #     and make them into generators to relay progress instead of
            #     via callback
            # https://stackoverflow.com/questions/9968592/turn-functions-with-a-callback-into-python-generators
            # has some solutions but all IMHO are abit too complex
            for r in generator_from_callback(lambda c: client.uploadFileToItem(
                    item_rec["_id"], str(path), progressCallback=c)):
                upload_perc = 100 * (
                    (r["current"] / r["total"]) if r["total"] else 1.0)
                if girder._DANDI_LOG_GIRDER:
                    girder.lgr.debug(
                        "PROGRESS[%s]: done=%d %%done=%s",
                        str(path),
                        r["current"],
                        upload_perc,
                    )
                uploaded_paths[str(path)]["size"] = r["current"]
                yield {"upload": upload_perc}

            # Get uploaded file id
            file_id, current = client.isFileCurrent(item_rec["_id"], path.name,
                                                    path.absolute())
            if not current:
                yield skip_file("File on server was unexpectedly changed")
                return

            # Compare file size against what download headers report
            # S3 doesn't seem to allow HEAD requests, so we need to instead do
            # a GET with a streaming response and not read the body.
            with client.sendRestRequest("GET",
                                        f"file/{file_id}/download",
                                        jsonResp=False,
                                        stream=True) as r:
                if int(r.headers["Content-Length"]) != path.stat().st_size:
                    yield skip_file(
                        "File size on server does not match local file")
                    return

            #
            # 6. Upload metadata
            #
            metadata_ = {}
            for k, v in metadata.items():
                if v in ("", None):
                    continue  # degenerate, why bother
                # XXX TODO: remove this -- it is only temporary, search should handle
                if isinstance(v, str):
                    metadata_[k] = v.lower()
                elif isinstance(v, datetime):
                    metadata_[k] = ensure_strtime(v)
            # we will add some fields which would help us with deciding to
            # reupload or not
            # .isoformat() would give is8601 representation but I see in girder
            # already
            # session_start_time   1971-01-01 12:00:00+00:00
            # decided to go for .isoformat for internal consistency -- let's see
            file_metadata_["uploaded_datetime"] = ensure_strtime(time.time())
            metadata_.update(file_metadata_)
            metadata_["uploaded_size"] = path_stat.st_size
            metadata_["uploaded_mtime"] = ensure_strtime(path_stat.st_mtime)
            metadata_["uploaded_by"] = "dandi %s" % __version__
            # Also store object_id for the file to help identify changes/moves
            try:
                metadata_["uploaded_nwb_object_id"] = get_object_id(str(path))
            except Exception as exc:
                (lgr.debug if allow_any_path else lgr.warning)(
                    "Failed to read object_id: %s", exc)

            # #
            # # 7. Also set remote file ctime to match local mtime
            # #   since for type "file", Resource has no "updated" field.
            # #   and this could us help to identify changes being done
            # #   to the remote file -- if metadata["uploaded_mtime"]
            # #   differs
            # yield {"status": "setting remote file timestamp"}
            # try:
            #     client.setResourceTimestamp(
            #         file_id, type="file", created=metadata_["uploaded_mtime"]
            #     )
            # except girder.gcl.HttpError as exc:
            #     if devel_debug:
            #         raise
            #     response = girder.get_HttpError_response(exc)
            #     message = response.get("message", str(exc))
            #     yield {"status": "WARNING", "message": message}

            # 7. Upload metadata
            yield {"status": "uploading metadata"}
            client.addMetadataToItem(item_rec["_id"], metadata_)

            yield {"status": "done"}

        except Exception as exc:
            if devel_debug:
                raise
            # Custom formatting for some exceptions we know to extract
            # user-meaningful message
            message = str(exc)
            if isinstance(exc, girder.gcl.HttpError):
                response = girder.get_HttpError_response(exc)
                if "message" in response:
                    message = response["message"]
            uploaded_paths[str(path)]["errors"].append(message)
            yield {"status": "ERROR", "message": message}
        finally:
            # discard (not remove) so that a missing entry cannot raise
            # KeyError here and mask the actual outcome
            process_paths.discard(str(path))

    # We will again use pyout to provide a neat table summarizing our progress
    # with upload etc
    import pyout
    from .support import pyout as pyouts

    # for the upload speeds we need to provide a custom aggregate
    t0 = time.time()

    def upload_agg(*ignored):
        """Return the overall upload speed so far as a string, or "" if nothing was uploaded."""
        dt = time.time() - t0
        total = sum(v["size"] for v in uploaded_paths.values())
        if not total:
            return ""
        speed = total / dt if dt else 0
        return "%s/s" % naturalsize(speed)

    pyout_style = pyouts.get_style(hide_if_missing=False)
    pyout_style["upload"]["aggregate"] = upload_agg

    rec_fields = ["path", "size", "errors", "upload", "status", "message"]
    out = pyout.Tabular(style=pyout_style, columns=rec_fields)

    with out, client.lock_dandiset(dandiset.identifier):
        for path in paths:
            # Throttle: wait until fewer than 10 paths are being processed
            while len(process_paths) >= 10:
                lgr.log(2, "Sleep waiting for some paths to finish processing")
                time.sleep(0.5)

            rec = {"path": str(path)}
            process_paths.add(str(path))

            try:
                relpath = path.absolute().relative_to(dandiset.path)

                rec["path"] = str(relpath)
                if devel_debug:
                    # DEBUG: do serially
                    for v in process_path(path, relpath):
                        print(str(v), flush=True)
                else:
                    rec[tuple(rec_fields[1:])] = process_path(path, relpath)
            except ValueError as exc:
                # process_path's `finally` may never have run for this path
                # (e.g. relative_to failed before the generator was created),
                # so take it out of the "being processed" set here; otherwise
                # such entries accumulate and stall the throttling loop above.
                process_paths.discard(str(path))
                if "does not start with" in str(exc):
                    # if top_path is not the top path for the path
                    # Provide more concise specific message without path details
                    rec.update(skip_file("must be a child of top path"))
                else:
                    rec.update(skip_file(exc))
            out(rec)
def __iter__(self):
    """Yield an opened image for every non-hidden regular file in ``self._path``.

    Entries whose name matches the ``.*`` pattern, and entries which are
    not regular files, are skipped.  Each remaining entry is handed to
    ``Image.open`` using its absolute path as a string.
    """
    for entry in self._path.iterdir():
        # Guard clause: hidden entries first, then anything that is not a file
        if entry.match('.*') or not entry.is_file():
            continue
        absolute_name = str(entry.absolute())
        yield Image.open(absolute_name)
def validate_disk_file(parser: argparse, arg: str) -> str:
    """Return `arg` as an absolute path string with forward slashes.

    The path is made absolute relative to the current working directory.
    No check is made that the file exists.  `parser` is not used.
    """
    absolute = pathlib.Path(arg).absolute()
    return str(absolute.as_posix())
def _open(self, article: Optional[Article]):
    """Open a page in the web browser.

    With a truthy `article` the page is the one returned by
    ``self.article_path(article)``; otherwise it is the index page for
    ``self.project.articles.root``.  The path is made absolute and passed
    to ``webbrowser.open`` as a URI.
    """
    target = (
        self.article_path(article)
        if article
        else self.index_path(self.project.articles.root)
    )
    uri = target.absolute().as_uri()
    webbrowser.open(uri)
def process_path(path, relpath):
    """Validate, upload and annotate a single file, yielding progress.

    This generator relies on names which are not defined within it
    (e.g. `client`, `lock`, `girder`, `skip_file`, `existing`,
    `validation`, `process_paths`, `uploaded_paths`); they must be
    available from the surrounding scope.

    Parameters
    ----------
    path: Path
      Non Pure (OS specific) Path
    relpath:
      For location on Girder.  Will be cast to PurePosixPath

    Yields
    ------
    dict
      Records for pyout
    """
    # Ensure consistent types
    path = Path(path)
    relpath = PurePosixPath(relpath)
    try:
        try:
            path_stat = path.stat()
            yield {"size": path_stat.st_size}
        except FileNotFoundError:
            yield skip_file("ERROR: File not found")
            return
        except Exception as exc:
            # without limiting [:50] it might cause some pyout indigestion
            yield skip_file("ERROR: %s" % str(exc)[:50])
            return

        yield {"status": "checking girder"}

        girder_folder = girder_top_folder / relpath.parent

        # we will add some fields which would help us with deciding to
        # reupload or not
        file_metadata_ = {
            "uploaded_size": path_stat.st_size,
            "uploaded_mtime": ensure_strtime(path_stat.st_mtime),
            # "uploaded_date": None,  # to be filled out upon upload completion
        }

        # A girder delete API target to .delete before uploading a file
        # (e.g. if decided to reupload)
        delete_before_upload = None

        def ensure_item():
            """This function might need to be called twice, e.g. if we
            are to reupload the entire item.

            ATM new versions of the files would create new items since
            the policy is one File per Item
            """
            # Acquire outside of try/finally and remember the outcome:
            # releasing a lock we failed to obtain would either raise
            # or release it on behalf of whoever holds it.
            acquired = lock.acquire(timeout=60)
            if not acquired:
                lgr.warning(
                    "Timed out waiting for the lock after 60 seconds; "
                    "proceeding without it")
            try:
                # TODO: we need to make this all thread safe all the way
                #       until uploading the file since multiple threads would
                #       create multiple
                # ATM it even fails with  No such folder: 5e33658d6eb14e0bf49e97d5",
                # so will first upload one file and then the rest... not sure why
                # locking doesn't work
                folder_rec = girder.ensure_folder(
                    client, collection_rec, girder_collection, girder_folder)

                # Get (if already exists) or create an item
                item_rec = client.createItem(
                    folder_rec["_id"], name=relpath.name, reuseExisting=True)
            finally:
                if acquired:
                    lock.release()
            return item_rec

        def ensure_folder():
            """Ensure the remote folder for this file exists; return its record."""
            # Same acquire/release discipline as in ensure_item
            acquired = lock.acquire(timeout=60)
            if not acquired:
                lgr.warning(
                    "Timed out waiting for the lock after 60 seconds; "
                    "proceeding without it")
            try:
                folder_rec = girder.ensure_folder(
                    client, collection_rec, girder_collection, girder_folder)
            finally:
                if acquired:
                    lock.release()
            return folder_rec

        #
        # 1. Validate first, so we do not bother girder at all if not kosher
        #
        # TODO: enable back validation of dandiset.yaml
        if path.name != dandiset_metadata_file and validation != "skip":
            yield {"status": "validating"}
            validation_errors = validate_file(path)
            yield {"errors": len(validation_errors)}
            # TODO: split for dandi, pynwb errors
            if validation_errors:
                if validation == "require":
                    yield skip_file("failed validation")
                    return
            else:
                yield {"status": "validated"}
        else:
            # yielding empty causes pyout to get stuck or crash
            # https://github.com/pyout/pyout/issues/91
            # yield {"errors": '',}
            pass

        #
        # Special handling for dandiset.yaml
        # Yarik hates it but that is life for now. TODO
        #
        if path.name == dandiset_metadata_file:
            # TODO This is a temporary measure to avoid breaking web UI
            # dandiset metadata schema assumptions.  All edits should happen
            # online.
            yield skip_file("should be edited online")
            return
            # NOTE: everything below in this branch is unreachable while
            # the temporary `return` above is in place.
            # We need to upload its content as metadata for the entire
            # folder.
            folder_rec = ensure_folder()
            remote_metadata = folder_rec["meta"]
            if remote_metadata.get("dandiset", {}) == dandiset.metadata:
                yield skip_file("exists (same)")
            else:
                remote_metadata["dandiset"] = dandiset.metadata
                yield {"status": "uploading dandiset metadata"}
                client.addMetadataToFolder(folder_rec["_id"], remote_metadata)
                yield {"status": "done"}
            # Interrupt -- no file to upload
            return

        #
        # 2. Ensure having an item
        #
        item_rec = ensure_item()

        #
        # 3. Analyze possibly present on the remote files in the item
        #
        file_recs = list(client.listFile(item_rec["_id"]))

        # get metadata and if we have all indications that it is
        # probably the same -- we just skip
        stat_fields = [
            # Care only about mtime, ignore ctime which could change
            "uploaded_mtime",
            "uploaded_size",
        ]
        assert sorted(file_metadata_) == stat_fields
        item_file_metadata_ = {
            k: item_rec.get("meta", {}).get(k, None) for k in stat_fields
        }
        lgr.debug(
            "Files meta: local file: %s remote file: %s",
            file_metadata_,
            item_file_metadata_,
        )

        if item_file_metadata_["uploaded_mtime"]:
            local_mtime = ensure_datetime(file_metadata_["uploaded_mtime"])
            remote_mtime = ensure_datetime(
                item_file_metadata_.get("uploaded_mtime"))
            remote_file_status = (
                "same" if (file_metadata_ == item_file_metadata_) else
                ("newer" if remote_mtime > local_mtime else
                 ("older" if remote_mtime < local_mtime else "diff")))
        else:
            remote_file_status = "no mtime"
        exists_msg = f"exists ({remote_file_status})"

        if file_recs:  # there is a file already
            if len(file_recs) > 1:
                lgr.debug(
                    f"Item {item_rec} contains multiple files: {file_recs}"
                )
            if existing == "error":
                # as promised -- not gentle at all!
                raise FileExistsError(exists_msg)
            if existing == "skip":
                yield skip_file(exists_msg)
                return
            # Logic below only for overwrite and reupload
            if existing == "overwrite":
                if remote_file_status == "same":
                    yield skip_file(exists_msg)
                    return
            elif existing == "refresh":
                if not remote_file_status == "older":
                    yield skip_file(exists_msg)
                    return
            elif existing == "force":
                pass
            else:
                raise ValueError("existing")

            delete_before_upload = f'/item/{item_rec["_id"]}'

            yield {"message": exists_msg + " - reuploading"}

        #
        # 4. Extract metadata - delayed since takes time, but is done
        #    before actual upload, so we could skip if this fails
        #
        # Extract metadata before actual upload and skip if fails
        # TODO: allow for for non-nwb files to skip this step
        # ad-hoc for dandiset.yaml for now
        if path.name != dandiset_metadata_file:
            yield {"status": "extracting metadata"}
            try:
                metadata = get_metadata(path)
            except Exception as exc:
                if allow_any_path:
                    yield {"status": "failed to extract metadata"}
                    metadata = {}
                else:
                    yield skip_file("failed to extract metadata: %s" % str(exc))
                    if not file_recs:
                        # remove empty item
                        yield {"status": "deleting empty item"}
                        client.delete(f'/item/{item_rec["_id"]}')
                        yield {"status": "deleted empty item"}
                    return

        #
        # ?. Compute checksums and possible other digests (e.g. for s3, ipfs - TODO)
        #
        yield {"status": "digesting"}
        try:
            # TODO: in theory we could also cache the result, but since it is
            # critical to get correct checksums, safer to just do it all the time.
            # Should typically be faster than upload itself ;-)
            digester = Digester(metadata_digests)
            file_metadata_.update(digester(path))
        except Exception as exc:
            yield skip_file("failed to compute digests: %s" % str(exc))
            return

        #
        # 5. Upload file
        #
        # TODO: we could potentially keep new item "hidden" until we are
        #  done with upload, and only then remove old one and replace with
        #  a new one (rename from "hidden" name).
        if delete_before_upload:
            yield {"status": "deleting old"}
            client.delete(delete_before_upload)
            yield {"status": "old deleted"}
            # create a a new item
            item_rec = ensure_item()

        yield {"status": "uploading"}
        # Upload file to an item
        # XXX TODO progress reporting back to pyout is actually tricky
        #     if possible to implement via callback since
        #     callback would need to yield somehow from the context here.
        #     yoh doesn't see how that could be done yet. In the worst
        #     case we would copy uploadFileToItem and _uploadContents
        #     and make them into generators to relay progress instead of
        #     via callback
        # https://stackoverflow.com/questions/9968592/turn-functions-with-a-callback-into-python-generators
        # has some solutions but all IMHO are abit too complex
        for r in generator_from_callback(lambda c: client.uploadFileToItem(
                item_rec["_id"], str(path), progressCallback=c)):
            upload_perc = 100 * (
                (r["current"] / r["total"]) if r["total"] else 1.0)
            if girder._DANDI_LOG_GIRDER:
                girder.lgr.debug(
                    "PROGRESS[%s]: done=%d %%done=%s",
                    str(path),
                    r["current"],
                    upload_perc,
                )
            uploaded_paths[str(path)]["size"] = r["current"]
            yield {"upload": upload_perc}

        # Get uploaded file id
        file_id, current = client.isFileCurrent(item_rec["_id"], path.name,
                                                path.absolute())
        if not current:
            yield skip_file("File on server was unexpectedly changed")
            return

        # Compare file size against what download headers report
        # S3 doesn't seem to allow HEAD requests, so we need to instead do
        # a GET with a streaming response and not read the body.
        with client.sendRestRequest("GET",
                                    f"file/{file_id}/download",
                                    jsonResp=False,
                                    stream=True) as r:
            if int(r.headers["Content-Length"]) != path.stat().st_size:
                yield skip_file(
                    "File size on server does not match local file")
                return

        #
        # 6. Upload metadata
        #
        metadata_ = {}
        for k, v in metadata.items():
            if v in ("", None):
                continue  # degenerate, why bother
            # XXX TODO: remove this -- it is only temporary, search should handle
            if isinstance(v, str):
                metadata_[k] = v.lower()
            elif isinstance(v, datetime):
                metadata_[k] = ensure_strtime(v)
        # we will add some fields which would help us with deciding to
        # reupload or not
        # .isoformat() would give is8601 representation but I see in girder
        # already
        # session_start_time   1971-01-01 12:00:00+00:00
        # decided to go for .isoformat for internal consistency -- let's see
        file_metadata_["uploaded_datetime"] = ensure_strtime(time.time())
        metadata_.update(file_metadata_)
        metadata_["uploaded_size"] = path_stat.st_size
        metadata_["uploaded_mtime"] = ensure_strtime(path_stat.st_mtime)
        metadata_["uploaded_by"] = "dandi %s" % __version__
        # Also store object_id for the file to help identify changes/moves
        try:
            metadata_["uploaded_nwb_object_id"] = get_object_id(str(path))
        except Exception as exc:
            (lgr.debug if allow_any_path else lgr.warning)(
                "Failed to read object_id: %s", exc)

        # #
        # # 7. Also set remote file ctime to match local mtime
        # #   since for type "file", Resource has no "updated" field.
        # #   and this could us help to identify changes being done
        # #   to the remote file -- if metadata["uploaded_mtime"]
        # #   differs
        # yield {"status": "setting remote file timestamp"}
        # try:
        #     client.setResourceTimestamp(
        #         file_id, type="file", created=metadata_["uploaded_mtime"]
        #     )
        # except girder.gcl.HttpError as exc:
        #     if devel_debug:
        #         raise
        #     response = girder.get_HttpError_response(exc)
        #     message = response.get("message", str(exc))
        #     yield {"status": "WARNING", "message": message}

        # 7. Upload metadata
        yield {"status": "uploading metadata"}
        client.addMetadataToItem(item_rec["_id"], metadata_)

        yield {"status": "done"}

    except Exception as exc:
        if devel_debug:
            raise
        # Custom formatting for some exceptions we know to extract
        # user-meaningful message
        message = str(exc)
        if isinstance(exc, girder.gcl.HttpError):
            response = girder.get_HttpError_response(exc)
            if "message" in response:
                message = response["message"]
        uploaded_paths[str(path)]["errors"].append(message)
        yield {"status": "ERROR", "message": message}
    finally:
        # discard (not remove) so that a missing entry cannot raise
        # KeyError here and mask the actual outcome
        process_paths.discard(str(path))