def test_save_sets_mod(self, tmp_path):
    path = tmp_path / "doc.json"
    doc = FileMappedDocument(path, mode=0o600)
    doc.save()
    mode = path.stat().st_mode & 0o777
    assert oct(mode) == oct(0o600)
def inferr_from_path(path):
    """Infer as much information as possible from the file."""
    doc = Document.parse_path(path)
    if not doc.date:
        doc.date = datetime.datetime.fromtimestamp(
            path.stat().st_ctime).strftime("%Y-%m-%d")
    return doc
def add(self, path, derived=None, commit=True):
    # type: (EntryType, Optional[Dict[str, Any]], bool) -> None
    stats = path.stat()
    self._add_file(fspath(path), stats.st_size, stats.st_mtime_ns, derived)
    if commit:
        self.commit()
def filesize(path):
    if hasattr(path, 'filesize') and path.filesize is not None:
        return path.filesize
    if path.fp or path.is_url():
        return 0
    st = path.stat()  # vstat(path)
    return st and st.st_size
def tree(path, dirs_only=False, max_depth=0, _depth=0):
    path = Path(path)
    lst = path.lstat()
    is_symlink = stat.S_ISLNK(lst.st_mode)
    st = lst if is_symlink else path.stat()
    is_dir = stat.S_ISDIR(st.st_mode)
    if is_symlink:
        size = 0
    elif is_dir:
        size = functools.reduce(operator.add, [
            tree(p, dirs_only=dirs_only, max_depth=max_depth, _depth=_depth + 1)
            for p in sorted(path.iterdir())
        ], 0)
    else:
        size = lst.st_size
    if (is_dir or not dirs_only) and \
            (not max_depth or _depth <= max_depth):
        p = str(path)
        if is_dir:
            p += os.path.sep
        if is_symlink:
            p += ' -> ' + os.readlink(str(path))
        print('%10s %s' % (format_size(size), p))
    return size
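# Hedged usage sketch for tree() above. The function depends on module-level
# imports (stat, os, functools, operator, pathlib.Path) and on a format_size()
# helper that is not shown here, so a minimal stand-in is defined purely so
# the sketch runs; the real helper may format sizes differently.
import functools
import operator
import os
import stat
from pathlib import Path

def format_size(num_bytes):  # stand-in assumption, not the original helper
    return f"{num_bytes} B"

if __name__ == "__main__":
    # Print only directories, two levels deep, with aggregated sizes.
    tree(".", dirs_only=True, max_depth=2)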
def add_candidates(self):
    if not self.new_target_list():
        return ()
    dirsize = 0
    start = datetime.datetime.now()
    logger.info(f"Walking target list: {self.state.dirlist}")
    gis = ImageSignature()
    for top in self.state.dirlist:
        message = f"Traversing tree at {top} and adding to queue."
        logger.info(message)
        self.status(message)
        top_path = Path(top)
        for path in top_path.rglob("**/*"):
            ext = path.suffix.lower()
            if ext in cfg.settings.image_filetypes:
                size = path.stat().st_size
                dirsize += size
                photo_b = self.get_bytes(path)
                md5sum = hashlib.md5(photo_b).hexdigest()
                # if the MD5 sum is not already in the database:
                im = Image.open(io.BytesIO(photo_b))
                tags = {
                    "cameraMake": im.info['parsed_exif'].get(0x010f, ""),
                    "cameraModel": im.info['parsed_exif'].get(0x0110, ""),
                    "creationTime": im.info['parsed_exif'].get(0x9003, ""),
                    "width": im.width,
                    "height": im.height,
                }
                image_md5 = hashlib.md5(im.tobytes()).hexdigest()
                signature = gis.generate_signature(
                    photo_b, bytestream=True
                ).tolist()
                record = {
                    "src_path": str(path),
                    "size": size,
                    "md5sum": md5sum,
                    "image_md5": image_md5,
                    "signature": signature,
                    "mediaMetadata": tags,
                }
                photos.add(record)
                logger.info(f"Added: {path}")
            else:
                ext = ext.replace(".", "")  # Database can't handle keys starting with a dot
                excluded = self.state.excluded_ext_dict
                if ext in excluded:
                    excluded[ext] += 1
                else:
                    excluded[ext] = 1
                self.state.update(excluded_ext_dict=excluded)
                self.state.save()
    elapsed = datetime.datetime.now() - start
    self.state.modify(
        dirsize=self.state.dirsize + dirsize,
        dirtime=elapsed.seconds + elapsed.microseconds / 1e6,
    )
    return
def __init__(self, str_dir_name):
    dir_name = "."  # dirname(filePath)
    dir_name_new = str_dir_name + '_lg_files'
    dir_name_new2 = str_dir_name + '_lg_files_out'
    if path.exists(dir_name_new):
        shutil.rmtree(dir_name_new)
    if path.exists(dir_name_new2):
        shutil.rmtree(dir_name_new2)
    try:
        stat(dir_name_new)
        stat(dir_name_new2)
    except OSError:
        mkdir(dir_name_new)
        mkdir(dir_name_new2)
def delete_empty_file(local_filename):
    path = Path(local_filename)
    try:
        size = path.stat().st_size
    except FileNotFoundError:
        return
    if size == 0:
        path.unlink()
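# Hedged usage sketch for delete_empty_file() above (assumes
# `from pathlib import Path` is in scope, as the function requires): an empty
# file is removed, a non-empty one is left in place, and a missing path is a
# silent no-op.
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    empty = Path(tmp) / "empty.log"
    empty.touch()
    full = Path(tmp) / "full.log"
    full.write_text("data")
    delete_empty_file(empty)                      # size == 0 -> unlinked
    delete_empty_file(full)                       # size > 0 -> kept
    delete_empty_file(Path(tmp) / "missing.log")  # FileNotFoundError swallowed
    assert not empty.exists() and full.exists()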
def file_id(path: pl.Path) -> bytes:
    stat = path.stat()
    stat_data = f"{stat.st_ino}_{stat.st_size}_{stat.st_mtime}"
    id_sum = hashlib.new('sha1')
    id_sum.update(stat_data.encode('ascii'))
    with path.open(mode="rb") as fh:
        id_sum.update(fh.read())
    return id_sum.digest()
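# Hedged usage sketch for file_id() above (assumes `import hashlib` and
# `import pathlib as pl` as the function requires): the digest combines stat
# data (inode, size, mtime) with the file content, so it changes when the
# content changes.
import pathlib as pl
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    sample = pl.Path(tmp) / "sample.txt"
    sample.write_text("hello")
    before = file_id(sample)
    sample.write_text("hello world")
    assert file_id(sample) != before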
def set_path_readonly(path: Path) -> None:
    if path.is_dir():
        # Need to add
        right = stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP | stat.S_IRUSR
    else:
        # Already in read only
        right = stat.S_IRGRP | stat.S_IRUSR
    if path.stat().st_mode & ~right != 0:
        path.chmod(right)
def get(self, path, only=frozenset(), no=frozenset()):
    # type: (EntryType, FrozenSet[str], FrozenSet[str]) -> tuple
    """Retrieves the latest row based on mandatory information which is
    solely based on the `path`.
    Use `only`/`no` to include/exclude returned fields.
    """
    stats = path.stat()
    return self.get_latest(fspath(path), stats.st_size, stats.st_mtime_ns,
                           ignore_null=True, only=only, no=no)
def path_to_info_hash(self, path):
    # type: (Path, ) -> str
    name = path.name
    size = path.stat().st_size
    try:
        return self.map[(name, size)]
    except KeyError:
        raise NotFound(
            f"Could not find infohash for name={name}, size={size}")
def calc_state_size(self):
    # Should work even for pre-7.4 versions, counting only files and folders
    # related to that version
    result = 0
    for filename in STATE_FILES_TO_COPY:
        path = self.directory / filename
        if path.exists():
            result += path.stat().st_size
    for dirname in STATE_DIRS_TO_COPY:
        path = self.directory / dirname
        for f in path.glob('**/*'):
            result += f.stat().st_size
    return result
def unset_path_readonly(path: Path) -> None:
    if path.is_dir():
        right = (stat.S_IXUSR | stat.S_IRGRP | stat.S_IXGRP
                 | stat.S_IRUSR | stat.S_IWGRP | stat.S_IWUSR)
    else:
        right = stat.S_IRGRP | stat.S_IRUSR | stat.S_IWGRP | stat.S_IWUSR
    if path.stat().st_mode & right != right:
        path.chmod(right)
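# Hedged round-trip sketch for set_path_readonly()/unset_path_readonly() above
# (assumes `import stat` and `from pathlib import Path` as the functions
# require; POSIX permission semantics are assumed).
import stat
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    f = Path(tmp) / "readonly.txt"
    f.write_text("x")
    set_path_readonly(f)
    assert not f.stat().st_mode & stat.S_IWUSR   # owner write bit cleared
    unset_path_readonly(f)
    assert f.stat().st_mode & stat.S_IWUSR       # owner write bit restored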
def get_file_stat(path: Path) -> Tuple[int, Optional[datetime]]:
    """Get size of file in bytes and last modified time stamp."""
    try:
        stats = path.stat()
    except IOError as exc:
        raise ValueError(
            f"Could not retrieve file stat of {path}: {exc}") from exc
    try:
        update_time = datetime.fromtimestamp(stats.st_mtime, tzlocal())
    except (ValueError, OSError, OverflowError):
        update_time = None
    return stats.st_size, update_time
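# Hedged usage sketch for get_file_stat() above (assumes the imports the
# function relies on: `from datetime import datetime`, `from typing import
# Optional, Tuple`, `from dateutil.tz import tzlocal`, `from pathlib import
# Path`).
from pathlib import Path

size, modified = get_file_stat(Path(__file__))
print(f"{size} bytes, last modified {modified}")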
def mdatetime(path, aslocal=False):
    # type: (PathType, bool) -> datetime
    """Returns the last modified date of `path` as a timezone-aware datetime
    object. If `aslocal=True` it will be formatted as local time, and UTC
    otherwise (the default).
    """
    if isinstance(path, (Path, DirEntry)):
        mtime = path.stat().st_mtime
    else:
        mtime = os.stat(path).st_mtime
    return datetime_from_utc_timestamp(mtime, aslocal)
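# Minimal stand-in for the datetime_from_utc_timestamp() helper that
# mdatetime() above calls; the real implementation is not shown in this
# snippet, so this is only a plausible sketch: convert a POSIX timestamp to an
# aware datetime in UTC, or to local time when aslocal is true.
from datetime import datetime, timezone

def datetime_from_utc_timestamp(timestamp, aslocal=False):
    dt = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    return dt.astimezone() if aslocal else dt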
def __file_is_empty(path: Path) -> bool:
    """
    Check whether the input file path is an empty file

    Parameters
    ----------
    path
        Path object containing the input file path

    Returns
    -------
    bool
        whether the input file path is an empty file
    """
    return path.stat().st_size == 0
def delete_files(directory, filenames, files_to_keep=()):
    ensure_overwritable(*[directory.joinpath(f) for f in filenames])
    # We implement the "files to keep" logic using inodes rather than names so
    # we can safely handle case-insensitive filesystems
    inodes_to_keep = set()
    for filename in files_to_keep:
        try:
            stat = directory.joinpath(filename).stat()
            inodes_to_keep.add((stat.st_dev, stat.st_ino))
        except FileNotFoundError:
            pass
    for filename in filenames:
        path = directory / filename
        try:
            stat = path.stat()
        except FileNotFoundError:
            continue
        inode = (stat.st_dev, stat.st_ino)
        if inode not in inodes_to_keep:
            path.unlink()
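# Hedged usage sketch for delete_files() above. `ensure_overwritable` is an
# external helper that is not shown here, so a no-op stand-in is defined
# purely so the sketch runs; the real helper presumably clears read-only bits.
import tempfile
from pathlib import Path

def ensure_overwritable(*paths):  # stand-in assumption, not the real helper
    pass

with tempfile.TemporaryDirectory() as tmp:
    workdir = Path(tmp)
    for name in ("a.txt", "b.txt", "keep.txt"):
        (workdir / name).write_text(name)
    delete_files(workdir, ["a.txt", "b.txt", "keep.txt"],
                 files_to_keep=["keep.txt"])
    assert sorted(p.name for p in workdir.iterdir()) == ["keep.txt"]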
def add_candidates(self):
    self.state.reload()
    if self.state.target == self.state.old_target:
        return
    self.state.modify(old_target=self.state.target)
    message = 'Walking target directories...'
    logger.info(message)
    self.status(message)
    dirsize = 0
    start = datetime.datetime.now()
    self.state.modify(dirlist=list(glob.iglob(self.state.target)))
    logger.info(f"Target list: {self.state.dirlist}")
    for top in self.state.dirlist:
        message = f'Traversing tree at {top} and adding to queue.'
        logger.info(message)
        self.status(message)
        top_path = Path(top)
        for path in top_path.rglob("**/*"):
            ext = path.suffix.lower()
            if ext in cfg.local.image_filetypes:
                size = path.stat().st_size
                dirsize += size
                Queue(src_path=str(path), size=size).save()
            else:
                ext = ext.replace(".", "")  # Database can't handle keys starting with a dot
                excluded = self.state.excluded_ext_dict
                if ext in excluded:
                    excluded[ext] += 1
                else:
                    excluded[ext] = 1
                self.state.update(excluded_ext_dict=excluded)
                self.state.save()
    elapsed = datetime.datetime.now() - start
    self.state.modify(
        dirsize=self.state.dirsize + dirsize,
        dirtime=elapsed.seconds + elapsed.microseconds / 1e6,
    )
    return
def preview(hwd, dir, order='latest'):
    glob = Path(dir).glob('*.jpg')
    if order == 'latest':
        # find latest image in the folder
        image_path = next(iter(glob))
        for path in glob:
            if path.stat().st_mtime > image_path.stat().st_mtime:
                image_path = path
    elif order == 'random':
        # choose random image
        p = list(glob)
        try:
            image_path = str(random.choice(p))
        except IndexError as e:
            # no .jpg files in the folder
            print(e)
            return
    else:
        raise NotImplementedError
    # Delay preview, let image be fully written to disk
    QtCore.QTimer.singleShot(1000, lambda: set_image(hwd, image_path))
def create_file_path(path):
    for base_folder, folder, files in os.walk(path):
        # check for files
        for file in files:
            # create file_path
            file_path = os.path.join(base_folder, file)
            # getting file extension
            file_extension = os.path.splitext(file_path)[1]
            # compare file extension with log_extension
            if log_extension == file_extension:
                # check file properties for set condition:
                # get the date the file itself was last changed
                timestamp = date.fromtimestamp(os.stat(file_path).st_ctime)
                if date.today() == timestamp:
                    # os.remove() returns None, so detect failure via exception
                    try:
                        os.remove(file_path)
                    except OSError:
                        print(f'Unable to delete {file_path}')
                    else:
                        success_alert()
                        print(f'{file_path} removed successfully')
            else:
                print(f'{file_path} is not a log file')
def process_path(path, relpath):
    """
    Parameters
    ----------
    path: Path
      Non Pure (OS specific) Path
    relpath:
      For location on server. Will be cast to PurePosixPath

    Yields
    ------
    dict
      Records for pyout
    """
    # Ensure consistent types
    path = Path(path)
    relpath = PurePosixPath(relpath)
    try:
        try:
            path_stat = path.stat()
            yield {"size": path_stat.st_size}
        except FileNotFoundError:
            yield skip_file("ERROR: File not found")
            return
        except Exception as exc:
            # without limiting [:50] it might cause some pyout indigestion
            yield skip_file("ERROR: %s" % str(exc)[:50])
            return

        #
        # Compute checksums and possibly other digests (e.g. for s3, ipfs - TODO)
        #
        yield {"status": "digesting"}
        try:
            # TODO: in theory we could also cache the result, but since it is
            # critical to get correct checksums, safer to just do it all the
            # time.  Should typically be faster than upload itself ;-)
            digester = Digester(["sha256"])
            sha256_digest = digester(path)["sha256"]
        except Exception as exc:
            yield skip_file("failed to compute digests: %s" % str(exc))
            return

        extant = client.get_asset_bypath(ds_identifier, "draft", relpath)
        if extant is not None and extant["sha256"] == sha256_digest:
            if existing == "error":
                # as promised -- not gentle at all!
                raise FileExistsError("file exists")
            if existing == "skip":
                yield skip_file("file exists")
                return
            # Logic below only for overwrite and reupload
            if existing == "overwrite":
                if extant["sha256"] == sha256_digest:
                    yield skip_file("file exists")
                    return
            elif existing == "refresh":
                pass
            elif existing == "force":
                pass
            else:
                raise ValueError("existing")

        #
        # Validate first, so we do not bother the server at all if not kosher
        #
        # TODO: enable back validation of dandiset.yaml
        if path.name != dandiset_metadata_file and validation != "skip":
            yield {"status": "validating"}
            validation_errors = validate_file(path)
            yield {"errors": len(validation_errors)}
            # TODO: split for dandi, pynwb errors
            if validation_errors:
                if validation == "require":
                    yield skip_file("failed validation")
                    return
            else:
                yield {"status": "validated"}
        else:
            # yielding empty causes pyout to get stuck or crash
            # https://github.com/pyout/pyout/issues/91
            # yield {"errors": '',}
            pass

        #
        # Special handling for dandiset.yaml
        # Yarik hates it but that is life for now. TODO
        #
        if path.name == dandiset_metadata_file:
            # TODO This is a temporary measure to avoid breaking web UI
            # dandiset metadata schema assumptions.  All edits should happen
            # online.
            if upload_dandiset_metadata:
                yield {"status": "updating metadata"}
                client.set_dandiset_metadata(dandiset.identifier,
                                             metadata=dandiset.metadata)
                yield {"status": "updated metadata"}
            else:
                yield skip_file("should be edited online")
            return

        #
        # Extract metadata - delayed since takes time, but is done before
        # actual upload, so we could skip if this fails
        #
        # Extract metadata before actual upload and skip if it fails
        # TODO: allow for non-nwb files to skip this step
        # ad-hoc for dandiset.yaml for now
        yield {"status": "extracting metadata"}
        try:
            asset_metadata = nwb2asset(path, digest=sha256_digest,
                                       digest_type="SHA256")
        except Exception as exc:
            if allow_any_path:
                yield {"status": "failed to extract metadata"}
                metadata = {
                    "contentSize": os.path.getsize(path),
                    "digest": sha256_digest,
                    "digest_type": "SHA256",
                    # "encodingFormat": # TODO
                }
            else:
                yield skip_file("failed to extract metadata: %s" % str(exc))
                return
        else:
            # We need to convert to a `dict` this way instead of with
            # `.dict()` so that enums will be converted to strings.
            metadata = json.loads(
                asset_metadata.json(exclude_unset=True, exclude_none=True))

        #
        # Upload file
        #
        yield {"status": "uploading"}
        for r in client.iter_upload(ds_identifier, "draft", str(relpath),
                                    metadata, str(path)):
            if r["status"] == "uploading":
                uploaded_paths[str(path)]["size"] = r["current"]
            yield r
        yield {"status": "done"}

    except Exception as exc:
        if devel_debug:
            raise
        # Custom formatting for some exceptions we know to extract
        # user-meaningful message
        message = str(exc)
        uploaded_paths[str(path)]["errors"].append(message)
        yield {"status": "ERROR", "message": message}
    finally:
        process_paths.remove(str(path))
def process_path(path, relpath):
    """
    Parameters
    ----------
    path: Path
      Non Pure (OS specific) Path
    relpath:
      For location on server. Will be cast to PurePosixPath

    Yields
    ------
    dict
      Records for pyout
    """
    # Ensure consistent types
    path = Path(path)
    relpath = PurePosixPath(relpath)
    try:
        try:
            path_stat = path.stat()
            yield {"size": path_stat.st_size}
        except FileNotFoundError:
            yield skip_file("ERROR: File not found")
            return
        except Exception as exc:
            # without limiting [:50] it might cause some pyout indigestion
            yield skip_file("ERROR: %s" % str(exc)[:50])
            return

        #
        # Validate first, so we do not bother the server at all if not kosher
        #
        # TODO: enable back validation of dandiset.yaml
        if path.name != dandiset_metadata_file and validation != "skip":
            yield {"status": "pre-validating"}
            validation_errors = validate_file(path)
            yield {"errors": len(validation_errors)}
            # TODO: split for dandi, pynwb errors
            if validation_errors:
                if validation == "require":
                    yield skip_file("failed validation")
                    return
            else:
                yield {"status": "validated"}
        else:
            # yielding empty causes pyout to get stuck or crash
            # https://github.com/pyout/pyout/issues/91
            # yield {"errors": '',}
            pass

        #
        # Special handling for dandiset.yaml
        # Yarik hates it but that is life for now. TODO
        #
        if path.name == dandiset_metadata_file:
            # TODO This is a temporary measure to avoid breaking web UI
            # dandiset metadata schema assumptions.  All edits should happen
            # online.
            if upload_dandiset_metadata:
                yield {"status": "updating metadata"}
                client.set_dandiset_metadata(
                    dandiset.identifier, metadata=dandiset.metadata
                )
                yield {"status": "updated metadata"}
            else:
                yield skip_file("should be edited online")
            return

        #
        # Compute checksums
        #
        yield {"status": "digesting"}
        try:
            file_etag = get_digest(path, digest="dandi-etag")
        except Exception as exc:
            yield skip_file("failed to compute digest: %s" % str(exc))
            return

        extant = client.get_asset_bypath(ds_identifier, "draft", str(relpath))
        if extant is not None:
            # The endpoint used to search by paths doesn't include asset
            # metadata, so we need to make another API call:
            metadata = client.get_asset(ds_identifier, "draft", extant["asset_id"])
            local_mtime = ensure_datetime(path_stat.st_mtime)
            remote_mtime_str = metadata.get("blobDateModified")
            d = metadata.get("digest", {})
            if "dandi:dandi-etag" in d:
                extant_etag = d["dandi:dandi-etag"]
            else:
                # TODO: Should this error instead?
                extant_etag = None
            if remote_mtime_str is not None:
                remote_mtime = ensure_datetime(remote_mtime_str)
                remote_file_status = (
                    "same"
                    if extant_etag == file_etag and remote_mtime == local_mtime
                    else (
                        "newer"
                        if remote_mtime > local_mtime
                        else ("older" if remote_mtime < local_mtime else "diff")
                    )
                )
            else:
                remote_mtime = None
                remote_file_status = "no mtime"

            exists_msg = f"exists ({remote_file_status})"

            if existing == "error":
                # as promised -- not gentle at all!
                raise FileExistsError(exists_msg)
            if existing == "skip":
                yield skip_file(exists_msg)
                return
            # Logic below only for overwrite and reupload
            if existing == "overwrite":
                if extant_etag == file_etag:
                    yield skip_file(exists_msg)
                    return
            elif existing == "refresh":
                if extant_etag == file_etag:
                    yield skip_file("file exists")
                    return
                elif remote_mtime is not None and remote_mtime >= local_mtime:
                    yield skip_file(exists_msg)
                    return
            elif existing == "force":
                pass
            else:
                raise ValueError(f"invalid value for 'existing': {existing!r}")

            yield {"message": f"{exists_msg} - reuploading"}

        #
        # Extract metadata - delayed since takes time, but is done before
        # actual upload, so we could skip if this fails
        #
        # Extract metadata before actual upload and skip if it fails
        # TODO: allow for non-nwb files to skip this step
        # ad-hoc for dandiset.yaml for now
        yield {"status": "extracting metadata"}
        try:
            asset_metadata = nwb2asset(
                path, digest=file_etag, digest_type="dandi_etag"
            )
        except Exception as exc:
            lgr.exception("Failed to extract metadata from %s", path)
            if allow_any_path:
                yield {"status": "failed to extract metadata"}
                asset_metadata = get_default_metadata(
                    path, digest=file_etag, digest_type="dandi_etag"
                )
            else:
                yield skip_file("failed to extract metadata: %s" % str(exc))
                return
        metadata = asset_metadata.json_dict()
        metadata["path"] = str(relpath)

        #
        # Upload file
        #
        yield {"status": "uploading"}
        validating = False
        for r in client.iter_upload(
            ds_identifier, "draft", metadata, str(path), jobs=jobs_per_file
        ):
            if r["status"] == "uploading":
                uploaded_paths[str(path)]["size"] = r.pop("current")
                yield r
            elif r["status"] == "post-validating":
                # Only yield the first "post-validating" status
                if not validating:
                    yield r
                    validating = True
            else:
                yield r
        yield {"status": "done"}

    except Exception as exc:
        if devel_debug:
            raise
        # Custom formatting for some exceptions we know to extract
        # user-meaningful message
        message = str(exc)
        uploaded_paths[str(path)]["errors"].append(message)
        yield {"status": "ERROR", "message": message}
    finally:
        process_paths.remove(str(path))
def modtime(path):
    st = path.stat()
    return st and st.st_mtime
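# Hedged usage sketch for modtime() above: because it returns the raw st_mtime
# float, it can serve directly as a sort key, e.g. to find the most recently
# modified Python file in the current directory.
from pathlib import Path

newest = max(Path(".").glob("*.py"), key=modtime, default=None)
print(newest)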
def revert(self, path_or_version=None, snapshot=False, *, log_path=None,
           make_backup=True, override=False, reply=print):
    """Revert to a different version of Minecraft and restore a pre-update backup.

    Optional arguments:
    path_or_version -- If given, a pathlib.Path pointing at the backup file to
        be restored, or the Minecraft version to which to restore. By default,
        the newest available pre-update backup is restored.
    snapshot -- If true, single-letter Minecraft versions will be expanded to
        include the current year and week number. Defaults to False.

    Keyword-only arguments:
    log_path -- This is passed to the stop function if the server is stopped
        before the revert.
    make_backup -- Whether to back up the world before reverting. Defaults to
        True.
    override -- If this is True and the server jar for the target version
        already exists, it will be deleted and redownloaded. Defaults to False.
    reply -- This function is called several times with a string argument
        representing revert progress. Defaults to the built-in print function.
    """
    # determine version and backup path
    if path_or_version is None:
        path = sorted((self.backup_path / 'pre-update').iterdir(),
                      key=lambda path: path.stat().st_mtime,
                      reverse=True)[0]  # latest pre-update backup
        version = path.name.split('_')[3]
    elif isinstance(path_or_version, pathlib.Path):
        path = path_or_version
        version = path.name.split('_')[3]
    else:
        version = path_or_version
        if snapshot and len(version) == 1:
            version = datetime.datetime.utcnow().strftime('%yw%V') + version
        path = next(
            path
            for path in sorted((self.backup_path / 'pre-update').iterdir(),
                               key=lambda path: path.stat().st_mtime,
                               reverse=True)
            if path.name.split('_')[3] == version)
    # start iter_update
    update_iterator = self.iter_update(version, log_path=log_path,
                                       make_backup=False, override=override,
                                       reply=reply)
    version_dict = next(update_iterator)
    reply('Downloading ' + version_dict['version_text'])
    # make a backup to backup/<world>/reverted
    if make_backup:
        old_version = self.version()
        backup_path = self.backup_path / 'reverted' / '{}_{:%Y-%m-%d_%Hh%M}_{}_{}'.format(
            self.name, datetime.datetime.utcnow(), old_version, version)
        self.backup(reply=reply, path=backup_path, copy_to_latest=False)
    # stop the server
    was_running = self.status()
    if was_running:
        self.say('Server will be reverting to ' +
                 version_dict["version_text"] + ' and therefore restart')
        time.sleep(5)
        self.stop(reply=reply, log_path=log_path)
        reply('Server stopped. Restoring backup...')
    # revert Minecraft version
    for message in update_iterator:
        reply(message)
    # restore backup
    world_path = self.world_path
    if world_path.exists():
        shutil.rmtree(str(world_path))
    subprocess.call(['tar', '-C', str(self.path), '-xzf', str(path),
                     world_path.name])  # untar the world backup
    # restart server
    if was_running:
        self.start(reply=reply, start_message='Server reverted. Restarting...',
                   log_path=log_path)
    return version_dict['version'], version_dict['is_snapshot'], version_dict['version_text']
def revert(self, path_or_version=None, snapshot=False, *, log_path=None,
           make_backup=True, override=False, reply=print):
    """Revert to a different version of Minecraft and restore a pre-update backup.

    Optional arguments:
    path_or_version -- If given, a pathlib.Path pointing at the backup file to
        be restored, or the Minecraft version to which to restore. By default,
        the newest available pre-update backup is restored.
    snapshot -- If true, single-letter Minecraft versions will be expanded to
        include the current year and week number. Defaults to False.

    Keyword-only arguments:
    log_path -- This is passed to the stop function if the server is stopped
        before the revert.
    make_backup -- Whether to back up the world before reverting. Defaults to
        True.
    override -- If this is True and the server jar for the target version
        already exists, it will be deleted and redownloaded. Defaults to False.
    reply -- This function is called several times with a string argument
        representing revert progress. Defaults to the built-in print function.
    """
    # determine version and backup path
    if path_or_version is None:
        path = sorted((self.backup_path / 'pre-update').iterdir(),
                      key=lambda path: path.stat().st_mtime,
                      reverse=True)[0]  # latest pre-update backup
        version = path.name.split('_')[3]
    elif isinstance(path_or_version, pathlib.Path):
        path = path_or_version
        version = path.name.split('_')[3]
    else:
        version = path_or_version
        if snapshot and len(version) == 1:
            version = datetime.utcnow().strftime('%yw%V') + version
        path = next(path for path in sorted((self.backup_path / 'pre-update').iterdir(),
                                            key=lambda path: path.stat().st_mtime,
                                            reverse=True)
                    if path.name.split('_')[3] == version)
    # start iter_update
    update_iterator = self.iter_update(version, log_path=log_path,
                                       make_backup=False, override=override,
                                       reply=reply)
    version_dict = next(update_iterator)
    reply('Downloading ' + version_dict['version_text'])
    # make a backup to backup/<world>/reverted
    if make_backup:
        old_version = self.version()
        backup_path = self.backup_path / 'reverted' / '{}_{:%Y-%m-%d_%Hh%M}_{}_{}'.format(
            self.name, datetime.utcnow(), old_version, version)
        self.backup(reply=reply, path=backup_path, copy_to_latest=False)
    # stop the server
    was_running = self.status()
    if was_running:
        self.say('Server will be reverting to ' +
                 version_dict['version_text'] + ' and therefore restart')
        time.sleep(5)
        self.stop(reply=reply, log_path=log_path)
        reply('Server stopped. Restoring backup...')
    # revert Minecraft version
    for message in update_iterator:
        reply(message)
    # restore backup
    world_path = self.world_path
    if world_path.exists():
        shutil.rmtree(str(world_path))
    subprocess.call(['tar', '-C', str(self.path), '-xzf', str(path),
                     world_path.name])  # untar the world backup
    # restart server
    if was_running:
        self.start(reply=reply, start_message='Server reverted. Restarting...',
                   log_path=log_path)
    return version_dict['version'], version_dict['is_snapshot'], version_dict['version_text']
def process_path(path, relpath):
    """
    Parameters
    ----------
    path: Path
      Non Pure (OS specific) Path
    relpath:
      For location on Girder. Will be cast to PurePosixPath

    Yields
    ------
    dict
      Records for pyout
    """
    # Ensure consistent types
    path = Path(path)
    relpath = PurePosixPath(relpath)
    try:
        try:
            path_stat = path.stat()
            yield {"size": path_stat.st_size}
        except FileNotFoundError:
            yield skip_file("ERROR: File not found")
            return
        except Exception as exc:
            # without limiting [:50] it might cause some pyout indigestion
            yield skip_file("ERROR: %s" % str(exc)[:50])
            return

        yield {"status": "checking girder"}

        girder_folder = girder_top_folder / relpath.parent

        # we will add some fields which would help us with deciding to
        # reupload or not
        file_metadata_ = {
            "uploaded_size": path_stat.st_size,
            "uploaded_mtime": ensure_strtime(path_stat.st_mtime),
            # "uploaded_date": None,  # to be filled out upon upload completion
        }

        # A girder delete API target to .delete before uploading a file
        # (e.g. if decided to reupload)
        delete_before_upload = None

        def ensure_item():
            """This function might need to be called twice, e.g. if we are to
            reupload the entire item.  ATM new versions of the files would
            create new items since the policy is one File per Item
            """
            try:
                lock.acquire(timeout=60)
                # TODO: we need to make this all thread safe all the way
                # until uploading the file since multiple threads would
                # create multiple
                # ATM it even fails with No such folder: 5e33658d6eb14e0bf49e97d5",
                # so will first upload one file and then the rest... not sure
                # why locking doesn't work
                folder_rec = girder.ensure_folder(client, collection_rec,
                                                  girder_collection, girder_folder)
                # Get (if already exists) or create an item
                item_rec = client.createItem(folder_rec["_id"],
                                             name=relpath.name,
                                             reuseExisting=True)
            finally:
                lock.release()
            return item_rec

        def ensure_folder():
            try:
                lock.acquire(timeout=60)
                folder_rec = girder.ensure_folder(client, collection_rec,
                                                  girder_collection, girder_folder)
            finally:
                lock.release()
            return folder_rec

        #
        # 1. Validate first, so we do not bother girder at all if not kosher
        #
        # TODO: enable back validation of dandiset.yaml
        if path.name != dandiset_metadata_file and validation != "skip":
            yield {"status": "validating"}
            validation_errors = validate_file(path)
            yield {"errors": len(validation_errors)}
            # TODO: split for dandi, pynwb errors
            if validation_errors:
                if validation == "require":
                    yield skip_file("failed validation")
                    return
            else:
                yield {"status": "validated"}
        else:
            # yielding empty causes pyout to get stuck or crash
            # https://github.com/pyout/pyout/issues/91
            # yield {"errors": '',}
            pass

        #
        # Special handling for dandiset.yaml
        # Yarik hates it but that is life for now. TODO
        #
        if path.name == dandiset_metadata_file:
            # TODO This is a temporary measure to avoid breaking web UI
            # dandiset metadata schema assumptions.  All edits should happen
            # online.
            yield skip_file("should be edited online")
            return
            # We need to upload its content as metadata for the entire
            # folder.
            folder_rec = ensure_folder()
            remote_metadata = folder_rec["meta"]
            if remote_metadata.get("dandiset", {}) == dandiset.metadata:
                yield skip_file("exists (same)")
            else:
                remote_metadata["dandiset"] = dandiset.metadata
                yield {"status": "uploading dandiset metadata"}
                client.addMetadataToFolder(folder_rec["_id"], remote_metadata)
                yield {"status": "done"}
            # Interrupt -- no file to upload
            return

        #
        # 2. Ensure having an item
        #
        item_rec = ensure_item()

        #
        # 3. Analyze possibly present on the remote files in the item
        #
        file_recs = list(client.listFile(item_rec["_id"]))

        # get metadata and if we have all indications that it is
        # probably the same -- we just skip
        stat_fields = [
            # Care only about mtime, ignore ctime which could change
            "uploaded_mtime",
            "uploaded_size",
        ]
        assert sorted(file_metadata_) == stat_fields
        item_file_metadata_ = {
            k: item_rec.get("meta", {}).get(k, None) for k in stat_fields
        }
        lgr.debug(
            "Files meta: local file: %s  remote file: %s",
            file_metadata_,
            item_file_metadata_,
        )
        if item_file_metadata_["uploaded_mtime"]:
            local_mtime = ensure_datetime(file_metadata_["uploaded_mtime"])
            remote_mtime = ensure_datetime(
                item_file_metadata_.get("uploaded_mtime"))
            remote_file_status = (
                "same"
                if (file_metadata_ == item_file_metadata_)
                else ("newer" if remote_mtime > local_mtime
                      else ("older" if remote_mtime < local_mtime else "diff")))
        else:
            remote_file_status = "no mtime"

        exists_msg = f"exists ({remote_file_status})"

        if file_recs:  # there is a file already
            if len(file_recs) > 1:
                lgr.debug(
                    f"Item {item_rec} contains multiple files: {file_recs}")
            if existing == "error":
                # as promised -- not gentle at all!
                raise FileExistsError(exists_msg)
            if existing == "skip":
                yield skip_file(exists_msg)
                return
            # Logic below only for overwrite and reupload
            if existing == "overwrite":
                if remote_file_status == "same":
                    yield skip_file(exists_msg)
                    return
            elif existing == "refresh":
                if not remote_file_status == "older":
                    yield skip_file(exists_msg)
                    return
            elif existing == "force":
                pass
            else:
                raise ValueError("existing")
            delete_before_upload = f'/item/{item_rec["_id"]}'
            yield {"message": exists_msg + " - reuploading"}

        #
        # 4. Extract metadata - delayed since takes time, but is done
        #    before actual upload, so we could skip if this fails
        #
        # Extract metadata before actual upload and skip if it fails
        # TODO: allow for non-nwb files to skip this step
        # ad-hoc for dandiset.yaml for now
        if path.name != dandiset_metadata_file:
            yield {"status": "extracting metadata"}
            try:
                metadata = get_metadata(path)
            except Exception as exc:
                if allow_any_path:
                    yield {"status": "failed to extract metadata"}
                    metadata = {}
                else:
                    yield skip_file("failed to extract metadata: %s" % str(exc))
                    if not file_recs:
                        # remove empty item
                        yield {"status": "deleting empty item"}
                        client.delete(f'/item/{item_rec["_id"]}')
                        yield {"status": "deleted empty item"}
                    return

        #
        # ?. Compute checksums and possibly other digests (e.g. for s3, ipfs - TODO)
        #
        yield {"status": "digesting"}
        try:
            # TODO: in theory we could also cache the result, but since it is
            # critical to get correct checksums, safer to just do it all the
            # time.  Should typically be faster than upload itself ;-)
            digester = Digester(metadata_digests)
            file_metadata_.update(digester(path))
        except Exception as exc:
            yield skip_file("failed to compute digests: %s" % str(exc))
            return

        #
        # 5. Upload file
        #
        # TODO: we could potentially keep new item "hidden" until we are
        # done with upload, and only then remove old one and replace with
        # a new one (rename from "hidden" name).
        if delete_before_upload:
            yield {"status": "deleting old"}
            client.delete(delete_before_upload)
            yield {"status": "old deleted"}
            # create a new item
            item_rec = ensure_item()

        yield {"status": "uploading"}
        # Upload file to an item
        # XXX TODO progress reporting back to pyout is actually tricky
        #     if possible to implement via callback since
        #     callback would need to yield somehow from the context here.
        #     yoh doesn't see how that could be done yet.  In the worst
        #     case we would copy uploadFileToItem and _uploadContents
        #     and make them into generators to relay progress instead of
        #     via callback
        # https://stackoverflow.com/questions/9968592/turn-functions-with-a-callback-into-python-generators
        # has some solutions but all IMHO are a bit too complex
        for r in generator_from_callback(
                lambda c: client.uploadFileToItem(
                    item_rec["_id"], str(path), progressCallback=c)):
            upload_perc = 100 * ((r["current"] / r["total"]) if r["total"] else 1.0)
            if girder._DANDI_LOG_GIRDER:
                girder.lgr.debug(
                    "PROGRESS[%s]: done=%d %%done=%s",
                    str(path),
                    r["current"],
                    upload_perc,
                )
            uploaded_paths[str(path)]["size"] = r["current"]
            yield {"upload": upload_perc}

        # Get uploaded file id
        file_id, current = client.isFileCurrent(item_rec["_id"], path.name,
                                                path.absolute())
        if not current:
            yield skip_file("File on server was unexpectedly changed")
            return

        # Compare file size against what download headers report
        # S3 doesn't seem to allow HEAD requests, so we need to instead do
        # a GET with a streaming response and not read the body.
        with client.sendRestRequest("GET", f"file/{file_id}/download",
                                    jsonResp=False, stream=True) as r:
            if int(r.headers["Content-Length"]) != path.stat().st_size:
                yield skip_file("File size on server does not match local file")
                return

        #
        # 6. Upload metadata
        #
        metadata_ = {}
        for k, v in metadata.items():
            if v in ("", None):
                continue  # degenerate, why bother
            # XXX TODO: remove this -- it is only temporary, search should handle
            if isinstance(v, str):
                metadata_[k] = v.lower()
            elif isinstance(v, datetime):
                metadata_[k] = ensure_strtime(v)
        # we will add some fields which would help us with deciding to
        # reupload or not
        # .isoformat() would give is8601 representation but I see in girder
        # already
        #   session_start_time   1971-01-01 12:00:00+00:00
        # decided to go for .isoformat for internal consistency -- let's see
        file_metadata_["uploaded_datetime"] = ensure_strtime(time.time())
        metadata_.update(file_metadata_)
        metadata_["uploaded_size"] = path_stat.st_size
        metadata_["uploaded_mtime"] = ensure_strtime(path_stat.st_mtime)
        metadata_["uploaded_by"] = "dandi %s" % __version__
        # Also store object_id for the file to help identify changes/moves
        try:
            metadata_["uploaded_nwb_object_id"] = get_object_id(str(path))
        except Exception as exc:
            (lgr.debug if allow_any_path else lgr.warning)(
                "Failed to read object_id: %s", exc)

        # #
        # # 7. Also set remote file ctime to match local mtime
        # #    since for type "file", Resource has no "updated" field,
        # #    and this could help us identify changes being done
        # #    to the remote file -- if metadata["uploaded_mtime"] differs
        # yield {"status": "setting remote file timestamp"}
        # try:
        #     client.setResourceTimestamp(
        #         file_id, type="file", created=metadata_["uploaded_mtime"]
        #     )
        # except girder.gcl.HttpError as exc:
        #     if devel_debug:
        #         raise
        #     response = girder.get_HttpError_response(exc)
        #     message = response.get("message", str(exc))
        #     yield {"status": "WARNING", "message": message}

        # 7. Upload metadata
        yield {"status": "uploading metadata"}
        client.addMetadataToItem(item_rec["_id"], metadata_)
        yield {"status": "done"}

    except Exception as exc:
        if devel_debug:
            raise
        # Custom formatting for some exceptions we know to extract
        # user-meaningful message
        message = str(exc)
        if isinstance(exc, girder.gcl.HttpError):
            response = girder.get_HttpError_response(exc)
            if "message" in response:
                message = response["message"]
        uploaded_paths[str(path)]["errors"].append(message)
        yield {"status": "ERROR", "message": message}
    finally:
        process_paths.remove(str(path))