def main():
    """Cross-check asset listings between the old Girder server and the DANDI API.

    Walks the dandiset folders found on Girder (only 000026 is actually
    inspected), harmonizes each listing down to (path, size) pairs, and
    compares three sources: the Girder client listing, an independent ad-hoc
    Girder listing, and the DANDI API draft listing.  Any mismatch drops into
    pdb for interactive inspection.
    """
    girder = GirderCli("http://3.19.164.171")
    api = DandiAPIClient("https://api.dandiarchive.org/api")
    with api.session():
        girder.dandi_authenticate()
        # gather all dandisets known to girder: hardcoded _id for "drafts" collection
        folders = list(girder.listFolder("5e59bb0af19e820ab6ea6c62", "collection"))
        for entry in folders:
            dandiset = entry["name"]
            girder_id = entry["_id"]
            # ad-hoc script: only one dandiset is being verified right now
            if dandiset != "000026":
                continue
            print(f"DANDI:{dandiset}", end="\t")
            g_meta, g_assets_iter = girder.get_dandiset_and_assets(girder_id, "folder")
            girder_assets = list(g_assets_iter)
            # harmonize and get only what we care about ATM - path and size,
            # or otherwise we would need to query each asset for metadata
            girder_pairs = {(a["path"].lstrip("/"), a["size"]) for a in girder_assets}
            # Yarik trusts nobody. Two identical bugs are less likely!
            adhoc_pairs = set(adhoc_list_girder(girder_id, girder))
            if girder_pairs != adhoc_pairs:
                print("ad-hoc and dandi listing of girder differs!")
                import pdb
                pdb.set_trace()
            a_meta, a_assets_iter = api.get_dandiset_and_assets(dandiset, "draft")
            api_assets = list(a_assets_iter)
            api_pairs = {(a["path"].lstrip("/"), a["size"]) for a in api_assets}
            if api_pairs != girder_pairs:
                print("differs")
                import pdb
                pdb.set_trace()
            else:
                print(f"{len(api_assets)} assets the same")
def main(api_url, token, dandiset_path, delete_extant, only_metadata):
    """Register local dandisets with a DANDI API server.

    For every path in *dandiset_path*: optionally delete a pre-existing
    server-side dandiset of the same identifier (``delete_extant``), then
    either overwrite only the metadata of the existing draft
    (``only_metadata``) or create the dandiset from scratch.
    """
    client = DandiAPIClient(api_url=api_url, token=token)
    with client.session():
        for path in dandiset_path:
            ds = APIDandiset(path)
            did = ds.identifier
            if delete_extant:
                exists = True
                try:
                    client.get_dandiset(did, "draft")
                except requests.HTTPError as e:
                    # A 404 just means there is nothing to delete;
                    # any other HTTP failure is fatal.
                    if e.response.status_code != 404:
                        raise
                    exists = False
                if exists:
                    print("Dandiset", did, "already exists; deleting")
                    client.delete(f"/dandisets/{did}/")
            if only_metadata:
                print("Setting metadata for Dandiset", did)
                client.set_dandiset_metadata(did, metadata=ds.metadata)
            else:
                print("Creating Dandiset", did)
                client.create_dandiset(
                    name=ds.metadata.get("name", ""), metadata=ds.metadata
                )
class URLUpdater:
    """Refresh git-annex URLs for DataLad-mirrored Dandisets.

    For each dandiset, registers the versioned S3 bucket URL and the DANDI
    API download URL on every annexed file, strips obsolete
    girder-assetstore URLs, commits the result, and pushes to the "github"
    sibling.
    """

    def __init__(self, datasets_path: Path):
        # Root directory holding one DataLad dataset per dandiset identifier.
        self.datasets_path = datasets_path
        self.dandi_client = DandiAPIClient("https://api.dandiarchive.org/api")
        # Unsigned (anonymous) S3 access suffices: only public objects are read.
        self.s3client = boto3.client("s3", config=Config(signature_version=UNSIGNED))

    def run(self, dandisets=()):
        """Update URLs for the given dandiset ids (all known ids if empty)."""
        with self.dandi_client.session():
            for did in dandisets or self.get_dandiset_ids():
                dsdir = self.datasets_path / did
                log.info("Updating URLs for Dandiset %s", did)
                ds = Dataset(str(dsdir))
                self.update_dandiset_urls(did, ds)
                log.info("Pushing to sibling")
                ds.push(to="github")

    def update_dandiset_urls(self, dandiset_id, ds):
        """Add current download URLs to each annexed asset of *ds*.

        Raises RuntimeError if the repository has uncommitted changes.
        """
        if ds.repo.dirty:
            raise RuntimeError(
                "Dirty repository; clean or save before running")
        # Batch the annex calls and save once at the end instead of per file.
        # NOTE(review): always_commit is never restored afterwards — presumably
        # the repo object is short-lived; confirm before reusing `ds`.
        ds.repo.always_commit = False
        for a in self.dandi_client.get_dandiset_assets(dandiset_id, "draft",
                                                       include_metadata=False):
            path = a["path"]
            log.info("Processing asset %s", path)
            if ds.repo.is_under_annex(path, batch=True):
                file_urls = set(ds.repo.get_urls(path, batch=True))
                bucket_url = self.get_file_bucket_url(dandiset_id, "draft",
                                                      a["asset_id"])
                download_url = (
                    f"https://api.dandiarchive.org/api/dandisets/{dandiset_id}"
                    f"/versions/draft/assets/{a['asset_id']}/download/")
                for url in [bucket_url, download_url]:
                    if url not in file_urls:
                        log.info("Adding URL %s to asset", url)
                        ds.repo.add_url_to_file(path, url, batch=True)
                # Old Girder-era URLs are dead; drop them.
                for url in file_urls:
                    if "dandiarchive.s3.amazonaws.com/girder-assetstore/" in url:
                        log.info("Removing URL %s from asset", url)
                        ds.repo.rm_url(path, url)
            else:
                log.info("File is not managed by git annex; not updating URLs")
        # Fixed typo in log message ("Commiting" -> "Committing").
        log.info("Committing changes")
        ds.save(message="Ran use-new-urls.py")

    def get_dandiset_ids(self):
        """Yield every dandiset identifier known to the API, following pagination."""
        r = self.dandi_client.get("/dandisets/")
        while True:
            for d in r["results"]:
                yield d["identifier"]
            next_url = r.get("next")  # fetch the link once, not twice
            if not next_url:
                break
            r = self.dandi_client.get(next_url)

    def get_file_bucket_url(self, dandiset_id, version_id, asset_id):
        """Return the version-pinned S3 URL behind an asset's download redirect."""
        # HEAD the download endpoint to get the S3 redirect without the payload.
        r = self.dandi_client.send_request(
            "HEAD",
            f"/dandisets/{dandiset_id}/versions/{version_id}/assets/{asset_id}"
            "/download/",
            json_resp=False,
        )
        urlbits = urlparse(r.headers["Location"])
        s3meta = self.s3client.get_object(Bucket="dandiarchive",
                                          Key=urlbits.path.lstrip("/"))
        # Pin the exact object version so the URL survives later re-uploads.
        return urlunparse(
            urlbits._replace(query=f"versionId={s3meta['VersionId']}"))