def flow(
    output_dir: Optional[str] = None,
    download_url: str = "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative",
    created: Optional[datetime] = None,
    catalog_path: Path = public_catalog.get_path(),
) -> Flow:
    """Get GWAS Catalog import flow

    Parameters
    ----------
    output_dir : Optional[str]
        Directory in which the downloaded csv and converted parquet files
        are stored. When omitted (or empty), a new temporary directory is
        created with `tempfile.mkdtemp`.
    download_url : str
        URL handed to the download task.
    created : Optional[datetime]
        Creation timestamp passed to `get_entry`. Defaults to the current
        UTC time, resolved each time this function is called (a default of
        `datetime.utcnow()` in the signature would be frozen at import time).
    catalog_path : Path
        Catalog location handed to the `add_entry` task.

    Returns
    -------
    Flow
        Flow named ``gwas-catalog-<artifact version>``
    """
    if created is None:
        # Resolve per call; kept as a naive UTC datetime so `get_entry`
        # receives the same kind of value the old signature default produced.
        created = datetime.utcnow()

    output_dir_path: Path = (
        Path(output_dir) if output_dir else Path(tempfile.mkdtemp())
    )
    entry = get_entry(created)

    with Flow(f"gwas-catalog-{entry.artifact.version}") as flow:
        catalog_path = constant(catalog_path, name="catalog_path")
        # Read the destination URL before `entry` is rebound to a task below
        url = constant(entry.resources["parquet"], name="url")
        entry = constant(
            entry, name=f"entry.key={entry_key_str(entry.key)}", value=False
        )

        # Download and convert to parquet
        local_csv = constant(
            output_dir_path.joinpath("gwas_catalog.csv"), name="local_csv"
        )
        parquet_dir = constant(
            output_dir_path.joinpath("gwas_catalog_parquet"), name="parquet_dir"
        )
        local_csv = download(download_url, local_csv)
        info = convert_to_parquet(local_csv, parquet_dir)

        # Upload only after conversion, register only after upload
        status = upload(entry, parquet_dir, url, upstream_tasks=[info])
        add_entry(entry, info, catalog_path, upstream_tasks=[status])
    return flow
def flow(
    output_dir: str = "/tmp/clinvar",
    raw_url: str = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/archive/submission_summary_2020-06.txt.gz",
) -> Flow:
    """Get ClinVar submission summary import flow

    Parameters
    ----------
    output_dir : str
        Directory in which csv/parquet files are stored (created if missing)
    raw_url : str
        Link to ClinVar submission summary CSV (on ftp.ncbi.nlm.nih.gov).
        Note that the version and creation timestamp associated with
        this artifact are inferred from the link since ClinVar has no
        semantic versioning in its releases, and the FTP site provides
        archived files where the date of creation/release is clear.

    Raises
    ------
    ValueError
        If no archive date can be extracted from the last path component
        of `raw_url`

    Returns
    -------
    Flow
        Prefect Flow
    """
    filename = raw_url.split("/")[-1]
    # The date is the text between the last "_" and the first following ".",
    # e.g. "submission_summary_2020-06.txt.gz" -> "2020-06"
    created = filename.split("_")[-1].split(".")[0]
    if not created:
        raise ValueError(
            f'Unable to determine archive date from url "{raw_url}"'
        )
    version = f"v{created}"

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    entry = get_entry(version, created)
    catalog_path = catalog.default_urlpath()

    with Flow(f"clinvar-{version}") as flow:
        # Add constants that are important to the DAG
        # (all others are not visualized)
        catalog_path = constant(catalog_path, name="catalog_path")
        url = constant(entry.resources["parquet"], name="url")
        # pylint:disable=unsubscriptable-object
        entry = constant(
            entry, name=f"entry.key={entry_key_str(entry.key)}", value=False
        )

        # Download and convert to parquet
        csv_path = constant(str(output_path / filename), name="csv_path")
        parquet_path = constant(
            str(output_path / filename.split(".")[0]) + ".parquet",
            name="parquet_path",
        )
        csv_path = download(raw_url, csv_path)
        info = convert_to_parquet(csv_path, parquet_path)

        # Upload results
        # pylint:disable=unexpected-keyword-arg
        status = upload(entry, parquet_path, url, upstream_tasks=[info])
        add_entry(entry, info, catalog_path, upstream_tasks=[status])
    return flow
def flow(
    output_dir: str = "/tmp/medgen",
    csv_url: str = "https://ftp.ncbi.nlm.nih.gov/pub/medgen/csv",
    n_mgrel_files: Optional[int] = None,
    created: str = "today",
) -> Flow:
    """Get MedGen import flow

    Parameters
    ----------
    output_dir : str
        Directory in which csv/parquet files are stored
    csv_url : str
        Link to MedGen CSV exports (e.g. https://ftp.ncbi.nlm.nih.gov/pub/medgen/csv).
        Note that MedGen appears to have no archival or release process so
        both versions and created timestamps in artifacts will correspond
        to a year-month (e.g. 2020-06).
    n_mgrel_files : Optional[int]
        Number of MGREL files to download. These contain pairwise concept
        relationships and are often broken up into chunks to have < 1M rows
        for spreadsheet users. At the time of writing, 2 chunks are present
        so this can be provided explicitly or if left as None, the number of
        files will be inferred by trying increments until one fails to exist.
        Must be positive when given.
    created: str
        Year-month associated with artifact, forwarded to `get_entry`.
        Defaults to current year-month.

    Raises
    ------
    ValueError
        If `n_mgrel_files` is given but not positive, or if it is inferred
        and no MGREL files are found at `csv_url`

    Returns
    -------
    Flow
        Prefect Flow
    """
    if n_mgrel_files is None:
        n_mgrel_files = get_n_mgrel_files(csv_url)
        if n_mgrel_files <= 0:
            raise ValueError(f"Failed to find any MGREL files at {csv_url}")
    elif n_mgrel_files <= 0:
        # Reject a bad explicit count up front rather than building a flow
        # that would download nothing
        raise ValueError(
            f"n_mgrel_files must be a positive integer (got {n_mgrel_files})"
        )

    output_path = Path(output_dir)
    entry = get_entry(created)
    catalog_path = catalog.default_urlpath()

    with Flow(f"medgen-{entry.artifact.version}") as flow:
        catalog_path = constant(catalog_path, name="catalog_path")
        # pylint:disable=unsubscriptable-object
        url = constant(entry.resources["parquet"], name="url")
        entry = constant(
            entry, name=f"entry.key={entry_key_str(entry.key)}", value=False
        )

        # Download and convert to parquet
        csv_dir = constant(str(output_path / 'mgrel.csv'), name="csv_dir")
        parquet_dir = constant(
            str(output_path / 'mgrel.parquet'), name="parquet_dir"
        )
        csv_dir = download(csv_url, csv_dir, n_mgrel_files)
        info = convert_to_parquet(csv_dir, parquet_dir)

        # Upload results
        # pylint:disable=unexpected-keyword-arg
        status = upload(entry, parquet_dir, url, upstream_tasks=[info])
        add_entry(entry, info, catalog_path, upstream_tasks=[status])
    return flow
def flow(
    source: str,
    relpath: str,
    convert: bool = True,
    output_dir: str = "/tmp/otpev",
    version: str = "20.06",
    created: Optional[str] = None,
    n_partitions: Optional[int] = None,
) -> Flow:
    """Get OTP evidence import flow

    Parameters
    ----------
    source : str
        OTP evidence source (e.g. eva, l2g, uniprot)
    relpath : str
        Path relative from `gs://open-targets-data-releases/$VERSION/input/evidence-files`
        to data file or directory (e.g. "progeny-2018-07-23.json.gz" or
        "evidences_protein_fix/chembl_dataset"). Only single files (paths
        ending in "json.gz") are currently supported.
    convert : bool
        Currently unused by this function; kept so existing callers that
        pass it continue to work.
    output_dir : str
        Directory in which temporary json/parquet files are stored
        (a `source` subdirectory is created beneath it)
    version : str
        OTP release version
    created: str, optional
        Date at which OTP version was created. This should NOT be a time
        at which data was collected -- it is intended to reflect when OT
        created the release and should never change for the same `version`.
        For this reason, `created` will default to known release dates
        (see `OT_VERSION_RELEASE_DATES`).
    n_partitions: int, optional
        Number of partitions used to write parquet result. Set as None to
        use default partitioning.

    Raises
    ------
    KeyError
        If `created` is not provided and no known release date was
        previously recorded for the specified `version`
    NotImplementedError
        If `relpath` refers to a directory rather than a single
        "json.gz" file

    Returns
    -------
    Flow
        Prefect Flow
    """
    version = str(version)
    if created is None:
        if version not in OT_VERSION_RELEASE_DATES:
            raise KeyError(
                f'No release date known for version "{version}" '
                "(pass `created` explicitly or add date to `OT_VERSION_RELEASE_DATES`)"
            )
        created = OT_VERSION_RELEASE_DATES[version]

    # Fail fast: reject unsupported directory inputs before creating any
    # directories, catalog entries or Flow objects.
    # NOTE: a future directory integration would register the entry with
    # format "json.gz", type "directory" and properties
    # dict(compression="gzip") instead of the file settings used below.
    if not relpath.endswith("json.gz"):
        raise NotImplementedError(
            "Integration of data directories (rather than single files) not yet implemented"
        )

    source_dir = Path(output_dir) / source
    source_dir.mkdir(parents=True, exist_ok=True)

    src_url = OT_URL_FMT.format(version=version) + f'/{relpath.lstrip("/")}'
    filename = src_url.split("/")[-1]
    entry = get_entry(
        source,
        version,
        created,
        format="parquet",
        type="file",
        properties=None,
    )
    catalog_path = catalog.default_urlpath()

    with Flow(f"otpev-{source}-v{version}") as flow:
        # Add constants that are important to the DAG
        # (all others are not visualized)
        catalog_path = constant(catalog_path, name="catalog_path")
        # Take the first registered resource URL before `entry` is rebound
        dst_url = next(iter(entry.resources.values()))
        entry = constant(
            entry, name=f"entry.key={entry_key_str(entry.key)}", value=False
        )
        n_partitions = constant(n_partitions, name="n_partitions")
        src_url = constant(src_url, name="src_url")
        dst_url = constant(dst_url, name="dst_url")

        # Download and convert to parquet
        json_path = constant(str(source_dir / filename), name="json_path")
        parquet_path = constant(
            str(source_dir / filename.split(".")[0]) + ".parquet",
            name="parquet_path",
        )
        json_path = download(src_url, json_path)
        info = convert_to_parquet(
            json_path, parquet_path, n_partitions=n_partitions
        )

        # Upload results
        # pylint:disable=unexpected-keyword-arg
        status = upload(entry, parquet_path, dst_url, upstream_tasks=[info])
        add_entry(entry, info, catalog_path, upstream_tasks=[status])
    return flow