def fetch_assembly_meta_xml(accession):
    """Fetch the assembly metadata XML for an accession from ENA.

    Args:
        accession: Assembly accession appended to the ``<ENA_API>/xml/`` path.

    Returns:
        Whatever ``tofetch.fetch_url`` returns for that URL, passed through
        unchanged (other functions in this module treat ``None`` from
        ``fetch_url`` as a failed fetch).
    """
    return tofetch.fetch_url("%s/xml/%s" % (ENA_API, accession))
def fetch_goat_data(taxon_id):
    """Fetch taxon metadata from GoaT.

    Args:
        taxon_id: Taxon identifier, interpolated into the GoaT ``record``
            query as ``taxon_id-<taxon_id>``.

    Returns:
        The ``record`` entry of the first item in the response's ``records``
        list.

    Exits the process with status 1 (after logging an error) when the fetch
    returns ``None`` or when the response does not contain a usable record.
    """
    LOGGER.info("Fetching taxon metadata")
    url = "%s/record?recordId=taxon_id-%s&result=taxon" % (GOAT_API, taxon_id)
    result = tofetch.fetch_url(url)
    if result is None:
        LOGGER.error("Unable to fetch taxon metadata for '%s' from GoaT", taxon_id)
        sys.exit(1)
    try:
        # ValueError covers an unparseable body; the rest cover a response
        # with no "records" key, an empty records list, or a non-dict shape.
        return ujson.loads(result)["records"][0]["record"]
    except (ValueError, KeyError, IndexError, TypeError):
        LOGGER.error("No taxon record found for '%s' in GoaT response", taxon_id)
        sys.exit(1)
def fetch_accession(bioproject):
    """Fetch a GCA accession for a bioproject.

    Queries the ENA assembly search for the given study accession and reads
    the first data row of the TSV reply.

    Args:
        bioproject: Study accession used in the ``study_accession`` query.

    Returns:
        ``"<accession>.<version>"`` built from the first data row, or ``None``
        when the fetch yields nothing, the reply has no data row, or the row
        does not consist of exactly two tab-separated columns.
    """
    LOGGER.info("Fetching GCA accession for bioproject %s", bioproject)
    url = "%s/search?result=assembly&query=study_accession%%3D%%22%s%%22&fields=accession%%2Cversion&format=tsv" % (
        ENA_API,
        bioproject,
    )
    result = tofetch.fetch_url(url)
    if not result:
        return None
    rows = result.split("\n")
    # Row 0 is the TSV header; a header-only reply without a trailing
    # newline has no second row to read.
    if len(rows) < 2:
        return None
    fields = rows[1].split("\t")
    if len(fields) != 2:
        return None
    accession, version = fields
    return "%s.%s" % (accession, version)
def fetch_bioproject_children(bioproject, *, projects=None, children=None, new_projects=None):
    """Collect genome-assembly child studies beneath a bioproject.

    Queries ENA for studies whose ``parent_study`` is ``bioproject`` and walks
    the TSV reply (skipping the header row):

    * rows whose accession is already in ``projects`` are ignored;
    * rows whose description contains "genome assembly" are recorded, unless
      the description also contains "alternate haplotype";
    * every other row is treated as a nested bioproject and searched
      recursively after a 0.5 second pause.

    Args:
        bioproject: Parent study accession to query.
        projects: Accessions to skip; defaults to an empty list.
        children: List that recorded child accessions are appended to
            (modified in place); defaults to a new list.
        new_projects: Dict mapping each recorded child accession to its
            direct parent (modified in place); defaults to a new dict.

    Returns:
        The ``(children, new_projects)`` pair.
    """
    projects = [] if projects is None else projects
    new_projects = {} if new_projects is None else new_projects
    children = [] if children is None else children

    url = "%s/search?result=study&query=parent_study%%3D%%22%s%%22&format=tsv" % (
        ENA_API,
        bioproject,
    )
    response = tofetch.fetch_url(url)
    if not response:
        return children, new_projects

    for row in response.split("\n")[1:]:
        if not row or "\t" not in row:
            continue
        child_accession, description = row.split("\t")
        if child_accession in projects:
            continue
        if "genome assembly" not in description:
            # Not an assembly study itself: descend into it. Results are
            # accumulated through the shared children/new_projects objects.
            sleep(0.5)
            LOGGER.info(
                "Fetching nested accessions under bioproject %s" % child_accession)
            fetch_bioproject_children(
                child_accession,
                projects=projects,
                children=children,
                new_projects=new_projects,
            )
            continue
        if "alternate haplotype" not in description:
            children.append(child_accession)
            new_projects[child_accession] = bioproject
    return children, new_projects
def fetch_busco_lineages(busco_sets, buscodir):
    """Fetch BUSCO lineages that are not yet present under ``buscodir``.

    A lineage counts as present when ``<buscodir>/lineages/<lineage>`` is an
    existing directory. For the remaining lineages, the directory listing at
    ``BUSCO_URL`` is fetched and parsed to find each archive name, and each
    archive is handed to ``tofetch.fetch_tar`` with ``buscodir`` as target.

    Args:
        busco_sets: Iterable of lineage names; nothing happens when empty or
            ``None``.
        buscodir: Base directory holding the ``lineages`` subdirectory.

    Returns:
        None.

    Exits the process with status 1 (after logging an error) when the
    directory listing cannot be fetched or a requested lineage is not in it.
    """
    if not busco_sets:
        return
    lineages_to_fetch = [
        lineage
        for lineage in busco_sets
        if not os.path.isdir("%s/lineages/%s" % (buscodir, lineage))
    ]
    if not lineages_to_fetch:
        return

    LOGGER.info("Fetching BUSCO lineage directory listing")
    listing = tofetch.fetch_url("%s/" % BUSCO_URL)
    if not listing:
        # Other callers in this module treat None from fetch_url as a failed
        # fetch; without a listing no archive URL can be resolved.
        LOGGER.error("Unable to fetch BUSCO lineage directory listing from '%s'", BUSCO_URL)
        sys.exit(1)

    lineage_urls = {}
    for entry in listing.split("\n"):
        parts = re.split(r"[\"\s]+", entry)
        # Only entries that split into exactly 8 tokens are used; token 2 is
        # taken as the archive file name.
        if len(parts) == 8:
            # The lineage name is the file name up to its first dot.
            busco_set = re.sub(r"\..+$", "", parts[2])
            lineage_urls[busco_set] = "%s/%s" % (BUSCO_URL, parts[2])

    for lineage in lineages_to_fetch:
        lineage_url = lineage_urls.get(lineage)
        if lineage_url is None:
            LOGGER.error("BUSCO lineage '%s' not found in directory listing", lineage)
            sys.exit(1)
        LOGGER.info("Fetching BUSCO lineage %s", lineage)
        tofetch.fetch_tar(lineage_url, buscodir)
def fetch_read_info(accession, per_platform):
    """Fetch read-run info for an accession and group it by platform.

    Requests an ENA ``filereport`` for ``accession`` and parses the reply as
    tab-separated text whose first non-empty line is the header. Each data
    row becomes a dict keyed by header name, with ``base_count`` converted to
    an int (0 when the column is missing, empty or not a number).

    The row's platform is "ILLUMINA_XTEN" when ``experiment_title`` equals
    "HiSeq X Ten paired end sequencing", otherwise the ``instrument_platform``
    value, otherwise "OTHER".

    Args:
        accession: Accession passed to the filereport query.
        per_platform: Mapping of platform name to list; modified in place.
            Rows whose platform is not a key are appended under "OTHER",
            which is therefore expected to exist.

    Returns:
        None. Returns without touching ``per_platform`` when the fetch
        returns ``None``.
    """
    portal = "https://www.ebi.ac.uk/ena/portal/api"
    url = (
        "%s/filereport?accession=%s&result=read_run&fields=run_accession,fastq_bytes,base_count,library_strategy,library_selection,library_layout,instrument_platform,experiment_title,fastq_ftp"
        % (portal, accession))
    data = tofetch.fetch_url(url)
    if data is None:
        return
    header = None
    for line in data.split("\n"):
        if not line:
            continue
        if header is None:
            header = line.split("\t")
            continue
        # zip() tolerates rows shorter than the header: trailing columns are
        # simply absent from the dict instead of raising IndexError.
        values = dict(zip(header, line.split("\t")))
        if values.get("experiment_title") == "HiSeq X Ten paired end sequencing":
            platform = "ILLUMINA_XTEN"
        else:
            platform = values.get("instrument_platform", "OTHER")
        try:
            # An empty base_count column would make int("") raise.
            values["base_count"] = int(values.get("base_count") or 0)
        except ValueError:
            values["base_count"] = 0
        try:
            per_platform[platform].append(values)
        except KeyError:
            per_platform["OTHER"].append(values)