def download(self, datadir):
    """Fetch the BOLD barcode records for Argentina as JSON.

    Creates ``<datadir>/bold/`` if needed, switches the working directory
    there, and downloads the combined records to ``barcodes.json``.
    """
    url = ("http://www.boldsystems.org/index.php/API_Public/combined"
           "?geo=Argentina&format=json")
    self.bolddir = datadir + "bold/"
    if not os.path.exists(self.bolddir):
        os.makedirs(self.bolddir)
    os.chdir(self.bolddir)
    download_file(url, "barcodes.json")
def update_pdb(self, pdb):
    """Ensure the uncompressed PDB file for *pdb* is available locally.

    Unpacks an already-downloaded gzipped copy when present; otherwise
    fetches the gzipped entry from the PDB mirror and unpacks it.
    """
    pdb = pdb.lower()
    # The PDB archive shards entries by the middle two id characters.
    shard = pdb[1:3]
    mkdir(self.pdbs_dir + shard)
    if os.path.exists(self.pdb_path_gzipped(pdb)):
        execute("gunzip " + self.pdb_path_gzipped(pdb))
    elif not os.path.exists(self.pdb_path(pdb)):
        remote = self.url_pdb_files + shard + "/pdb" + pdb + self.pdb_download_extention
        local = self.pdbs_dir + shard + "/pdb" + pdb + self.pdb_download_extention
        download_file(remote, local)
        execute("gunzip " + self.pdb_path_gzipped(pdb))
def download_deg(dst="/data/databases/deg/"):
    """Download the DEG 15.2 essential-gene sets and index them for BLAST.

    Fetches the prokaryote ("p"), eukaryote ("e") and archaea ("a")
    archives into *dst*, unpacks each one, removes the zip, and builds a
    protein BLAST database from the extracted ``degaa-*.dat`` file.
    """
    for kingdom in ("p", "e", "a"):
        filename = "deg-" + kingdom + "-15.2"
        zip_path = dst + filename + ".zip"
        download_file("http://tubic.tju.edu.cn/deg/download/" + filename + ".zip",
                      zip_path, ovewrite=True)
        execute("unzip -o " + zip_path + " -d " + dst)
        os.remove(zip_path)
        execute("makeblastdb -dbtype prot -in " + dst + "degaa-" + kingdom + ".dat")
def download_proteome_from_tax(tax_id, dst_dir, format="fasta"):
    """Build a clustered BLAST protein database for a UniProt taxon.

    Downloads every protein for *tax_id* from UniProt (gzipped), unpacks
    it, clusters the sequences at 90% identity with cd-hit, and indexes
    the clustered FASTA with makeblastdb.
    """
    query_url = ('http://www.uniprot.org/uniprot/?sort=&desc=&compress=yes'
                 '&query=taxonomy:{tax}&fil=&format={format}&force=yes')
    raw_gz = dst_dir + "/" + tax_id + "_all.fasta.gz"
    raw_fasta = dst_dir + "/" + tax_id + "_all.fasta"
    clustered_fasta = dst_dir + "/" + tax_id + ".fasta"
    download_file(query_url.format(tax=tax_id, format=format), raw_gz, ovewrite=True)
    execute("gunzip " + raw_gz)
    execute("cd-hit -M 0 -c 0.9 -T 0 -i %s -o %s" % (raw_fasta, clustered_fasta))
    execute("makeblastdb -dbtype prot -in " + clustered_fasta)
def update_pdb(self, pdb):
    """Ensure an uncompressed, non-truncated PDB file exists locally.

    Files smaller than 100 bytes are treated as broken and rebuilt,
    either from an already-downloaded gzip or by downloading a fresh
    copy. Returns the path to the uncompressed PDB file.
    """
    pdb = pdb.lower()
    # The PDB archive shards entries by the middle two id characters.
    mkdir(self.pdbs_dir + pdb[1:3])
    # Rebuild when the uncompressed file is missing or suspiciously small.
    if not os.path.exists(self.pdb_path(pdb)) or (os.path.getsize(self.pdb_path(pdb)) < 100):
        if os.path.exists(self.pdb_path_gzipped(pdb)) and (os.path.getsize(self.pdb_path_gzipped(pdb)) > 100):
            # A plausible gzipped copy is already on disk: just unpack it.
            execute("gunzip " + self.pdb_path_gzipped(pdb))
        if os.path.exists(self.pdb_path_gzipped(pdb)) and not os.path.exists(self.pdb_path(pdb)):
            # gunzip did not produce the target: drop the corrupt archive.
            os.remove(self.pdb_path_gzipped(pdb))
        elif not os.path.exists(self.pdb_path(pdb)):
            # Nothing usable locally: download the gzipped entry and unpack.
            download_file(self.url_pdb_files + pdb[1:3] + "/pdb" + pdb + self.pdb_download_extention,
                          self.pdbs_dir + pdb[1:3] + "/pdb" + pdb + self.pdb_download_extention,
                          ovewrite=True)
            execute("gunzip " + self.pdb_path_gzipped(pdb))
    return self.pdb_path(pdb)
def download_assembly(assembly_accession, dst_dir, dtype="genomic.gbff.gz", force=False):
    """Download an NCBI assembly data file into *dst_dir* and return its path.

    Resolves the sharded NCBI FTP directory for *assembly_accession*,
    scrapes the directory listing for matching assembly folders, picks the
    highest version, and downloads ``<acc>_<dtype>``. Raises
    FileNotFoundError when no matching assembly directory is found.
    """
    # assembly_name, last_assembly_accession = NCBI.assembly_name_from_acc(assembly_accession)
    # Strip a trailing ".N" version suffix; only single-character versions
    # are handled this way — TODO confirm for versions >= 10.
    assembly_accession_no_ver = assembly_accession if assembly_accession[
        -2] != "." else assembly_accession[:-2]
    # NCBI shards assembly folders, e.g.:
    # https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/158/435/
    url = "/".join([
        ftp_url,
        assembly_accession_no_ver[0:3],
        assembly_accession_no_ver[4:7],
        assembly_accession_no_ver[7:10],
        assembly_accession_no_ver[10:13]
        # , last_assembly_accession + "_" + assembly_name.replace(" ", "_").replace("#", "_")
        # , last_assembly_accession + "_" + assembly_name.replace(" ", "_").replace("#",
        # "_") + "_" + dtype
    ]) + "/"
    r = requests.get(url)
    download_url = ""
    acc = ""
    if r.status_code == 200:
        # Extract directory names like "GCF_000158435.1_ASM15843v1" from the
        # HTML listing by splitting on tag boundaries.
        accessions = [
            x.split("</")[0].replace("/", "")
            for x in r.text.split(">")
            if x.startswith(assembly_accession_no_ver)
        ]
        # GCF_000158435.(1)_ASM15843v1/
        # NOTE(review): the sort key uses only the LAST character of the
        # version field, so it misorders versions >= 10 — confirm.
        accessions = sorted(accessions, key=lambda x: int(x.split("_")[1][-1]))
        if accessions:
            acc = accessions[-1]
            download_url = f'{url}{acc}/{acc}_{dtype}'
    if not download_url:
        err = f"{assembly_accession} not found at {url}"
        _log.error(err)
        raise FileNotFoundError(err)
    assert acc
    # Output name keeps only the accession part (e.g. GCF_000158435.1).
    out_file = f'{dst_dir}/{"_".join(acc.split("_")[:2]) }.{dtype}'
    if force or not os.path.exists(out_file):
        download_file(download_url, out_file, ovewrite=force)
    else:
        _log.debug(f'{out_file} exists')
    # execute("gunzip -c " + out_file + " > " + out_file[:-3])
    return out_file
def cross_reference_dbs(self):
    """Populate the DBx table from UniProt's cross-referenced-databases RDF.

    Downloads the database-all RDF, parses each ``rdf:Description`` entry,
    and upserts one DBx row per external database, plus a hard-coded
    UniProt accession entry.
    """
    download_file(Command.DEFAULT_CROSS_REF_DBS, "data/tmp/database-all.rdf")
    with open("data/tmp/database-all.rdf") as h:
        data = xmltodict.parse(h.read(), "utf-8")
    for db in data["rdf:RDF"]["rdf:Description"]:
        DBx.objects.get_or_create(
            url=db['@rdf:about'],
            # Prefer the human-readable abbreviation, fall back to the identifier.
            name=db['abbreviation'] if "abbreviation" in db else db['dcterms:identifier'],
            category=db.get('category', ""),
            description=db.get('rdfs:label', ""),
            url_template=db.get('urlTemplate', db['dcterms:identifier']))
    # Ensure a generic UniProt accession cross-reference always exists.
    DBx.objects.get_or_create(
        url="www.uniprot.org",
        name="UnipAcc",
        category='Protein annotation databases',
        description="UNIPROT",
        url_template="https://www.uniprot.org/uniprot/%s",
    )
def handle(self, *args, **options):
    """Management-command entry point: load ontology terms and GO data.

    When --go is set, downloads the GO OBO files if missing and creates
    the GO terms and relationships; the --tax branch is not implemented.
    """
    Ontology.load_ann_terms()
    Ontology.load_go_base()
    if options["go"]:
        # Fetch each OBO file only when it is not already on disk.
        for url_key, path_key in (("go_url", "obo_path"),
                                  ("go_basic_url", "relationships_obo_path")):
            if not os.path.exists(options[path_key]):
                download_file(options[url_key], options[path_key])
        self.ontology = Ontology.objects.get(name=Ontology.GO)
        self.is_a = Ontology.relmap["is_a"]
        self.create_terms(options["obo_path"], "go")
        self.create_relationships(options["relationships_obo_path"], "go")
    if options["tax"]:
        pass
    self.stderr.write("Finished!")
def download_fasta(url_unip2reactions=DEFAULT_UNIP2REACTIONS,
                   outdir="/data/databases/reactome/",
                   ovewrite=False):
    """Build a gzipped FASTA of UniProt sequences keyed by Reactome reaction.

    Downloads the UniProt-to-Reactome mapping into *outdir* (unless it is
    already present), then fetches each mapped UniProt record and writes it
    with the Reactome id as the record id.
    """
    unip_utils = Uniprot()
    assert os.path.exists(outdir), f'{outdir} does not exists'
    reactome_map_file = outdir + "/UniProt2ReactomeReactions.txt"
    if ovewrite or not os.path.exists(reactome_map_file):
        download_file(url_unip2reactions, reactome_map_file, ovewrite=ovewrite)
    else:
        sys.stderr.write(f'{reactome_map_file} already exists')
    # NOTE(review): the output goes to ./seqs.fasta.gz (current working
    # directory), not to outdir — looks unintentional; confirm with callers
    # before changing.
    with open(reactome_map_file) as hr, gzip.open("seqs.fasta.gz", "wt") as hw:
        for line in tqdm(hr):
            if not line.startswith("#"):
                # Mapping file columns: uniprot_acc, reactome_id, url, description, ...
                unip, reactome, url_path, description = line.split(
                    "\t")[:4]
                record = unip_utils.download_and_load_seqrecord(
                    unip, format=".fasta")
                record.name = ""
                record.description = description + "||" + unip
                record.id = reactome
                bpio.write(record, hw, "fasta")
def _remove_quietly(path):
    """Best-effort delete of a partial/corrupt download.

    The original code used os.rmdir, which cannot remove a regular file,
    so cleanup silently never happened.
    """
    try:
        os.remove(path)
    except OSError:
        pass


def _download_ena_fastq(pbar, url, expected_md5, basefilename, mate):
    """Download one FASTQ mate file and verify its md5; delete on mismatch.

    :param pbar: tqdm bar, used only to display the current URL.
    :param url: FTP/HTTP location of the gzipped FASTQ.
    :param expected_md5: md5 hex digest reported by ENA for this file.
    :param basefilename: destination path prefix (no _1/_2 suffix).
    :param mate: 1 or 2, the read-pair member.
    """
    fq_gz = "%s_%d.fastq.gz" % (basefilename, mate)
    fq = "%s_%d.fastq" % (basefilename, mate)
    # Skip files that are already present (compressed or uncompressed).
    if os.path.exists(fq_gz) or os.path.exists(fq):
        return
    pbar.set_description(url)
    try:
        download_file(url, fq_gz)
    except Exception:
        _log.warning("error downloading: " + fq_gz)
        _remove_quietly(fq_gz)
        return  # nothing to checksum if the download failed
    # md5sum prints "<digest>  <file>"; check_output returns bytes, so it
    # must be decoded before comparing with the str digest from ENA
    # (the original compared bytes to str, which is always unequal in Py3).
    actual_md5 = sp.check_output("md5sum %s" % fq_gz,
                                 shell=True).split()[0].strip().decode("utf-8")
    if expected_md5 != actual_md5:
        print("%s error md5 sum" % basefilename)
        _remove_quietly(fq_gz)


def download_ena_project(project_id, dst_dir):
    """Download all paired FASTQ files for an ENA project into *dst_dir*.

    Queries the ENA filereport for the project's runs and, for each run
    with exactly two FASTQ URLs, downloads both mates (skipping existing
    files) and verifies their md5 checksums. Raises Exception on a
    non-200 response from ENA.
    """
    dst_dir = os.path.abspath(dst_dir)
    url_template = ("https://www.ebi.ac.uk/ena/data/warehouse/filereport?accession="
                    + project_id +
                    "&result=read_run&fields=sample_accession,experiment_accession,run_accession,fastq_ftp,fastq_md5&download=txt")
    r = requests.get(url_template)
    if r.status_code != 200:
        raise Exception("request error %i" % r.status_code)
    lines = r.text.split("\n")
    with tqdm(lines) as pbar:
        for l in pbar:
            # Skip the header and blank/short lines.
            if len(l.strip().split("\t")) > 3:
                (sample_accession, experiment_accession, run_accession,
                 fastq_ftp, fastq_md5) = l.split("\t")
                urls = fastq_ftp.split(";")
                md5s = fastq_md5.split(";")
                # Only paired-end runs (exactly two FASTQ URLs) are handled.
                if len(urls) == 2:
                    basefilename = dst_dir + "/" + "_".join(
                        [sample_accession, experiment_accession, run_accession])
                    for mate in (1, 2):
                        _download_ena_fastq(pbar, urls[mate - 1],
                                            md5s[mate - 1], basefilename, mate)
init_log("/tmp/createdb.log")


def old_or_inexistent(filepath, period=30):
    """True if *filepath* is missing or was last accessed more than *period* days ago."""
    return not os.path.exists(filepath) or (((time.time() - os.path.getatime(filepath)) / 60 / 60 / 24) > period)


# NOTE(review): hard-coded institutional proxy — verify before running elsewhere.
os.environ["http_proxy"] = "http://proxy.fcen.uba.ar:8080"
os.environ["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"

# COG orthologous-group definitions (whog) and protein sequences (myva).
if not os.path.exists("/data/cog/whog"):
    mkdir("/data/cog/")
    download_file("ftp://ftp.ncbi.nih.gov/pub/COG/COG/whog", "/data/cog/whog")
if not os.path.exists("/data/cog/myva"):
    mkdir("/data/cog/")
    download_file("ftp://ftp.ncbi.nih.gov/pub/COG/COG/myva", "/data/cog/myva")
    execute("formatdb -i /data/cog/myva -o T")

# PRIAM enzyme profiles (MAR15 release) for EC-number prediction.
if not os.path.exists("/data/ec/PRIAM_MAR15/priam"):
    mkdir("/data/ec/")
    download_file("http://priam.prabi.fr/REL_MAR15/Distribution.zip", "/data/ec/PRIAM_MAR15.zip")
    # unzip may exit with status 1 on warnings; both 0 and 1 are accepted.
    execute_from("unzip /data/ec/PRIAM_MAR15.zip; exit 0;", "/data/ec/", retcodes=[0, 1])
    execute_from("ls /data/ec/PRIAM_MAR15/PROFILES/*.chk > priam", "/data/ec/PRIAM_MAR15/")
    execute_from("formatrpsdb -i /data/ec/PRIAM_MAR15/priam -o T", "/data/ec/PRIAM_MAR15/")
def download_human_prots(dst="/data/databases/human/"):
    """Download the UniRef100 human (taxon 9606) proteome FASTA.

    Returns the path of the downloaded file inside *dst*.
    """
    unip_url = ("https://www.uniprot.org/uniref/?query=uniprot:(taxonomy:%22Homo"
                "%20sapiens%20(Human)%20[9606]%22)%20identity:1.0&format=fasta"
                "&force=true&compress=yes")
    target = dst + Offtarget.DEFAULT_HUMAN_FILENAME
    download_file(unip_url, target, ovewrite=True, timeout=120)
    return target
def download_deg(dst="/data/databases/deg/"):
    """Fetch the DEG protein FASTA archives for all three kingdoms and unpack them."""
    for kingdom in ("p", "e", "a"):
        gz_target = f"{dst}/{Offtarget.DEG_FAA_NAMES[kingdom]}.gz"
        download_file(Offtarget.DEG_PROT_URL[kingdom], gz_target, ovewrite=True)
        execute(f"gunzip -f {gz_target}")
def download_pdb_entries(self):
    """Refresh the local copy of the PDB entries index."""
    source = self.url_pdb_entries
    target = self.entries_path
    download_file(source, target, ovewrite=True)
def download_pdb_seq_ses(self):
    """Refresh the local copy of the PDB SEQRES sequence file."""
    source = self.url_pdb_seq_res
    target = self.pdb_seq_res_path
    download_file(source, target, ovewrite=True)
def download_fasta(uniprot_id, outdir="./", overwrite=False):
    """Fetch the FASTA record for *uniprot_id* from UniProt into *outdir*."""
    source_url = Uniprot.DEFAULT_UNIPROT_URL + uniprot_id + ".fasta"
    target = f'{outdir}/{uniprot_id}.fasta'
    download_file(source_url, target, overwrite)
def process_file(params):
    """Model a protein sequence from its BLAST-vs-PDB alignment file.

    *params* keys: ``aln_file`` (blast-xml path), ``templates2use`` (max
    number of template HSPs), ``tmp_dir``, ``output_dir``. Ensures the
    template PDB structures are present locally, extracts the needed
    chains, and models the sequence against each HSP. Returns a list of
    modelling results, or a single-element list with an "errors" entry
    when the alignment file is empty or has no alignments.
    """
    aln_file = params["aln_file"]
    templates2use = params["templates2use"]
    tmp_dir = params["tmp_dir"]
    output_dir = params["output_dir"]
    aln_file = aln_file.strip()
    try:
        # Files under 100 bytes cannot contain a real blast-xml report.
        if os.path.getsize(aln_file) < 100:
            return [{"errors": f'\n{aln_file} empty file'}]
        hsps = []
        try:
            # Flatten query results -> hits -> HSPs.
            hsps = [
                hsp for query_result in bpsio.parse(
                    aln_file.strip(), "blast-xml")
                for hit in query_result
                for hsp in hit
            ]
        except ValueError:
            sys.stderr.write(f"error reading alignments in {aln_file}")
        # Keep only the first N HSPs as templates (file order — presumably
        # already ranked by blast; confirm).
        hsps = hsps[:templates2use]
        if hsps:
            seq_id = hsps[0].query.id
            # pdb_chains = [x.split("_") for x in set([hsp.hit.id[3:7] + "_" + hsp.hit.id[-1] for hsp in hsps])]
            # Hit ids appear to encode the PDB id at chars 3-7 and the chain
            # as the last char — TODO confirm against the blast db format.
            pdb_chains = [[hsp.hit.id[3:7], hsp.hit.id[-1]] for hsp in hsps]
            updated = True
            for pdb, _ in pdb_chains:
                if not os.path.exists(pdb_utils.pdb_path(pdb)):
                    # Fetch and unpack the missing template structure.
                    mkdir(pdb_utils.pdb_path_base(pdb))
                    download_file(
                        f"https://files.rcsb.org/download/{pdb.upper()}.pdb.gz",
                        pdb_utils.pdb_path_gzipped(pdb), ovewrite=True)
                    pdb_utils.update_pdb(pdb)
                    updated = os.path.exists(pdb_utils.pdb_path(pdb))
                if not updated:
                    # Abort early: modelling cannot proceed without the template.
                    sys.stderr.write(f'{pdb} could not be updated...\n')
                    return
            pdb_utils.extract_chains(pdb_chains, tmp_dir)
            models_results = []
            for hsp in hsps:
                try:
                    models_result = Modelome.model_hsps(
                        seq_id, os.path.abspath(output_dir), [hsp],
                        refinement=REFINEMENT,
                        models_to_generate=MODELS_TO_GENERATE,
                        assessments=ASSESMENTS, entries={},
                        tmp_dir=tmp_dir, max_models=1)
                except ModellerOverflowError as e:
                    # Skip templates that blow up the modeller run.
                    sys.stderr.write(
                        f"error processing {seq_id}: {str(e)}")
                    continue
                models_results.append(models_result)
            return models_results
        else:
            return [{"errors": f'\nno aligments for {aln_file}\n'}]
    except:
        sys.stderr.write(f'error processing {aln_file}')
        raise
from SNDG.Structure.PDBs import PDBs

init_log("/tmp/createdb.log")


def old_or_inexistent(filepath, period=30):
    """True if *filepath* is missing or was last accessed more than *period* days ago."""
    return not os.path.exists(filepath) or ((
        (time.time() - os.path.getatime(filepath)) / 60 / 60 / 24) > period)


#os.environ["http_proxy"] = "http://proxy.fcen.uba.ar:8080"
#os.environ["ftp_proxy"] = "http://proxy.fcen.uba.ar:8080"

# Refresh the PDB entries index, SEQRES sequences and the local PDB mirror,
# then prepare a BLAST database of sequences usable for modelling.
mkdir("/data/pdb/")
download_file("ftp://ftp.wwpdb.org/pub/pdb/derived_data/index/entries.idx", "/data/pdb/entries.idx", ovewrite=True)
pdbs = PDBs("/data/pdb/")
pdbs.download_pdb_seq_ses()
pdbs.update_pdb_dir()
mkdir("/data/pdb/processed/")
pdbs.pdbs_seq_for_modelling()
execute("makeblastdb -dbtype prot -in /data/pdb/processed/seqs_from_pdb.fasta")

# Refresh UniRef90 when missing or older than the default 30 days.
if old_or_inexistent("/data/uniprot/uniref/uniref90/uniref90.fasta"):
    mkdir("/data/uniprot/uniref/uniref90")
    download_file(
        "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz",
        "/data/uniprot/uniref/uniref90/uniref90.fasta.gz",
        ovewrite=True)
def load_structure(self):
    """Download PDB entry 2PZI into /tmp for later use."""
    # Destination is a file, not a directory (the old local name
    # 'tmp_dir' was misleading).
    pdb_file = "/tmp/2PZI.pdb"
    download_file("https://files.rcsb.org/view/2PZI.pdb", target=pdb_file, ovewrite=True)