def __make_seq_file(seq_record, file_name=".seq.fa"):

    # Remove seq file if exists...
    if os.path.exists(file_name):
        os.remove(file_name)

    # Write
    Jglobals.write(file_name, seq_record.format("fasta"))
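
# A minimal sketch (not used by the pipeline) of a variant that avoids the
# fixed hidden filename above, assuming only the standard library and
# Biopython; a unique temporary file is safer if two processes ever share
# the same working directory.
def _make_temp_seq_file(seq_record):
    import tempfile
    from Bio import SeqIO
    with tempfile.NamedTemporaryFile(mode="w", suffix=".fa",
                                     delete=False) as handle:
        # Write the record in FASTA format and hand back the file's path
        SeqIO.write(seq_record, handle, "fasta")
        return handle.name
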
def __get_Pfam_alignments(taxon, out_dir=out_dir):

    # Skip if Pfam JSON file already exists
    pfam_json_file = os.path.join(out_dir, taxon + pfam_file_ext)
    if not os.path.exists(pfam_json_file):

        # Change dir
        os.chdir(out_dir)

        # Initialize
        pfams = {}
        seq_file = ".seq.fasta"
        hmm_db = os.path.join("pfam", "All.hmm")
        uniprot_json_file = taxon + uniprot_file_ext

        # Load JSON file
        with open(uniprot_json_file) as f:
            uniaccs = json.load(f)

        # For each uniacc...
        for u in uniaccs:

            # Initialize
            pfams.setdefault(u, [])

            # Make seq file with the full-length sequence
            seq = Seq(uniaccs[u][1], IUPAC.protein)
            record = SeqRecord(seq, id=u, name=u, description=u)
            __make_seq_file(record, seq_file)

            # For each DBD...
            for pfam_id_std, start, end, evalue in hmmscan(
                    seq_file, hmm_db, non_overlapping_domains=True):

                # Initialize
                hmm_file = os.path.join("pfam", "%s.hmm" % pfam_id_std)

                # Make seq file (reuses the same temporary file, now holding
                # only the domain region)
                sub_seq = seq[start:end]
                record = SeqRecord(sub_seq, id=u, name=u, description=u)
                __make_seq_file(record, seq_file)

                # Add DBDs
                alignment = hmmalign(seq_file, hmm_file)
                pfams[u].append(
                    (pfam_id_std, alignment, start + 1, end, evalue))

        # Write
        Jglobals.write(pfam_json_file,
                       json.dumps(pfams, sort_keys=True, indent=4))

        # Remove seq file
        if os.path.exists(seq_file):
            os.remove(seq_file)

        # Change dir
        os.chdir(cwd)
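
# A minimal sketch (not part of the pipeline; "vertebrates" is an
# illustrative taxon) showing the shape of the JSON written above: each
# UniProt accession maps to a list of
# [Pfam ID, alignment, start (1-based), end, e-value] entries.
def _print_DBD_annotations(taxon="vertebrates", out_dir=out_dir):
    pfam_json_file = os.path.join(out_dir, taxon + pfam_file_ext)
    with open(pfam_json_file) as f:
        pfams = json.load(f)
    for uniacc, dbds in sorted(pfams.items()):
        for pfam_id_std, _alignment, start, end, evalue in dbds:
            print("%s\t%s\t%d-%d\t%s" % (uniacc, pfam_id_std, start, end,
                                         evalue))
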
def __format_BLAST_database(taxon, out_dir=out_dir):

    # Skip if taxon FASTA file already exists
    fasta_file = os.path.join(out_dir, "%s.fa" % taxon)
    if not os.path.exists(fasta_file):

        # Load JSON file (this function does not chdir to out_dir,
        # so build the full path)
        uniprot_json_file = os.path.join(out_dir, taxon + uniprot_file_ext)
        with open(uniprot_json_file) as f:
            uniaccs = json.load(f)

        # For each UniProt Accession...
        for uniacc in sorted(uniaccs):
            seq = uniaccs[uniacc][1]
            Jglobals.write(fasta_file, ">%s\n%s" % (uniacc, seq))

        # Make BLAST+ database
        cmd = "makeblastdb -in %s -dbtype prot" % fasta_file
        subprocess.run([cmd], shell=True, stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
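
# A minimal sketch (not used above) of querying the formatted database with
# blastp, which ships in the same BLAST+ suite as makeblastdb; query_fasta
# is an illustrative input, and -outfmt 6 requests tabular output.
def _blastp_against_taxon(query_fasta, taxon, out_dir=out_dir):
    fasta_file = os.path.join(out_dir, "%s.fa" % taxon)
    cmd = "blastp -query %s -db %s -outfmt 6" % (query_fasta, fasta_file)
    process = subprocess.run([cmd], shell=True, stdout=subprocess.PIPE,
                             stderr=subprocess.DEVNULL)
    # One tab-separated hit per line
    return process.stdout.decode("utf-8")
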
def infer_profiles(fasta_file, dummy_dir="/tmp/", files_dir=files_dir,
                   output_file=None, threads=1, latest=False, n=5,
                   taxons=Jglobals.taxons):

    # Initialize
    base_name = os.path.basename(__file__)
    pid = os.getpid()

    # Load data
    cisbp = __load_CisBP_models(files_dir)
    jaspar = __load_JASPAR_files_n_models(files_dir, taxons)

    # Create dummy dir
    dummy_dir = os.path.join(dummy_dir, "%s.%s" % (base_name, pid))
    dummy_file = os.path.join(dummy_dir, "inferred_profiles.tsv")
    if not os.path.exists(dummy_dir):
        os.makedirs(dummy_dir)

    # Get sequences as SeqRecords
    # Note: https://biopython.org/wiki/SeqRecord
    seq_records = []
    for seq_record in Jglobals.parse_fasta_file(fasta_file):
        seq_records.append(seq_record)

    # Write header
    columns = ["Query", "TF Name", "TF Matrix", "E-value", "Query Start-End",
               "TF Start-End", "DBD %ID"]
    Jglobals.write(dummy_file, "\t".join(columns))

    # Infer SeqRecord profiles
    kwargs = {"total": len(seq_records), "bar_format": bar_format}
    pool = Pool(min(threads, len(seq_records)))
    p = partial(infer_SeqRecord_profiles, cisbp=cisbp, dummy_dir=dummy_dir,
                files_dir=files_dir, jaspar=jaspar, latest=latest, n=n,
                taxons=taxons)
    for inferences in tqdm(pool.imap(p, seq_records), **kwargs):
        for inference in inferences:
            Jglobals.write(dummy_file, "\t".join(map(str, inference)))
    pool.close()
    pool.join()

    # Write
    if output_file:
        shutil.copy(dummy_file, output_file)
    else:
        with open(dummy_file) as f:
            # For each line...
            for line in f:
                Jglobals.write(None, line.strip("\n"))

    # Remove dummy dir
    shutil.rmtree(dummy_dir)
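
# Usage sketch (assumptions: "sequences.fa" exists and the supporting files
# under files_dir were downloaded beforehand); with no output_file, the
# inferences are echoed line by line through Jglobals.write(None, ...):
#
#     infer_profiles("sequences.fa", output_file="inferred_profiles.tsv",
#                    threads=4)
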
def __get_profile_info(taxon, out_dir=out_dir):

    # Skip if taxon profiles JSON file already exists
    profiles_json_file = os.path.join(out_dir, taxon + profiles_file_ext)
    if not os.path.exists(profiles_json_file):

        # Initialize
        profiles = {}
        url = os.path.join(jaspar_url, "api", "v1", "taxon", taxon)
        response = client.get(url)
        json_obj = json.loads(codec.encode(response))

        # For each page of results...
        while True:

            # For each profile...
            for profile in json_obj["results"]:

                # Add profiles from the CORE collection...
                if profile["collection"] == "CORE":
                    profiles.setdefault(profile["matrix_id"], profile["name"])

            # Exit after the last page
            if json_obj["next"] is None:
                break

            # Go to next page
            response = client.get(json_obj["next"])
            json_obj = json.loads(codec.encode(response))

        # Write
        Jglobals.write(profiles_json_file,
                       json.dumps(profiles, sort_keys=True, indent=4))
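
# A minimal sketch of the same pagination pattern written with the requests
# library instead of coreapi (an assumption: requests is installed); it
# relies only on what the loop above already does, i.e. the JASPAR REST API
# reporting the next page's URL under "next".
def _fetch_CORE_profiles(taxon):
    import requests
    profiles = {}
    url = os.path.join(jaspar_url, "api", "v1", "taxon", taxon)
    while url is not None:
        json_obj = requests.get(url, params={"format": "json"}).json()
        for profile in json_obj["results"]:
            if profile["collection"] == "CORE":
                profiles.setdefault(profile["matrix_id"], profile["name"])
        url = json_obj["next"]
    return profiles
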
def __download_UniProt_sequences(taxon, out_dir=out_dir):

    # Initialize
    faulty_profiles = {
        "MA1826.1": ["B4FU91"],
    }
    faulty_sequences = {
        "B9GPL8": [
            "MEEVGAQVAAPIFIHEALSSRYCDMTSMAKKHDLSYQSPNSQLQQHQFLQASREKNWNSK",
            "AWDWDSVDDDGLGLNLGGSLTSVEEPVSRPNKRVRSGSPGNGSYPMCQVDNCKEDLSKAK",
            "DYHRRHKVCQVHSKATKALVGKQMQRFCQQCSRFHPLTEFDEGKRSCRRRLAGHNRRRRK",
            "TQPEDVTSRLLLPGNPDMNNNGNLDIVNLLTALARSQGKTYLPMIDFYVPPFVLTNCPTV",
            "PDKDQLIQILNKINSLPLPMDLAAKLSNIASLNVKNPNQPYLGHQNRLNGTASSPSTNDL",
            "LAVLSTTLAASAPDALAILSQRSSQSSDNDKSKLPGPNQVTVPHLQKRSNVEFPAVGVER",
            "ISRCYESPAEDSDYQIQESRPNLPLQLFSSSPENESRQKPASSGKYFSSDSSNPIEERSP",
            "SSSPPVVQKLFPLQSTAETMKSEKMSVSREVNANVEGDRSHGCVLPLELFRGPNREPDHS",
            "SFQSFPYRGGYTSSSGSDHSPSSQNSDPQDRTGRIIFKLFDKDPSHFPGTLRTKIYNWLS",
            "NSPSEMESYIRPGCVVLSVYLSMPSASWEQLERNLLQLVDSLVQDSDSDLWRSGRFLLNT",
            "GRQLASHKDGKVRLCKSWRTWSSPELILVSPVAVIGGQETSLQLKGRNLTGPGTKIHCTY",
            "MGGYTSKEVTDSSSPGSMYDEINVGGFKIHGPSPSILGRCFIEVENGFKGNSFPVIIADA",
            "SICKELRLLESEFDENAVVSNIVSEEQTRDLGRPRSREEVMHFLNELGWLFQRKSMPSMH",
            "EAPDYSLNRFKFLLIFSVERDYCVLVKTILDMLVERNTCRDELSKEHLEMLYEIQLLNRS",
            "VKRRCRKMADLLIHYSIIGGDNSSRTYIFPPNVGGPGGITPLHLAACASGSDGLVDALTN",
            "DPHEIGLSCWNSVLDANGLSPYAYAVMTKNHSYNLLVARKLADKRNGQISVAIGNEIEQA",
            "ALEQEHVTISQFQRERKSCAKCASVAAKMHGRFLGSQGLLQRPYVHSMLAIAAVCVCVCL",
            "FFRGAPDIGLVAPFKWENLNYGTI"
        ]
    }

    # Change dir
    os.chdir(out_dir)

    # Skip if pickle file already exists
    pickle_file = ".%s.uniaccs.pickle" % taxon
    if not os.path.exists(pickle_file):

        # Initialize
        uniaccs = {}

        # Load JSON file
        profiles_json_file = taxon + profiles_file_ext
        with open(profiles_json_file) as f:
            profiles = json.load(f)

        # For each profile...
        for profile in sorted(profiles):

            # Get profile detailed info
            url = os.path.join(jaspar_url, "api", "v1", "matrix", profile)
            response = client.get(url)
            json_obj = json.loads(codec.encode(response))

            # Fix faulty profiles
            if json_obj["matrix_id"] in faulty_profiles:
                json_obj["uniprot_ids"] = faulty_profiles[
                    json_obj["matrix_id"]]

            # For each UniProt Accession...
            for uniacc in json_obj["uniprot_ids"]:

                # Skip empty accessions
                if uniacc == "":
                    continue

                # Initialize
                uniacc = uniacc.strip()
                uniaccs.setdefault(uniacc, [[], None])

                # Add uniacc
                if profile not in uniaccs[uniacc][0]:
                    uniaccs[uniacc][0].append(profile)

        # Write pickle file
        with open(pickle_file, "wb") as f:
            pickle.dump(uniaccs, f)

    # Skip if taxon uniprot JSON file already exists
    uniprot_json_file = taxon + uniprot_file_ext
    if not os.path.exists(uniprot_json_file):

        # Load pickle file
        with open(pickle_file, "rb") as f:
            uniaccs = pickle.load(f)

        # For each UniProt Accession...
        for uniacc in uniaccs:

            # Fix faulty sequences
            if uniacc in faulty_sequences:
                uniaccs[uniacc][1] = "".join(faulty_sequences[uniacc])
                continue

            # Get UniProt sequence
            u = uniprot.queryUniprot(uniacc)
            uniaccs[uniacc][1] = "".join(u["sequence 0"].split("\n"))

        # Write
        Jglobals.write(uniprot_json_file,
                       json.dumps(uniaccs, sort_keys=True, indent=4))

    # Change dir
    os.chdir(cwd)
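
# A minimal sketch of fetching a sequence straight from the UniProt REST
# API as an alternative to the queryUniprot call used above (assumptions:
# requests is installed and uniacc is a valid accession).
def _fetch_UniProt_sequence(uniacc):
    import requests
    url = "https://rest.uniprot.org/uniprotkb/%s.fasta" % uniacc
    response = requests.get(url)
    response.raise_for_status()
    # Drop the FASTA header line and join the sequence lines
    return "".join(response.text.split("\n")[1:])
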
def __download_Pfam_DBD_HMMs(out_dir=out_dir):

    # Skip if Pfam file already exists
    json_file = os.path.join(out_dir, "pfam.json")
    if not os.path.exists(json_file):

        # Initialize
        pfams = {}
        pfam_ids = set()
        url = "http://cisbp.ccbr.utoronto.ca/data/2.00/" + \
              "DataFiles/Bulk_downloads/EntireDataset/"
        cisbp_file = "TF_Information_all_motifs.txt.zip"

        # Create Pfam dir
        pfam_dir = os.path.join(out_dir, "pfam")
        if not os.path.isdir(pfam_dir):
            os.makedirs(pfam_dir)

        # Change dir
        os.chdir(pfam_dir)

        # Skip if Cis-BP file already exists
        if not os.path.exists(cisbp_file):
            urlretrieve(os.path.join(url, cisbp_file), cisbp_file)

        # Get DBD/cut-off pairs
        cmd = "unzip -p %s | cut -f 11 | sort | uniq | grep -v DBDs" % \
            cisbp_file
        process = subprocess.run([cmd], shell=True, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

        # For each line...
        for line in process.stdout.decode("utf-8").split("\n"):

            # For each Pfam ID...
            for pfam_id in line.split(","):

                # Skip if not a Pfam ID
                if pfam_id == "UNKNOWN" or pfam_id == "":
                    continue

                # Add Pfam ID
                pfam_ids.add(pfam_id)

        # For each Pfam ID...
        for pfam_id in pfam_ids:

            # Fetch MSA from Pfam (retry up to 5 times)
            for _ in range(5):
                try:
                    msa_file = pfam.fetchPfamMSA(pfam_id, alignment="seed")
                    break
                except Exception:
                    # i.e. try again in 5 seconds
                    time.sleep(5)
            else:
                # All attempts failed; skip this Pfam ID
                continue

            # For each line...
            for line in Jglobals.parse_file(msa_file):
                m = re.search(r"^#=GF\sID\s+(\S+)$", line)
                if m:
                    pfam_id_std = m.group(1)
                m = re.search(r"^#=GF\sAC\s+(PF\d{5})\.\d+$", line)
                if m:
                    pfam_ac = m.group(1)
                    break

            # HMM build
            hmm_file = "%s.hmm" % pfam_id_std
            cmd = "hmmbuild %s %s" % (hmm_file, msa_file)
            subprocess.run([cmd], shell=True, stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL)

            # HMM press
            cmd = "hmmpress -f %s" % hmm_file
            subprocess.run([cmd], shell=True, stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL)

            # Add Pfam
            pfams.setdefault(pfam_ac, pfam_id_std)

            # Remove MSA file
            os.remove(msa_file)

        # Skip if HMM database of all DBDs already exists
        hmm_db = "All.hmm"
        if not os.path.exists(hmm_db):

            # For each HMM file...
            for hmm_file in os.listdir("."):

                # Skip if not an HMM file
                if not hmm_file.endswith(".hmm"):
                    continue

                # Add HMM to database
                for line in Jglobals.parse_file(hmm_file):
                    Jglobals.write(hmm_db, line)

            # HMM press
            cmd = "hmmpress -f %s" % hmm_db
            subprocess.run([cmd], shell=True, stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL)

        # Remove Cis-BP file
        if os.path.exists(cisbp_file):
            os.remove(cisbp_file)

        # Write
        Jglobals.write(json_file, json.dumps(pfams, sort_keys=True, indent=4))

        # Change dir
        os.chdir(cwd)
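
# A minimal sketch (not used above) that checks a pressed HMM database is
# complete: hmmpress writes four auxiliary files (.h3m, .h3i, .h3f, .h3p)
# next to the input file, so all four must exist for hmmscan to use it.
def _hmm_db_is_pressed(hmm_db):
    return all(os.path.exists(hmm_db + ext)
               for ext in (".h3m", ".h3i", ".h3f", ".h3p"))
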