import os
from collections import defaultdict

# Struct, Modeller, ChainSplitter and the REFINEMENT / MODELS_TO_GENERATE /
# ASSESMENTS defaults come from the enclosing SNDG modules.


def model_hsps(seq_id, work_dir, hsps, refinement=REFINEMENT,
               models_to_generate=MODELS_TO_GENERATE, assessments=ASSESMENTS,
               entries={}, pdb_divided="/data/databases/pdb/divided/",
               tmp_dir=None, max_models=3):
    result = {"models": defaultdict(dict)}

    # Build one alignment record (query/hit names, sequences and coordinates)
    # per HSP.
    alns = []
    for hsp in hsps:
        aln = Struct(aln_query=Struct(name=seq_id, seq=str(hsp.aln[0].seq),
                                      start=hsp.query_start, end=hsp.query_end),
                     aln_hit=Struct(name=hsp.hit.id, seq=str(hsp.aln[1].seq),
                                    start=hsp.hit_start, end=hsp.hit_end))
        alns.append(aln)

    modeler = Modeller(work_dir, tmp_dir)
    modeler._refinement = refinement
    modeler.model_count = models_to_generate
    modeler._assess_methods = assessments
    modeler.parallel_jobs = 1

    def pdb_fn(x):
        return x.aln_hit.name.split("_")[0]

    # Templates with a value in "entries" are modeled first (ascending order);
    # PDBs missing from "entries" get a default weight of 20.
    alns = sorted(alns, key=lambda x: entries.get(pdb_fn(x), 20))
    result["alns"] = alns

    for alignment in alns[0:max_models]:
        pdb, chain, _, _ = alignment.aln_hit.name.split("_")
        model_name = seq_id + "_" + alignment.aln_hit.name
        if not os.path.exists(modeler.pdb_path(model_name, seq_id)):
            # Extract the template chain from the divided PDB mirror and model it.
            base_model_path = pdb_divided + pdb[1:3] + "/pdb" + pdb + ".ent"
            ChainSplitter(tmp_dir).make_pdb(base_model_path, pdb, chain,
                                            overwrite=True)
            models = modeler.create_model(model_name, alignment)
        else:
            # Models already exist: just collect their paths.
            models = [modeler.pdb_path(model_name, seq_id, idx)
                      for idx in range(1, models_to_generate + 1)]
        result["models"][alignment.aln_hit.name] = models
    return result
def run_dssp(self):
    out = tempfile.mkstemp(suffix=".dssp")[1]
    execute("dssp -i {pdb_path} -o {out}", pdb_path=self.pdb_path, out=out)
    with open(out) as h:
        start = False
        for l in h:
            if start:
                # Fixed-width DSSP columns: residue number, amino acid,
                # secondary structure, beta-bridge labels and partners.
                res = int(l[5:10])
                aa = l[10:14].strip()
                ss = l[14:17].strip()
                bbl1 = l[23:24]
                bbl2 = l[24:25]
                bp1 = int(l[25:29])
                bp2 = int(l[29:33])
                bslabel = l[33:34]
                self.dssp.append(
                    Struct(res=res, aa=aa, ss=ss, bp1=bp1, bp2=bp2,
                           bbl1=bbl1, bbl2=bbl2, bslabel=bslabel))
            elif l.startswith("  #  RESIDUE AA"):
                # Data rows start right after the column-header line.
                start = True
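# Hedged usage sketch (illustrative, not part of the original code): run_dssp is
# a method of a structure wrapper that is assumed to expose a "pdb_path"
# attribute and an initially empty "dssp" list. After the call, each entry is a
# Struct with res / aa / ss / bp1 / bp2 / bbl1 / bbl2 / bslabel fields.
def _example_count_helices(structure_wrapper):
    structure_wrapper.run_dssp()
    # "H" marks alpha-helical residues in the DSSP secondary-structure column.
    return sum(1 for rec in structure_wrapper.dssp if rec.ss.startswith("H"))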
def residues_near_drug(drug_centroid, aa_residues):
    residues_near = []
    for r in aa_residues:
        for a in list(r):
            # Bio.PDB atoms implement "-" as the euclidean distance between the
            # two coordinates, so a lightweight Struct with a "coord" attribute
            # can stand in for the drug centroid.
            dist = a - Struct(coord=drug_centroid)
            if dist > 20:
                # The residue is far from the centroid: skip its remaining atoms.
                break
            if dist < 10:
                residues_near.append(r)
                break
    return residues_near
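# Hedged usage sketch (illustrative structure path and ligand name, not part of
# the original code): the drug centroid is the mean coordinate of the ligand
# atoms and the candidate residues are the standard amino acids of one chain.
def _example_residues_near_drug():
    import numpy as np
    from Bio.PDB import PDBParser

    chain = PDBParser(QUIET=True).get_structure("model", "/tmp/model.pdb")[0]["A"]
    drug_atoms = [a for r in chain if r.get_resname() == "LIG" for a in r]
    aa_residues = [r for r in chain if r.id[0] == " "]
    drug_centroid = np.mean([a.coord for a in drug_atoms], axis=0)
    return residues_near_drug(drug_centroid, aa_residues)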
def query(self):
    """hmmscan [-options] <hmmdb> <seqfile>"""
    self._format_database()
    self._check_input_file()
    # Run hmmscan only if the output file is missing or empty.
    if (not os.path.exists(self.output_file)) or (os.path.getsize(self.output_file) == 0):
        self._hmmscan()
    decorated = self.query_iterator()
    meta = self.query_iterator()._meta
    return Struct(_meta=meta, __iter__=lambda: decorated)
def smart_parse(path, seqs=None, gz_input=None):
    """
    :param path: sequence path
    :param seqs: dictionary of sequences. key=sequence name, value=Bio.Seq.Seq object
    :return: sequences iterator
    """
    raw_path = path.strip()
    if gz_input or raw_path.endswith(".gz"):
        # Strip the ".gz" suffix so the format is detected from the inner extension.
        path = path[:-3]
        handle = gzip.open(raw_path, "rt")
    else:
        path = raw_path
        handle = open(path, "r")
    try:
        it = None
        if path.endswith((".fasta", ".faa", ".fna")):
            it = bpio.parse(handle, "fasta")
        elif path.endswith((".gb", ".gbf", ".gbk", ".genebank", ".gbff")):
            it = bpio.parse(handle, "gb")
        elif path.endswith(".embl"):
            it = bpio.parse(handle, "embl")
        elif path.endswith((".gff", ".gff3")):
            from BCBio import GFF
            it = GFF.parse(handle)
        elif path.endswith((".fq", ".fastq")):
            it = bpio.parse(handle, "fastq")
        elif path.endswith(".hmm"):
            it = bpsio.parse(handle, "hmmer3-text")
        elif path.endswith(".xml"):
            # Peek at the second line to tell BLAST XML from UniProt XML.
            with open(path) as h:
                h.readline()
                l = h.readline()
            if "BlastOutput" in l:
                it = add_blast_xml_props(
                    search_iterator(bpsio.parse(handle, "blast-xml")))
            elif "<uniprot" in l:
                it = bpio.parse(handle, "uniprot-xml")
    except:
        handle.close()
    if it:
        if seqs:
            def witer():
                # Override the parsed sequences with the ones supplied in "seqs".
                for x in it:
                    if x.id in seqs:
                        x.seq = seqs[x.id]
                    yield x
            return Struct(__iter__=witer)
        else:
            return it
    raise Exception("invalid format")
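# Hedged usage sketch (illustrative file name, not part of the original code):
# smart_parse picks the Biopython parser from the extension and opens ".gz"
# files transparently.
def _example_smart_parse():
    for record in smart_parse("/tmp/proteins.faa.gz"):
        print(record.id, len(record.seq))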
        os.makedirs(self.model_directory(model_id, query_id))


if __name__ == '__main__':
    from SNDG import init_log, Struct
    init_log()
    workdir = "/media/eze/Data/data/organismos/Pext14-3B/analysis/struct/good"
    modeler = Modeller(workdir, "/tmp")
    model_id = "PE143B_RS25640_3u52_B_6_498"
    alignment = Struct(
        aln_query=Struct(
            name="PE143B_RS25640",
            seq="KKLNAKDKYRLLTRDLAWEPSYRTEEEIFPYIAYEGLKIHDWNKWEDPFRLTMDAYWKYQAEKERKFYAIIDAHAQNNGHLNITDARYLSALKIFLQAISPGEYAAHKGFARAGREFRGVGTQVACQMQAIDELRHAQTQIHALSNYNKFYNGFHAFADQRDRIWYTSVARSFFDDAMSAGPFEFMIAIGFSFEYVLTNLLFVPFMSGAAYNGDMATVTFGFSAQSDEARHMTLGLECIKFMLEQDPANLPIVQGWIDKWFWRGFRVLGLVSTMMDYMLPKRVMSWREAWEIYGAENGGALFKDLARYGIRPPKSWDDAEASIDHMSHQFMLGLYQWSFGTAFHAWIPSDDDMQWLSAKYPTTFDKYYRPRWEHIKKMEAAGTPFKNYGLAKLCQCCQLPTVFTEPDDPTLICHRQVQYKGDKYHFCSDHCMGIFNNEPEKYIQAWLPMPALFQAPTN-GDLGAWMD-WVSLKDGQDNGDFADSQDRRN",
            start=7,
            end=494),  # .ungap("-")
        aln_hit=Struct(
            name="3u52_B_6_498",
            seq="KKLNLKDKYQYLTRDMAWEPTYQDKKDIFPEEDFEGIKITDWSQWEDPFRLTMDAYWKYQAEKEKKLYAIFDAFAQNNGHQNISDARYVNALKLFISGISPLEHAAFQGYSKVGRQFSGAGARVACQMQAIDELRHSQTQQHAMSHYNKHFNGLHDGPHMHDRVWYLSVPKSFFDDARSAGPFEFLTAISFSFEYVLTNLLFVPFMSGAAYNGDMATVTFGFSAQSDEARHMTLGLEVIKFILEQHEDNVPIVQRWIDKWFWRGFRLLSLVSMMMDYMLPNKVMSWSEAWEVYYEQNGGALFKDLERYGIRPPKYQDVANDAKHHLSHQLWTTFYQYCQATNFHTWIPEKEEMDWMSEKYPDTFDKYYRPRYEYLAKEAAAGRRFYNNTLPQLCQVCQIPTIFTEKDAPTMLSHRQIEHEGERYHFCSDGCCDIFKHEPEKYIQAWLPVHQIYQGNCEGGDLETVVQKYYHINIGEDNFDYVGSPDQKH",
            start=0,
            end=489))
    ChainSplitter("/tmp/").make_pdb(
        pdb_path="/data/databases/pdb/divided/u5/pdb3u52.ent",
        pdb_id="3u52",
        chain="B",
        overwrite=True)
    models = modeler.create_model(model_id, alignment)
def model_hsps(seq_id, work_dir, hsps, refinement=REFINEMENT,
               models_to_generate=MODELS_TO_GENERATE, assessments=ASSESMENTS,
               entries={}, tmp_dir=None, max_models=3):
    result = {"models": defaultdict(dict), "errors": []}
    alns = []
    for hsp in hsps:
        aln = Struct(aln_query=Struct(name=seq_id, seq=str(hsp.aln[0].seq),
                                      start=hsp.query_start, end=hsp.query_end),
                     aln_hit=Struct(name=hsp.hit.id, seq=str(hsp.aln[1].seq),
                                    start=hsp.hit_start, end=hsp.hit_end))
        alns.append(aln)

    modeler = Modeller(work_dir, tmp_dir)
    modeler._refinement = refinement
    modeler.model_count = models_to_generate
    modeler._assess_methods = assessments
    modeler.parallel_jobs = 1

    def pdb_fn(x):
        return x.aln_hit.name.split("_")[0]

    # Templates with a value in "entries" are modeled first (ascending order);
    # PDBs missing from "entries" get a default weight of 20.
    alns = sorted(alns, key=lambda x: entries.get(pdb_fn(x), 20))
    result["alns"] = alns

    for alignment in alns[0:max_models]:
        if ";" in alignment.aln_hit.name:
            # Composite hit names: the PDB code and chain are the last five
            # characters of the name.
            pdb = alignment.aln_hit.name[-5:-1]
            chain = alignment.aln_hit.name[-1]
            modeller_pdb_id = f'{seq_id}_{pdb}_{chain}'
            alignment.aln_hit.name = f'{pdb}_{chain}'
        else:
            pdb, chain, _, _ = alignment.aln_hit.name.split("_")
            modeller_pdb_id = f'{seq_id}_{pdb}_{chain}'
        if not os.path.exists(modeler.pdb_path(modeller_pdb_id, seq_id)):
            try:
                models = modeler.create_model(modeller_pdb_id, alignment)
            except (SequenceMismatchError, ModellerError) as ex:
                result["errors"].append(f"{modeller_pdb_id} {str(ex)}\n")
                continue
        else:
            # Models already exist: just collect their paths.
            models = [modeler.pdb_path(modeller_pdb_id, seq_id, idx)
                      for idx in range(1, models_to_generate + 1)]
        result["models"][alignment.aln_hit.name] = models
    result["models"] = dict(result["models"])
    return result
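# Hedged usage sketch (illustrative paths and "entries" mapping, not part of the
# original code): the HSPs come from a BLAST XML report parsed with
# Bio.SearchIO, whose HSP objects expose aln, query_start/end, hit_start/end
# and hit.id as used by model_hsps above.
def _example_model_hsps():
    from Bio import SearchIO

    query_result = next(SearchIO.parse("/tmp/seq_vs_pdb.xml", "blast-xml"))
    hsps = [hsp for hit in query_result for hsp in hit.hsps]
    out = model_hsps(query_result.id, "/tmp/models", hsps,
                     entries={"3u52": 1.9}, tmp_dir="/tmp", max_models=1)
    return out["models"], out["errors"]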