def computeFromSeqStructMapper(self, seqStructMap, prefixExtended, pssmOutNameRaw):
    '''
    Computes spyder2 for the sequence that seqStructMap holds for the chain identified by prefixExtended.
    spider2 is only launched if the psiblast pssm file (pssmOutNameRaw or pssmOutNameRaw + ".gz") exists;
    otherwise self.processSpider2 is called with None as raw results name.

    :param seqStructMap: computeFeatures.seqStep.seqToolManagers.seqExtraction.SeqStructMapper
    :param prefixExtended: str. unambiguous id of the sequence that will be the prefix of output names
    :param pssmOutNameRaw: str or None. Path to psiblast pssms results
    :return: 0 if results were already computed, None otherwise
    '''
    # Look for the pssm (plain or gzipped). The file returned by self.uncompressFile is treated as
    # temporary: it is removed in the finally clause.
    uncompressFileName = None
    if pssmOutNameRaw is not None:
        if not os.path.isfile(pssmOutNameRaw):
            pssmOutNameRaw = pssmOutNameRaw + ".gz"
        if os.path.isfile(pssmOutNameRaw):
            uncompressFileName = self.uncompressFile(pssmOutNameRaw, self.tmp)
    try:
        __, chainType, chainId = self.splitExtendedPrefix(prefixExtended)[:3]
        seqStr, __ = seqStructMap.getSeq(chainType, chainId)
        # repeat as psiBlastManager can modify seqs
        seqStructMap.setCurrentSeq(seqStr, chainType, chainId)
        if self.checkAlreayComputed(prefixExtended):
            print("spyder2 already computed for %s" % prefixExtended)
            return 0
        fNames = self.getFNames(prefixExtended)
        spider2ProcName = fNames[0]
        spider2RawName = os.path.join(self.spider2OutPath, prefixExtended + ".spd3")
        print("launching spyder2 over %s" % prefixExtended)

        # The script is run from self.spider2OutPath. The previous working directory is always
        # restored: it used to be left changed when there was no pssm or when the launch failed.
        curWd = os.getcwd()
        os.chdir(self.spider2OutPath)
        try:
            if uncompressFileName is not None:
                cmd = ["python", self.spider2PyScript, uncompressFileName]
                process = Popen(cmd, stdout=PIPE, stderr=PIPE)
                processOut = process.communicate()
                if len(processOut[1]) > 0:
                    # output on stderr is reported but not treated as fatal
                    print("Error computing spider2. Caught stdin/stderr:\n", processOut[0], processOut[1])
            else:
                spider2RawName = None
        finally:
            os.chdir(curWd)

        dataList = self.processSpider2(seqStr, seqStructMap, prefixExtended, spider2RawName, spider2ProcName)
        if self.winSize:
            self.makeWindowed(dataList, ["asa", "P_C", "P_E", "P_H"], Spider2Manager.BAD_SCORE_PREDS,
                              [None] * 4, fNames[1])
    except (Exception, KeyboardInterrupt):
        # do not leave partial results behind
        self.tryToRemoveAllFnames(prefixExtended)
        raise
    finally:
        if uncompressFileName is not None:
            tryToRemove(uncompressFileName)
def processPSAIA(self, prefixAndChainTypeId):
    '''
    Parses raw output from PSAIA and creates results in tab format.

    Only one raw table is parsed: the first file of self.outPathRaw (in sorted name order) that ends with
    ".tbl" and starts with prefixAndChainTypeId. One output file per chain id found in the table is written
    to self.outPathProc.

    @param prefixAndChainTypeId: str. fname prefix of raw psaia results (not taking into account the full
                                 path but the name itself)
    '''
    # NOTE(review): header fields are separated by blanks whereas data fields are separated by tabs.
    header = (
        "chainId structResId resName total_ASA b-bone_ASA s-chain_ASA polar_ASA n-polar_ASA total_RASA " +
        "b-bone_RASA s-chain_RASA polar_RASA n-polar_RASA average_DPX s_avg_DPX s-ch_avg_DPX s-ch_s_avg_DPX "
        "max_DPX min_DPX average_CX s_avg_CX s-ch_avg_CX s-ch_s_avg_CX max_CX min_CX Hydrophobicity\n"
    )
    # Sorting makes the choice reproducible when several raw tables share the prefix.
    rawNames = sorted(fname for fname in os.listdir(self.outPathRaw)
                      if fname.endswith(".tbl") and fname.startswith(prefixAndChainTypeId))
    rowsByChain = {}
    if rawNames:
        with open(os.path.join(self.outPathRaw, rawNames[0])) as rawFile:
            for __ in range(8):  # the first 8 lines of the table are not data rows
                rawFile.readline()
            for line in rawFile:
                fields = line.split()
                if not fields:
                    continue  # blank lines carry no data
                # kept columns: chain id, residue id, one letter residue name and all the features
                row = [fields[0], fields[6], self.threeLetterAA_to_one(fields[7])] + fields[8:]
                rowsByChain.setdefault(fields[0], []).append("\t".join(row))

    nameParts = prefixAndChainTypeId.split("_")
    if len(nameParts) == 3:
        # for names made of 3 fields, the chain id is inserted after the second field
        baseName = nameParts[0] + "_" + nameParts[1]
    else:
        baseName = prefixAndChainTypeId
    outNames = []
    try:
        for chainId in rowsByChain:
            outName = os.path.join(self.outPathProc, baseName + "_" + chainId + "_u.psaia.tab")
            outNames.append(outName)
            with open(outName, "w") as outFile:
                outFile.write(header)
                outFile.write("\n".join(rowsByChain[chainId]))
    except (KeyboardInterrupt, Exception):
        # remove every file written so far, they may be incomplete
        for outName in outNames:
            print("Exception happend computing %s" % outName)
            tryToRemove(outName)
        raise
def saveProcResults(self, seqL, seqR, corrMutOutName, iterOfCorrelatedRows, chainIdL, chainIdR, nAlig):
    '''
    Writes corrMut scores with tabulated format, headers and some error checking. Ligand-receptor
    residue pairs that are not contained in iterOfCorrelatedRows are written with score 0.0.

    @param: seqL: str. Sequence of the ligand chain
    @param: seqR: str. Sequence of the receptor chain
    @param corrMutOutName: str. Fname where formated results will be saved.
    @param iterOfCorrelatedRows: iterator of elements as [res_i, res_j, corrMuScore]. res_i and res_j are
                                 0 based positions over ligand seq + separator + receptor seq. If None,
                                 self.makeFakeFile is used instead
    @param chainIdL:str. The chain Id for the ligand
    @param chainIdR:str. The chain Id for the receptor
    @param nAlig: int. The number of rows of MSA
    @return 1 if iterOfCorrelatedRows is None, 0 otherwise
    '''
    totalLen = len(seqL) + len(seqR)
    # MSA depth relative to the joint sequence length. 0.0 if both sequences are empty
    # (the plain division raised ZeroDivisionError).
    corrMutQuality = float(nAlig) / totalLen if totalLen > 0 else 0.0
    # Identity test: "== None" is evaluated element-wise by array-like containers.
    if iterOfCorrelatedRows is None:
        self.makeFakeFile(seqL, seqR, corrMutOutName, corrMutQuality, chainIdL, chainIdR)
        return 1

    rowTemplate = "%s %s %s %s %s %s %f %f\n"

    def isUsable(structIndex):
        # residues without structural index, or labelled ones if labels are filtered out, are not written
        return structIndex is not None and not (self.filterOutLabels and structIndex[-1].isalpha())

    try:
        with open(corrMutOutName, "w") as outFile:
            self.writeHeader(outFile)
            lenSeqL = len(seqL)
            offsetR = lenSeqL + len(CorrMutGeneric.SEQUENCES_SEPARATOR)
            addedI_J = set()
            for i, j, score in iterOfCorrelatedRows:
                # keep inter-chain pairs only: i within seqL and j after seqL and the separator
                if i >= lenSeqL or j < offsetR:
                    continue
                j -= offsetR
                addedI_J.add((i, j))
                letterL = seqL[i]
                letterR = seqR[j]
                score = float(score)
                structIndexL = self.seqsManager.seqToStructIndex("l", chainIdL, i, asString=True)
                structIndexR = self.seqsManager.seqToStructIndex("r", chainIdR, j, asString=True)
                if not (isUsable(structIndexR) and isUsable(structIndexL)):
                    continue
                outFile.write(rowTemplate % (chainIdL, structIndexL, letterL, chainIdR, structIndexR,
                                             letterR, score, corrMutQuality))

            # Remaining pairs get score 0.0. Structural indices are looked up once per residue rather
            # than once per pair (assumes seqToStructIndex is a plain lookup without side effects).
            if len(seqL) > 0 and len(seqR) > 0:
                receptorResidues = []
                for j, letterR in enumerate(seqR):
                    structIndexR = self.seqsManager.seqToStructIndex("r", chainIdR, j, asString=True)
                    if isUsable(structIndexR):
                        receptorResidues.append((j, letterR, structIndexR))
                for i, letterL in enumerate(seqL):
                    structIndexL = self.seqsManager.seqToStructIndex("l", chainIdL, i, asString=True)
                    if not isUsable(structIndexL):
                        continue
                    for j, letterR, structIndexR in receptorResidues:
                        if (i, j) in addedI_J:
                            continue
                        outFile.write(rowTemplate % (chainIdL, structIndexL, letterL, chainIdR,
                                                     structIndexR, letterR, 0.0, corrMutQuality))
        return 0
    except (KeyboardInterrupt, Exception) as e:
        print(e)
        print("Exception happend computing %s" % corrMutOutName)
        tryToRemove(corrMutOutName)
        raise
def tryToRemoveAllFnames(self, prefixExtended):
    '''
    Deletes, among the fnames returned by self.getFNames(prefixExtended), those that exist as regular
    files (useful to clean up if some exception happens).

    :param prefixExtended: prefix for output fnames.
    '''
    candidateFnames = self.getFNames(prefixExtended)
    # lazy filter: each file is checked right before its removal
    existingFnames = (outFname for outFname in candidateFnames if os.path.isfile(outFname))
    for outFname in existingFnames:
        tryToRemove(outFname)
def fromDfToComplexCodif(self, prefix, pairsCodified, prefixesInvolvedInCoding, isSeqOnly=""):
    '''
    Builds the codified object of a complex from its already codified pairs and saves it to disk.

    If self.sampledOutPath is not None, a sampled version (self.samplingFold) is saved as
    prefix[#s<isSeqOnly>].train.pkl.gz. If self.wholeComplexOutPath is not None, the whole object is saved
    as prefix[#s<isSeqOnly>].predict.pkl.gz. The "#s<isSeqOnly>" tag is added only if isSeqOnly is not ""
    and the part of prefix before "@" does not already end with "#sl" or "#sr".

    @param prefix: str. Identifier of the complex
    @param pairsCodified: the codified pairs (presumably a data frame, as the method name suggests;
                          TODO confirm)
    @param prefixesInvolvedInCoding: forwarded as is to the constructor of the codified object
    @param isSeqOnly: str. "" to build a ComplexCodified; otherwise a ComplexSeqStructCodified is built,
                      and pairsCodified is given as ligand argument if "l" or as receptor argument if "r"
    @return wholeComplexObject: the (not sampled) codified object
    '''
    if isSeqOnly != "":
        ligandPairs = pairsCodified if isSeqOnly == "l" else None
        receptorPairs = pairsCodified if isSeqOnly == "r" else None
        wholeComplexObject = ComplexSeqStructCodified(prefix, ligandPairs, receptorPairs,
                                                      prefixesInvolvedInCoding)
    else:
        wholeComplexObject = ComplexCodified(prefix, pairsCodified, prefixesInvolvedInCoding)

    def buildOutName(outDir, taggedTemplate, plainExtension):
        # the seq-only tag is skipped if prefix already carries it
        alreadyTagged = prefix.split("@")[0][-3:] in ["#sl", "#sr"]
        if isSeqOnly != "" and not alreadyTagged:
            return os.path.join(outDir, prefix + taggedTemplate % isSeqOnly)
        return os.path.join(outDir, prefix + plainExtension)

    def dumpOrCleanUp(complexObject, outName, announceWriting):
        # a failed dump must not leave a truncated file behind
        try:
            if announceWriting and self.verbose:
                print("Writing results to disk")
            joblib.dump(complexObject, outName, compress=5, protocol=2)
        except (KeyboardInterrupt, Exception):
            print("Exception happened computing %s" % outName)
            tryToRemove(outName)
            raise

    if self.sampledOutPath is not None:
        if self.verbose:
            print("Sampling %s" % prefix)
        sampledComplexObject = wholeComplexObject.getSampledVersion(self.samplingFold)
        trainName = buildOutName(self.sampledOutPath, "#s%s.train.pkl.gz", ".train.pkl.gz")
        dumpOrCleanUp(sampledComplexObject, trainName, False)

    if self.wholeComplexOutPath is not None:
        predictName = buildOutName(self.wholeComplexOutPath, "#s%s.predict.pkl.gz", ".predict.pkl.gz")
        dumpOrCleanUp(wholeComplexObject, predictName, True)

    return wholeComplexObject
def codifyComplex(self, prefix):
    '''
    Codifies one complex whose identifier is prefix. The features of the complex must have been computed
    previously and they must be located at self.dataRootPath path. Depending on self.sampledOutPath and
    self.wholeComplexOutPath, a sampled version (prefix.train.pkl.gz) and/or the whole object
    (prefix.predict.pkl.gz) are saved to disk.

    NOTE(review): a "receptorId<->ligandId" prefix is accepted, but the two ids obtained from it are not
    used: the names given to the codification protocol are always prefix + "_l" and prefix + "_r".
    Confirm that this is the intended naming for complexes formed by 2 pdb files.

    @param prefix: str. A pdb id for a complex. If complex is formed by 2 pdb files, then prefix looks like
                   "receptorId<->ligandId" e.g. "1da2<->1lla"
    @return wholeComplexObject: ComplexCodified.ComplexCodified. A ComplexCodified object containing all
                                putative pairs
    '''
    if self.verbose:
        print("Codifying %s" % prefix)
    if "<->" in prefix:
        # only the shape is checked: unpacking raises ValueError unless exactly 2 ids are found
        __, __ = prefix.split("<->")
    receptorPrefix = prefix + "_r"
    ligandPrefix = prefix + "_l"
    pairsCodifiedDir = self.CodProtocol.applyProtocol(prefix, ligandPrefix, receptorPrefix)
    wholeComplexObject = ComplexCodified(prefix, pairsCodifiedDir)

    def dumpOrCleanUp(complexObject, outName, announceWriting):
        # a failed dump must not leave a truncated file behind
        try:
            if announceWriting and self.verbose:
                print("Writing results to disk")
            joblib.dump(complexObject, outName, compress=5, protocol=2)
        except (KeyboardInterrupt, Exception):
            print("Exception happened computing %s" % outName)
            tryToRemove(outName)
            raise

    if self.sampledOutPath is not None:
        if self.verbose:
            print("Sampling %s" % prefix)
        sampledComplexObject = wholeComplexObject.getSampledVersion(self.samplingFold)
        dumpOrCleanUp(sampledComplexObject,
                      os.path.join(self.sampledOutPath, prefix + ".train.pkl.gz"), False)

    if self.wholeComplexOutPath is not None:
        dumpOrCleanUp(wholeComplexObject,
                      os.path.join(self.wholeComplexOutPath, prefix + ".predict.pkl.gz"), True)

    if self.verbose:
        print("%s succesfully codified" % (prefix))
    return wholeComplexObject
def processPSSM(self, seq, prefixExtended, pssmNameRaw, pssmNameProc, areSeqIdsMapped):
    '''
    Reads psiblast pssms output file and writes another one with tabulated format, headers and some error
    checking. If the raw file cannot be read (IOError), PsiBlastManager.BAD_SCORE_CONSERVATION is written
    for all the columns of every residue.

    @param seq: str. Sequence of the chain
    @param prefixExtended: str. unambiguous id of the sequence that will be the prefix of output names.
                           It must be formed by 4 fields separated by "_"
    @param pssmNameRaw: str. Path to psiblast aligments results
    @param pssmNameProc: str. Path where formated results will be saved.
    @param areSeqIdsMapped: boolean. True if psiblast output is obtained from 3dConsDb and thus, structIds
                            are included in first col of pssm files instead of seqIds, false if first column
                            are seqIds.
    @return pssmSeq: the sequence read from the pssm file, or seq if the pssm file could not be read
    '''
    __, chainType, chainId, __ = prefixExtended.split("_")
    try:
        pssmData, pssmResIds, pssmSeq = self.loadPSSM(pssmNameRaw)
    except IOError:
        print("Pssm was not computed. Default value inserted instead")
        pssmSeq = seq
        # 42 columns: 20 pssm + 20 psfm + 2 scores
        pssmData = [" ".join([PsiBlastManager.BAD_SCORE_CONSERVATION] * 42)] * len(seq)
    else:
        if areSeqIdsMapped:
            seq = pssmSeq
            self.seqsManager.addResiduesToSeqToStructMap(chainType, chainId, pssmSeq, pssmResIds)
        elif pssmSeq != seq:
            # explicit raise (same exception type as the former assert) so that it also works with -O
            raise AssertionError("Sequence found in %s does not match the expected one" % pssmNameRaw)
    try:
        with open(pssmNameProc, "w") as outFile:
            outFile.write("chainId seqIndex structResId resName " + "pssm " * 20 + "psfm " * 20 +
                          "score " * 2 + "\n")
            if len(pssmData) != len(seq):
                raise AssertionError("Number of pssm rows (%d) and sequence length (%d) differ" %
                                     (len(pssmData), len(seq)))
            for i, (pssmArrayJoined, letter) in enumerate(zip(pssmData, seq)):
                structIndex = self.seqsManager.seqToStructIndex(chainType, chainId, i, asString=True)
                # Residues without structural index are skipped, as the corrMut writers do (they used
                # to crash here or to be written as "None"); labelled residues are skipped on demand.
                if structIndex is None or (self.filterOutLabels and structIndex[-1].isalpha()):
                    continue
                outFile.write("%s %d %s %s " % (chainId, i, structIndex, letter) + pssmArrayJoined + "\n")
    except (KeyboardInterrupt, Exception):
        print("Exception happend computing %s" % pssmNameProc)
        tryToRemove(pssmNameProc)
        raise
    return pssmSeq
def computeOneFile(self, fileName):
    '''
    Computes distance for each pair of aminoacids for a given pdb file. Results are written to
    self.outPath as <name of the pdb file up to ".pdb">.distMat; nothing is done if that file already
    exists. Only standard amino acids of the first model are considered.

    @param fileName: str. fname to pdb file
    @return 0
    '''
    prefixAndChainTypeId = os.path.basename(fileName).split(".pdb")[0]
    outName = os.path.join(self.outPath, prefixAndChainTypeId + ".distMat")
    if os.path.isfile(outName):
        print("Already computed Distance Maps")
        return 0
    structure = self.parser.get_structure(prefixAndChainTypeId, fileName)
    structCenterMass = self.getStructCenterMass(structure)
    try:
        with open(outName, "w") as outFile:
            outFile.write("chainId1 structResId1 chainId2 structResId2 distance angle_to_protCM\n")
            # Ids are formatted once per residue instead of once per pair of residues.
            residues = []
            for res in structure[0].get_residues():
                if not is_aa(res, standard=True):
                    continue
                __, __, chainId, resId = res.get_full_id()
                # residue number followed by the rest of the id fields. NOTE(review): a blank last
                # field is kept, so the written id may end with a space.
                resIdStr = "".join([str(resId[1])] + list(resId[2:]))
                if chainId == " ":
                    chainId = "*"
                residues.append((res, chainId, resIdStr))
            for res1, chainId1, resId1 in residues:
                for res2, chainId2, resId2 in residues:
                    magnitude = self.getMagnitude(res1, res2, structCenterMass)
                    outFile.write(chainId1 + " " + resId1 + " " + chainId2 + " " + resId2 + " " +
                                  " ".join([str(val) for val in magnitude]) + "\n")
    except (KeyboardInterrupt, Exception):
        print("Exception happend computing %s" % outName)
        tryToRemove(outName)
        raise
    return 0
def makeWindowedPSSM(self, pssmNameProc, winPssmOutName):
    '''
    Computes sliding windows (of size self.winSize) for a given pssmFile. Windows will include aa code
    and pssm features. If the computation fails, winPssmOutName is removed and the exception is re-raised.

    @param pssmNameProc: str. Path to processed pssm file (my format)
    @param winPssmOutName: str. Path to windowed results.
    '''
    try:
        windower = WindowPSSM(self.winSize, True, INCLUDE_PSSM, INCLUDE_PSFM)
        windower.compute(pssmNameProc, winPssmOutName)
    except (Exception, KeyboardInterrupt):
        # do not keep a partially written file
        print("Exception happend computing %s" % winPssmOutName)
        tryToRemove(winPssmOutName)
        raise
def makeWindowedPSSMHhblits(self, profileNameProc, winProfileOutName):
    '''
    Computes sliding windows (of size self.winSize) for a given profileNameProc. If the computation fails,
    winProfileOutName is removed and the exception is re-raised.

    @param profileNameProc: str. Path to processed hhblits profile file
    @param winProfileOutName: str. Path to windowed results.
    '''
    try:
        windower = WindowHHblits(self.winSize)
        windower.compute(profileNameProc, winProfileOutName)
    except (Exception, KeyboardInterrupt):
        # do not keep a partially written file
        print("Exception happend computing %s" % winProfileOutName)
        tryToRemove(winProfileOutName)
        raise
def runClustalW(self, filteredSeqsFname, psiBlastOut, clustalWOutName=None):
    '''
    Runs clustalw (self.clustalW) over the sequences contained in filteredSeqsFname.

    The input file, its ".clstr" companion and the ".dnd" file named after the input are always removed
    when the method finishes. If clustalw fails, clustalWOutName is removed as well.

    @param filteredSeqsFname: str. Path to the file with the sequences to align. It is deleted afterwards
    @param psiBlastOut: str. Only used in the error messages
    @param clustalWOutName: str. Path where the alignment will be saved. If None,
                            filteredSeqsFname without extension + ".clustalw"
    @return clustalWOutName: str. Path to the alignment
    @raise FeatureComputerException if clustalw writes to stderr or reports an error in stdout
    '''
    # splitext also copes with names without extension and with dots in the directories
    tmpFnameCommon = os.path.splitext(filteredSeqsFname)[0]
    if clustalWOutName is None:
        clustalWOutName = tmpFnameCommon + ".clustalw"
    clustalCommand = [
        self.clustalW,
        "-infile=%s" % filteredSeqsFname,
        "-outfile=%s" % clustalWOutName,
        "-outorder=INPUT",
    ]
    print(" ".join(clustalCommand))
    try:
        # universal_newlines=True: stdout/stderr are text in Python 2 and 3. With bytes (Python 3
        # default) the comparison against "" is always true and every run was reported as failed.
        proc = Popen(clustalCommand, stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True)
        output = proc.communicate()
        if output[1] != "" or "There was an error parsing psiblast, clustalw" in output[0]:
            print(output)
            print("Error when clustalw %s for al2Co" % psiBlastOut)
            raise FeatureComputerException("Error when clustalw %s for al2Co" % psiBlastOut)
        return clustalWOutName
    except (Exception, KeyboardInterrupt):
        tryToRemove(clustalWOutName)
        raise
    finally:
        tryToRemove(filteredSeqsFname)
        tryToRemove(filteredSeqsFname + ".clstr")
        tryToRemove(tmpFnameCommon + ".dnd")
def makeFakeFile(self, seqL, seqR, corrMutOutName, corrMutQuality, chainIdL, chainIdR):
    '''
    Writes a corrMut results file in which all ligand-receptor residue pairs have
    CorrMutGeneric.BAD_SCORE_CONSERVATION as score. Residues without structural index are skipped, and
    so are labelled ones if self.filterOutLabels. The file is removed if something fails.

    @param seqL: str. Sequence of the ligand chain
    @param seqR: str. Sequence of the receptor chain
    @param corrMutOutName: str. Fname where results will be saved.
    @param corrMutQuality: float. Value written in the last column of every row
    @param chainIdL: str. The chain Id for the ligand
    @param chainIdR: str. The chain Id for the receptor
    '''
    def mustSkip(structIndex):
        return structIndex is None or (self.filterOutLabels and structIndex[-1].isalpha())

    rowTemplate = "%s %s %s %s %s %s %f %f\n"
    try:
        with open(corrMutOutName, "w") as outFile:
            self.writeHeader(outFile)
            for ixL, letterL in enumerate(seqL):
                structIndexL = self.seqsManager.seqToStructIndex("l", chainIdL, ixL, asString=True)
                if mustSkip(structIndexL):
                    continue
                for ixR, letterR in enumerate(seqR):
                    structIndexR = self.seqsManager.seqToStructIndex("r", chainIdR, ixR, asString=True)
                    if mustSkip(structIndexR):
                        continue
                    rowValues = (chainIdL, structIndexL, letterL, chainIdR, structIndexR, letterR,
                                 CorrMutGeneric.BAD_SCORE_CONSERVATION, corrMutQuality)
                    outFile.write(rowTemplate % rowValues)
    except (Exception, KeyboardInterrupt):
        print("Exception happend computing %s" % corrMutOutName)
        tryToRemove(corrMutOutName)
        raise
def runCdHit(self, allHits, inputSeq, psiBlastOut, pairSeqIdThr=0.95):
    '''
    Runs cd-hit (self.cdHitBin) over the sequences of allHits and adds inputSeq, as first record, to
    the file written by cd-hit. Temporary files are created in self.tmp and named after the base name of
    psiBlastOut (up to its first dot).

    @param allHits: iterable of dict. Each element must contain the keys "target_full_id" and "targetSeq"
    @param inputSeq: str. The query sequence; "-" characters are removed before writing it
    @param psiBlastOut: str. Used to name the temporary files and in the error messages
    @param pairSeqIdThr: float, .4<=pairSeqIdThr<=1.00. Given to cd-hit as -c; the -n option is chosen
                         according to it
    @return cdhitOutName: str. Path to the cd-hit output that begins with inputSeq
    @raise ValueError if pairSeqIdThr is out of range
    @raise FeatureComputerException if cd-hit writes to stderr or reports an error in stdout
    '''
    tmpName = os.path.join(self.tmp, os.path.basename(psiBlastOut).split(".")[0])
    cdhitInName = tmpName + ".in-cdhit"
    cdhitOutName = tmpName + ".out-cdhit"
    try:
        # -n value for each range of thresholds; checked before writing anything
        if 0.70 < pairSeqIdThr <= 1.00:
            n = 5
        elif 0.55 <= pairSeqIdThr <= 0.70:
            n = 4
        elif 0.50 <= pairSeqIdThr < 0.55:
            n = 3
        elif 0.40 <= pairSeqIdThr < 0.50:
            n = 2
        else:
            raise ValueError("Error, just .4<=pairSeqIdThr<=1.00 allowed")

        with open(cdhitInName, "w") as f:
            for hit in allHits:
                f.write("> %s\n" % (hit["target_full_id"]))
                f.write("%s\n" % (hit["targetSeq"].replace("-", "")))

        cdhitCmd = [
            self.cdHitBin, "-i", cdhitInName, "-o", cdhitOutName, "-n", str(n),
            "-c", str(pairSeqIdThr), "-T", str(self.psiBlastNThrs),
        ]
        print(" ".join(cdhitCmd))
        # universal_newlines=True: stdout/stderr are text in Python 2 and 3. With bytes (Python 3
        # default) the comparison against "" is always true and every run was reported as failed.
        proc = Popen(cdhitCmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True)
        output = proc.communicate()
        if output[1] != "" or "There was an error cd-hit psiblast" in output[0]:
            print(output)
            print("Error when parsing %s for al2Co" % psiBlastOut)
            raise FeatureComputerException("Error when cd-hit %s for al2Co" % psiBlastOut)

        # rewrite the output in place so that the input sequence is the first record
        with open(cdhitOutName, "r+") as f:
            fileData = f.read()
            f.seek(0, 0)
            f.write("> InputSeq\n")
            f.write("%s\n" % (inputSeq.replace("-", "")))
            f.write(fileData + "\n")
        return cdhitOutName
    except (Exception, KeyboardInterrupt):
        tryToRemove(cdhitOutName)
        raise
    finally:
        tryToRemove(cdhitInName)
def compute(self, HHBlitsFnamesDict, prefix):
    '''
    Computes corrMut for the Multiple Sequence aligment hhBlitsOut after pairing it by taxa. If more than 2
    sequences are found for one taxa, just best match is choosen.
    All the alignment files are loaded first; then, every ligand chain - receptor chain combination is
    processed and its results are saved to self.corrMutOutPath (combinations already computed are skipped).

    @param HHBlitsFnamesDict: {"l":{"A":"1A2K_l_A_u.a3m"},
                               "r":{"B":"1A2K_r_B_u.a3m", "C":"1A2K_r_C_u.a3m"}}
    @param prefix: str. The prefix of the complex, p.e. 1A2K
    '''
    aligsDict = {}
    for chainType in HHBlitsFnamesDict:
        aligsOfType = {}
        for chainId in HHBlitsFnamesDict[chainType]:
            aligsOfType[chainId] = self.loadOneAligFile(HHBlitsFnamesDict[chainType][chainId])
        aligsDict[chainType] = aligsOfType

    for chainIdL in aligsDict["l"]:
        for chainIdR in aligsDict["r"]:
            print("launching corrMut over chains %s - %s" % (chainIdL, chainIdR))
            # temporary file for the paired alignment; always removed in the finally clause
            aligFormatedName = os.path.join(
                self.corrMutOutPath,
                "".join(["tmp_", prefix, "_l-", chainIdL, "-r-", chainIdR, "_", "u.ali"]))
            try:
                corrMutOutName = os.path.join(
                    self.corrMutOutPath,
                    "".join([prefix, "_l-", chainIdL, "_r-", chainIdR, "_", "u.corrMut"]))
                if self.checkAlreayComputed(corrMutOutName):
                    print("%s already computed" % corrMutOutName)
                    continue
                aligOut = self.createPairedAlignmet(aligsDict["l"][chainIdL], aligsDict["r"][chainIdR],
                                                    aligFormatedName)
                if not aligOut:
                    # no paired alignment: sequences are taken from the sequences manager
                    nAlig = 0
                    seqL, __ = self.seqsManager.getSeq("l", chainIdL)
                    seqR, __ = self.seqsManager.getSeq("r", chainIdR)
                else:
                    __, __, nAlig, seqL, seqR = aligOut

                # the program is only launched if the MSA has enough sequences
                iterOfCorrelatedRows = None
                if nAlig > CorrMutGeneric.MIN_N_SEQS_MSA:
                    startTime = time.time()
                    iterOfCorrelatedRows = self.lauchCorrMutProgram(aligFormatedName)
                    print("Time CorrMut", time.time() - startTime)
                self.saveProcResults(seqL, seqR, corrMutOutName, iterOfCorrelatedRows, chainIdL, chainIdR,
                                     nAlig)
            except (Exception, KeyboardInterrupt):
                print("Exception happend computing corrMut for %s over chains %s - %s" %
                      (prefix, chainIdL, chainIdR))
                tryToRemove(corrMutOutName)
                raise
            finally:
                tryToRemove(aligFormatedName)
def contactMapOneComplex(self):
    '''
    Computes the contact map of a complex. Initial input for complex codification.
    Contact map is a file written at self.computedFeatsRootDir/common/contactMaps/ with name
    prefix.cMap.tab where prefix is either the common name of ligand and receptor pdb files or the
    concatenation of ligand and receptor names.
        1A2K_l_u.pdb and 1A2K_r_u.pdb  --> 1A2K.cMap.tab
        1A2K_l_u.pdb and 1A22.pdb      --> 1A2K-1A22.cMap.tab

    Sequences are read from self.lFname and self.rFname with self.parseFasta. One row is written per pair
    of residues whose letters are contained in d1_to_index; chain ids are always "L" and "R", residue ids
    are the 0 based positions in the sequences and categ is always nan.

    @return 0 if self.outName already exists, None otherwise
    @raise BadNumberOfResidues if the length of any of the sequences is not strictly between
           self.minNumResiduesPartner and self.maxNumResiduesPartner
    '''
    outName = self.outName
    print(outName)
    if os.path.isfile(outName):
        print('Already computed contact map')
        return 0
    seqL = self.parseFasta(self.lFname, inputNumber="1")
    seqR = self.parseFasta(self.rFname, inputNumber="2")
    # Each partner is reported with its own length (the error for partner "2" used to carry the
    # length of partner "1").
    for nResidues, inputNumber in ((len(seqL), "1"), (len(seqR), "2")):
        if not (self.minNumResiduesPartner < nResidues < self.maxNumResiduesPartner):
            raise BadNumberOfResidues(nResidues, inputNumber)
    try:
        # the file is closed (end of the with block) before the cleanup code removes it
        with open(outName, "w") as outFile:
            outFile.write("chainIdL structResIdL resNameL chainIdR structResIdR resNameR categ\n")
            # receptor residues are filtered once instead of once per ligand residue
            validResiduesR = [(str(ixR), resnameR) for ixR, resnameR in enumerate(seqR)
                              if resnameR in d1_to_index]
            for ixL, resnameL in enumerate(seqL):
                if resnameL not in d1_to_index:
                    continue
                resIdL = str(ixL)
                for resIdR, resnameR in validResiduesR:
                    outFile.write("%s %s %s %s %s %s %s\n" % ("L", resIdL, resnameL, "R", resIdR,
                                                              resnameR, np.nan))
    except (KeyboardInterrupt, Exception):
        print("Exception happend computing %s" % outName)
        tryToRemove(outName)
        raise
def computeOneFile(self, fileName):
    '''
    Computes PSAIA for a given pdb file. Raw results of previous runs (files of self.outPathRaw whose
    name starts with the name of the pdb file up to ".pdb") are deleted first. The list and config
    files written for PSAIA are always removed at the end.

    @param fileName: str. fname to pdb file
    @return 0
    @raise FeatureComputerException if psa writes to stderr or reports an error in stdout
    '''
    # Computed before the try block: the finally clause needs these names even if something fails early.
    prefixAndChainTypeId = os.path.split(fileName)[-1].split(".pdb")[0]
    listFname = self.listFileNameForPSAIA % prefixAndChainTypeId
    configFname = self.configFileName % prefixAndChainTypeId
    try:
        for fname in os.listdir(self.outPathRaw):  # remove old psaia runs
            if fname.startswith(prefixAndChainTypeId):
                os.remove(os.path.join(self.outPathRaw, fname))
        if self.checkIfAlreadyComputed(prefixAndChainTypeId):
            print("PSAIA already computed")
            return 0
        with open(listFname, "w") as f:
            f.write(fileName)
        with open(configFname, "w") as f:
            f.write(self.configForPSAIATemplate)
        # universal_newlines=True: the pipes work with text in Python 2 and 3, which is what the
        # str input and the comparisons below need.
        proc = Popen([os.path.join(self.psaiaRootDir, "psa"), configFname, listFname],
                     stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True)
        output = proc.communicate(input="y\n")
        if output[1] != "" or "There was an error in PDB" in output[0]:
            print(output)
            print("Error when computing PSAIA for %s" % fileName)
            raise FeatureComputerException("Error when computing PSAIA for %s" % fileName)
        self.processPSAIA(prefixAndChainTypeId)
        return 0
    finally:
        tryToRemove(listFname)
        tryToRemove(configFname)
def getClusterRepresentatives(prefix, filesPattern, outPath, maxPerClus=MAX_ELEMS_PER_CLUS):
    '''
    Merges the pdb files selected by filesPattern into ~/tmp/<prefix>.merged.pdb (mergePDBs) and
    clusterizes the merged file (clusterize).

    :param prefix: str. Used to name the temporary files
    :param filesPattern: str. Forwarded to mergePDBs; "~" is expanded when the directory is inspected
    :param outPath: str. Directory for the results. "~" is expanded and the path is given to myMakeDir
    :param maxPerClus: forwarded to clusterize. If None, half (integer division) of the number of
                       entries, in the directory of filesPattern, whose name matches "*_T*"
    '''
    import glob  # only needed here; local import, as done elsewhere in this file

    outPath = myMakeDir(os.path.expanduser(outPath))
    if maxPerClus is None:
        path = os.path.dirname(os.path.expanduser(filesPattern))
        # glob rather than "ls | wc -l" through a shell: it is safe for paths with blanks or shell
        # metacharacters. "//" keeps the value an int in Python 3 too.
        nPos = len(glob.glob(os.path.join(path, "*_T*")))
        maxPerClus = nPos // 2
    mergedFileName = os.path.join(os.path.expanduser("~/tmp"), prefix + ".merged.pdb")
    try:
        fileNames = mergePDBs(filesPattern, mergedFileName)
    except (Exception, KeyboardInterrupt) as e:
        print("exception:", e)
        tryToRemove(mergedFileName)
        raise
    print(fileNames)
    clusterize(mergedFileName, prefix, fileNames, outPath, maxPerClus)
def writeGzResults(self, outName, headerStr, listOfRecords):
    '''
    Used to write a list of records as a .gz file. Data is first written to a temporary file
    ("tmp-" + file name, in the same directory) that is moved to outName once it is complete.

    :param outName: str. path where results will be saved
    :param headerStr: str. string that will be written at the beginning of the file
    :param listOfRecords: a list of str that represents the rows of the dataframe
                          e.g [ "A 123 L B 2 I 0.1 0.8 -1", ...]
    '''
    def asBytes(text):
        # gzip streams are binary: text (str in Python 3, unicode in Python 2) has to be encoded
        return text if isinstance(text, bytes) else text.encode("utf-8")

    dirName, baseName = os.path.split(outName)
    tmpOutName = os.path.join(dirName, "tmp-" + baseName)
    try:
        with gzip.open(tmpOutName, "wb") as outFile:
            outFile.write(asBytes(headerStr))
            outFile.write(asBytes("\n".join(listOfRecords)))
        tryToMove(tmpOutName, outName)
    except (KeyboardInterrupt, Exception):
        tryToRemove(outName)
        tryToRemove(tmpOutName)
        raise
def clusterize(mergedFileName, prefix, fileNames, outPath, maxPerClus):
    '''
    Clusterizes the structures of mergedFileName with "gmx cluster" (method gromos, cutoff DIST_CUT) and
    calls processOneCluster for each cluster reported in the log ~/tmp/<prefix>.clusters.log.
    gmx is not launched if the log already exists.

    :param mergedFileName: str. Path to the file with the merged structures
    :param prefix: str. Used to name the log file
    :param fileNames: forwarded to processOneCluster
    :param outPath: forwarded to processOneCluster
    :param maxPerClus: forwarded to processOneCluster
    '''
    tmpDir = os.path.expanduser("~/tmp")
    logsClusters = os.path.join(tmpDir, prefix + ".clusters.log")
    # "5" is sent to the standard input of gmx (TODO confirm which selection it answers)
    cmd = (
        "echo 5 | %(gmx)s cluster -f %(mergedFileName)s -s %(mergedFileName)s " +
        "-cutoff %(distCut)f -g %(logsClusters)s -nofit -method gromos"
    ) % {"gmx": GMX_PATH, "mergedFileName": mergedFileName, "distCut": DIST_CUT,
         "logsClusters": logsClusters}
    if not os.path.isfile(logsClusters):
        print(cmd)
        # gmx is run from ~/tmp. The previous working directory is restored in all cases: it used to
        # be left changed when the log already existed or when gmx failed.
        cwd = os.getcwd()
        os.chdir(tmpDir)
        try:
            check_output(cmd, shell=True)
        except (Exception, KeyboardInterrupt):
            # the failure is no longer swallowed: the log has just been removed, so going on could
            # only fail later with a less informative error
            tryToRemove(logsClusters)
            raise
        finally:
            os.chdir(cwd)

    centroid = None
    members = None
    with open(logsClusters) as f:
        # skip everything up to the header of the clusters table
        for line in f:
            if line.startswith("cl."):
                break
        for line in f:
            lineArray = line.split("|")
            print(lineArray)
            if lineArray[0].strip().isdigit():
                # a new cluster begins: process the previous one if it has members
                if members:
                    processOneCluster(fileNames, centroid, members, outPath, maxPerClus)
                centroid = lineArray[2].split(".")[0]
                # a last digit is discarded (presumably the integer part of the next value,
                # printed without separation; TODO confirm against the log format)
                if centroid[-1].isdigit():
                    centroid = centroid[:-1]
                centroid = int(centroid)
                members = [int(elem) for elem in lineArray[-1].split()]
            else:
                # continuation line: more members for the current cluster
                moreMembers = [int(elem) for elem in lineArray[-1].split()]
                if not moreMembers:
                    continue  # e.g. blank lines
                members += moreMembers
    # the last cluster, if the log contained any
    if members is not None:
        processOneCluster(fileNames, centroid, members, outPath, maxPerClus)
def processHhblits(self, seq, prefixExtended, profileNameRaw, profileNameProc):
    '''
    Reads hhblits profile output file and writes another one with tabulated format, headers and some error
    checking. If the raw file cannot be read (IOError), HHBlitsManager.BAD_SCORE_CONSERVATION is written
    for the 31 columns of every residue.

    @param: seq: str. Sequence of the chain
    @param prefixExtended: str. unambiguous id of the sequence that will be the prefix of output names.
                           It must be formed by 4 fields separated by "_"
    @param profileNameRaw: str. Path to profiles results
    @param profileNameProc: str. Path where formated results will be saved.
    '''
    try:
        hhBlitsData = self.loadHhblits(profileNameRaw)
    except IOError:
        hhBlitsData = [" ".join([HHBlitsManager.BAD_SCORE_CONSERVATION] * 31)] * len(seq)
    __, chainType, chainId, __ = prefixExtended.split("_")
    try:
        with open(profileNameProc, "w") as outFile:
            outFile.write("chainId seqIndex structResId resName " + "hhblits " * 31 + "\n")
            # NOTE(review): zip stops at the shortest input, so a profile whose length differs from
            # len(seq) is truncated without any warning.
            for i, (hhBlitsArrayJoined, letter) in enumerate(zip(hhBlitsData, seq)):
                structIndex = self.seqsManager.seqToStructIndex(chainType, chainId, i, asString=True)
                # Residues without structural index are skipped, as the corrMut writers do (they used
                # to crash here or to be written as "None"); labelled residues are skipped on demand.
                if structIndex is None or (self.filterOutLabels and structIndex[-1].isalpha()):
                    continue
                outFile.write("%s %d %s %s " % (chainId, i, structIndex, letter) + hhBlitsArrayJoined +
                              "\n")
    except (KeyboardInterrupt, Exception):
        print("Exception happend computing %s" % profileNameProc)
        tryToRemove(profileNameProc)
        raise
def createFileForError(self, pdbStruct, outName):
    '''
    Creates a fake DSSP raw output generated when DSSP fails. All residues will be assigned secStruc= Z.
    Only the residues of the first model that are amino acids (is_aa) are written.

    @param pdbStruct: Bio.PDB.Structure. Structure of the psb that is being analyzed
    @param outName: str. output fname
    @return 0
    @raise NoValidPDBFile if pdbStruct does not contain any model
    '''
    oneResLine = "%5d%5d%2s%2s %2s\n"
    fakeSecStruct = "Z"
    try:
        # the file is closed (end of the with block) before the cleanup code removes it
        with open(outName, "w") as f:
            f.write(DsspComputer.DSSP_HEADER)
            if len(pdbStruct) == 0:
                raise NoValidPDBFile("No valid pdb File. There are no models contained")
            for chain in pdbStruct[0]:
                chainId = chain.get_id()
                for i, res in enumerate(chain):
                    if not is_aa(res):
                        continue
                    # i counts all the residues of the chain, so indices of skipped ones are not reused
                    seqIndex = i + 1
                    structIndex = res.get_id()[1]
                    letter = self.threeLetterAA_to_one(res.resname)
                    f.write(oneResLine % (seqIndex, structIndex, chainId, letter, fakeSecStruct))
    except (KeyboardInterrupt, Exception):
        print("Exception happend computing %s" % outName)
        tryToRemove(outName)
        raise
    return 0
def computeOneFile(self, pdbFname, struct):
    '''
    Computes PSAIA for a given pdb file. The list and config files written for PSAIA, the uncompressed
    copy of the pdb file and the raw results directory are always cleaned at the end; if something
    fails, the results of this file are removed as well.

    :param pdbFname: str. fname to pdb file
    :param struct: ignored
    :return: 0
    :raise AssertionError: if pdbFname is not a str
    :raise FeatureComputerException: if psa writes to stderr or reports an error in stdout
    '''
    if not isinstance(pdbFname, str):
        # explicit raise (same exception type as the former assert) so that it also works with -O
        raise AssertionError(
            "Error, PSAIA computeOneFile first argument is a path to pdb file (str). given %s" % pdbFname)
    prefixExtended = self.getExtendedPrefix(pdbFname)
    # the fields are not used; the unpacking fails if the prefix does not have at least 2 of them
    __, __ = self.splitExtendedPrefix(prefixExtended)[:2]
    if self.checkAlreayComputed(prefixExtended):
        print("Psaia already computed for %s" % prefixExtended)
        return 0
    print("launching PSAIA over %s" % prefixExtended)
    listFname = self.listFileNameForPSAIA % prefixExtended
    configFname = self.configFileName % prefixExtended
    uncompressFileName = self.uncompressFile(pdbFname, self.tmp)
    try:
        with open(listFname, "w") as f:
            f.write(uncompressFileName)
        with open(configFname, "w") as f:
            f.write(self.configForPSAIATemplate)
        # universal_newlines=True: the pipes work with text in Python 2 and 3, which is what the
        # str input and the comparisons below need.
        proc = Popen([os.path.join(self.psaiaRootDir, "psa"), configFname, listFname],
                     stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True)
        output = proc.communicate(input="y\n")
        if output[1] != "" or "There was an error in PDB" in output[0]:
            print(output)
            print("Error when computing PSAIA for %s" % pdbFname)
            raise FeatureComputerException("Error when computing PSAIA for %s" % pdbFname)
        self.processPSAIA(prefixExtended)
        return 0
    except (Exception, KeyboardInterrupt):
        self.tryToRemoveAllFnames(prefixExtended)
        raise
    finally:
        tryToRemove(listFname)
        tryToRemove(configFname)
        tryToRemove(uncompressFileName)
        tryToCleanDir(self.outPathRaw, prefixExtended, rootDataDir=self.computedFeatsRootDir)
def trainAndTestOneFold(trainData, testPrefixes, trainSubsetN, testPath, outputPath, verbose=False, ncpu=1):
    '''
    Trains and tests one fold. The fitted classifier is saved to a temporary file (so that it can be
    loaded if this function is executed again for the same fold) that is removed on success.

    :param trainData: a numpy array for training with first column labels and the others are features
    :param testPrefixes: str[]. A list that contains prefixes for all complexes to be tested
    :param trainSubsetN: int Tuple. The numerical ids of the training split.
    :param testPath: str. Path to a dir where testing data files are stored
    :param outputPath: str. Path to a dir where predictions will be stored. None if results will not be saved
    :param verbose: boolean. Whether or not print to stdout info
    :param ncpu: int. Number of cpu's to use in parallel
    :return: (finalResults, modelo). finalResults: tuple with the result object of each complex
             (an empty list if there is none). modelo: the classifier, or None if it was not needed
    '''
    originalTestPrefixToNewPrefix, __ = getOriginalToActualPrefixs(testPrefixes)
    testPrefixesNotEvaluated = []
    alreadyComputedPrefixes_and_outnames = []
    for testPrefix in originalTestPrefixToNewPrefix:
        outName = None
        if outputPath is not None:
            outName = getResultsOutname(outputPath, testPrefix, trainSubsetN)
            # NOTE(review): existing results are reused only if verbose is True; otherwise they are
            # computed again. Kept as it was; confirm whether the dependency on verbose is intended.
            if verbose and os.path.isfile(outName):
                print("Complex already computed: %s" % (outName))
                alreadyComputedPrefixes_and_outnames.append((testPrefix, outName))
                continue
        testPrefixesNotEvaluated.append((testPrefix, outName))

    modelo = None
    from Config import Configuration
    conf = Configuration()
    # md5 requires bytes in Python 3
    prefixesDigest = hashlib.md5("".join(sorted(testPrefixes)).encode("utf-8")).hexdigest()
    modelFname = os.path.join(conf.tmp, prefixesDigest + str(trainSubsetN) + "bipspi2.pckl")

    resultsForEvaluation_list = []
    if len(testPrefixesNotEvaluated) > 0 or len(testPrefixes) == 0:
        if verbose:
            print("Testing:", [x[0] for x in testPrefixesNotEvaluated])
            verboseLevel = 1
        else:
            verboseLevel = 0
        if os.path.exists(modelFname):
            print("Loading classifier")
            modelo = joblib_load(modelFname)
        else:
            print("Training classifier")
            modelo = trainMethod(trainData[:, 1:], trainData[:, 0], verboseLevel=verboseLevel, ncpu=ncpu)
            joblib_save(modelo, modelFname)
        del trainData  # local reference not needed any more
        gc.collect()
        if verbose:
            print("Classifier fitted.")
        # number of parallel jobs limited by cpus, free memory and number of complexes
        expectedSize = estimateRequiredMemoryPerComplex(testPrefixesNotEvaluated, testPath)
        freeMem = checkFreeMemory()
        nJobs = int(max(1, min(ncpu, freeMem / expectedSize, len(testPrefixesNotEvaluated))))
        print("Free memory for predictOnePrefix: %s GB. Njobs: %s (%s expected size)" %
              (freeMem, nJobs, expectedSize))
        resultsForEvaluation_list = Parallel(n_jobs=nJobs)(
            delayed(predictOnePrefix)(originalTestPrefixToNewPrefix[testPrefix], modelo, outName, testPath)
            for testPrefix, outName in testPrefixesNotEvaluated)
        gc.collect()

    # nothing to estimate nor to load if no result was found on disk
    if len(alreadyComputedPrefixes_and_outnames) > 0:
        expectedSize = estimateRequiredMemoryPerComplex(alreadyComputedPrefixes_and_outnames, testPath)
        freeMem = checkFreeMemory()
        nJobs = int(max(1, min(ncpu, freeMem / expectedSize, len(alreadyComputedPrefixes_and_outnames))))
        resultsForEvaluation_list += Parallel(n_jobs=nJobs)(
            delayed(loadExistingResults)(testPrefix, outName)
            for testPrefix, outName in alreadyComputedPrefixes_and_outnames)

    if len(resultsForEvaluation_list) > 0:
        freeMem = checkFreeMemory()
        totMem = getTotalMemory()
        usedMem = totMem - freeMem
        nJobs = int(max(1, min(ncpu, freeMem / (usedMem / (1 + len(resultsForEvaluation_list))))))
        print("Free memory for evaluateOneResultObj: %s GB. Njobs: %s" % (freeMem, nJobs))
        Parallel(n_jobs=nJobs)(delayed(evaluateOneResultObj)(testPrefix, resultObj, False)
                               for testPrefix, resultObj in resultsForEvaluation_list)
        # second element of each (prefix, result) pair; zip objects cannot be indexed in Python 3
        finalResults = tuple(resultObj for __, resultObj in resultsForEvaluation_list)
    else:
        finalResults = []
    tryToRemove(modelFname)
    return finalResults, modelo
def processspider2(self, seq, prefixExtended, spider2Raw, spider2Proc):
    '''
    Reads spider2 output file and writes another one with tabulated format, headers and
    some error checking. The raw spider2 file is always removed at the end.

    @param seq: str. Sequence of the chain
    @param prefixExtended: str. unambiguous id of the sequence that will be the prefix of output
                           names. It must have 4 fields separated by "_"
    @param spider2Raw: str. Path to spider2 results
    @param spider2Proc: str. Path where formated results will be saved.
    @raise ValueError: if the spider2 sequence and seq cannot be matched

    head spider2Proc
    chainId seqIndex structResId resName score_asa score_Pc score_Pe score_Ph
    A 0 6 E 146.4 0.982 0.011 0.006
    A 1 7 P 100.2 0.977 0.012 0.012
    '''
    try:
        predictionsData = self.loadspider2(spider2Raw)
    except IOError:
        # No raw predictions available: use one tuple of bad scores per residue
        badScores = tuple([Spider2Manager.BAD_SCORE_CONSERVATION] * 4)
        predictionsData = [(letter, badScores) for letter in seq]

    prefix, chainType, chainId, __ = prefixExtended.split("_")
    try:
        predsIx = 0
        seqIx = 0
        seqLen = len(seq)
        predsLen = len(predictionsData)
        # with-statement guarantees that the file is closed before any clean-up is done
        with open(spider2Proc, "w") as outFile:
            outFile.write("chainId seqIndex structResId resName score_asa score_Pc score_Pe score_Ph\n")
            while seqIx < seqLen and predsIx < predsLen:
                letter = seq[seqIx]
                letterspider2, consValTuple = predictionsData[predsIx]
                if letterspider2 == letter:
                    structIndex = self.seqsManager.seqToStructIndex(chainType, chainId, seqIx, asString=True)
                    if self.filterOutLabels and structIndex[-1].isalpha():
                        # Residue is skipped, but both indices have to be advanced,
                        # otherwise the loop would never finish
                        predsIx += 1
                        seqIx += 1
                        continue
                    outFile.write("%s %d %s %s %s %s %s %s\n" % ((chainId, seqIx, structIndex, letter) + consValTuple))
                    predsIx += 1
                    seqIx += 1
                elif letter == "X" and letterspider2 == "-":
                    predsIx += 1
                    seqIx += 1
                elif letterspider2 == "-":
                    predsIx += 1
                else:
                    print(predictionsData)
                    print(seq)
                    print(predsIx, seqIx)
                    raise ValueError("spider2 mismatch %s %s " % (letterspider2, letter))
    except (KeyboardInterrupt, Exception):
        print("Exception happend computing %s" % spider2Proc)
        tryToRemove(spider2Proc)
        raise
    finally:
        tryToRemove(spider2Raw)
def computeOneFile(self, fileName):
    '''
    Computes half sphere exposure (HSExposureCA and HSExposureCB) and ExposureCN for a given
    pdb file. One file named prefix_chainType_chainId_u.hse is written at self.outPath for
    each chain that has more than 5 amino acids and was not computed before.

    @param fileName: str. fname to pdb file
    @return 0
    '''
    prefixAndChainTypeId = (fileName.split("/")[-1]).split(".pdb")[0]
    prefixAndChainTypeId = "_".join(prefixAndChainTypeId.split("_")[:2])
    structure = self.parser.get_structure(prefixAndChainTypeId, fileName)
    model = structure[0]

    outNames = {}
    for chain in model:
        chainId = chain.get_id()
        nResidues = sum(1 for res in chain if is_aa(res))
        if chainId == " ":
            chainId = "*"
        outName = os.path.join(self.outPath, prefixAndChainTypeId + "_" + chainId + "_u.hse")
        if nResidues > 5 and not os.path.isfile(outName):
            outNames[chainId] = outName
    if len(outNames) == 0:
        print("HalfSphere already computed")
        return 0

    # Values used when one of the calculations has no data for a residue
    noHseValues = [-1, -1, -1.0]
    caDict = {aa: [elem if elem is not None else -1 for elem in feat]
              for aa, feat in HSExposureCA(model)}
    cbDict = {aa: [elem if elem is not None else -1 for elem in feat]
              for aa, feat in HSExposureCB(model)}
    # Always a one-element list, so that it can be concatenated with the other features
    cnDict = {aa: [feat if feat is not None else -1]
              for aa, feat in ExposureCN(model)}
    featuresDict = {}
    for aa in set(caDict).union(cbDict).union(cnDict):
        featuresDict[aa] = (caDict.get(aa, noHseValues) + cbDict.get(aa, noHseValues) +
                            cnDict.get(aa, [-1]))

    header = ("chainId structResId resName HSExposureCA1 HSExposureCA2 HSExposureCA3" +
              " HSExposureCB1 HSExposureCB2 HSExposureCB3 ExposureCN\n")
    residuesList = [res for chain in model for res in chain if is_aa(res)]
    if featuresDict:
        badExample = [-1] * len(next(iter(featuresDict.values())))
    else:
        badExample = [-1] * 7  # 3 HSExposureCA + 3 HSExposureCB + 1 ExposureCN

    filesHandlers = {}
    try:
        try:
            for chainId, outName in outNames.items():
                filesHandlers[chainId] = open(outName, "w")
                filesHandlers[chainId].write(header)
            for res in residuesList:
                structId, modelId, chainId, resId = res.get_full_id()
                # resId is (hetero flag, residue number, insertion code)
                resId = list(resId)
                resId[1] = str(resId[1])
                resId = "".join(resId[1:])
                resName = self.threeLetterAA_to_one(res.resname)
                if chainId == " ":
                    chainId = "*"
                if chainId not in filesHandlers:
                    continue  # small chains
                valuesForRes = featuresDict.get(res, badExample)
                filesHandlers[chainId].write(chainId + " " + resId + " " + resName + " " +
                                             " ".join([str(val) for val in valuesForRes]) + "\n")
        finally:
            # Files are closed no matter what happened
            for fHand in filesHandlers.values():
                fHand.close()
    except (KeyboardInterrupt, Exception):
        # Do not leave incomplete results
        for outName in outNames.values():
            print("Exception happend computing %s" % outName)
            tryToRemove(outName)
        raise
    return 0
def processDSSP(self, prefixAndChainTypeId, pdbStruct, dsspFName):
    '''
    Parses raw output from DSSP and creates results in tab format. One file is written at
    self.outPathProc for each chain. Standard residues of pdbStruct that are not found in
    the DSSP output are added to their own chain with secondary structure "Z".

    @param prefixAndChainTypeId: str. fname prefix of raw psaia results (not taking into account the full path
                                 but the name itself)
    @param pdbStruct: Bio.PDB.Structure. Structure of the pdb that is being analyzed
    @param dsspFName: str. fname of raw Dssp output
    @raise Exception: if the residues header line is not found in dsspFName
    '''
    # resDict: chainId -> set of (resId, resLetter) that have not been found (yet) in DSSP output
    resDict = {}
    for chain in pdbStruct[0]:
        chainId = chain.get_id()
        if chainId == " ":
            chainId = "*"
        resDict[chainId] = set([])
        for res in chain:
            if not is_aa(res, standard=True):
                continue
            resId = str(res.get_id()[1])
            if res.get_id()[2] != " ":
                resId += res.get_id()[2]
            try:
                resDict[chainId].add((resId, self.threeLetterAA_to_one(res.resname)))
            except KeyError:
                continue

    header = ("#Levels: 3:H;B;E;G;I;T;S;Z\n" + "chainId structResId resName 2ndStruct\n")
    stringDict = {}
    with open(dsspFName) as f:
        headerFound = False
        for line in f:
            # Line that precedes the per-residue records. Compared by fields so that
            # it does not depend on the exact number of blank spaces
            if line.split()[:2] == ["#", "RESIDUE"]:
                headerFound = True
                break
        if not headerFound:
            raise Exception("Error processing DSSP file " + dsspFName)
        for line in f:
            if not line.strip():
                continue
            secStruc = line[16]
            if secStruc == " ":
                secStruc = "Z"
            lineArray = line.split()
            chain = line[11]
            resLetter = line[13]
            if resLetter.islower():
                resLetter = "C"
            resInd = lineArray[1]
            if chain == " ":
                chain = "*"
            if resInd.startswith("!"):  # Chain changed
                continue
            resDict[chain].discard((resInd, resLetter))
            out = "\t".join([chain, resInd, resLetter, secStruc])
            stringDict.setdefault(chain, []).append(out)

    # To fill residues for which we do not have enough information
    for chainId in resDict:
        for resId, resLetter in sorted(resDict[chainId]):
            out = "\t".join([chainId, resId, resLetter, "Z"])
            # Must be added to the chain of the residue, not to the last chain read from DSSP
            stringDict.setdefault(chainId, []).append(out)

    splitName = prefixAndChainTypeId.split("_")
    if len(splitName) == 3:
        prefix, chainType, unbound = splitName
        baseName = prefix + "_" + chainType
    else:
        baseName = prefixAndChainTypeId
    for chainId in stringDict:
        outName = os.path.join(self.outPathProc, baseName + "_" + chainId + "_u.dssp.tab")
        try:
            with open(outName, "w") as outFile:
                outFile.write(header)
                outFile.write("\n".join(stringDict[chainId]))
        except (KeyboardInterrupt, Exception):
            print("Exception happend computing %s" % outName)
            tryToRemove(outName)
            raise
def computeFromSeqStructMapper(self, seqStructMap, prefixExtended, psiblastOutName, pssmOutNameRaw):
    '''
    Launches al2co for one sequence and stores its processed (and, if self.winSize,
    windowed) results. The sequence is the one that seqStructMap holds for the chain
    encoded in prefixExtended.

    :param seqStructMap: computeFeatures.seqStep.seqToolManagers.seqExtraction.SeqStructMapper
    :param prefixExtended: str. unambiguous id of the sequence that will be the prefix of output names
    :param psiblastOutName: str. Path to psiblast aligments results
    :param pssmOutNameRaw: str. Path to psiblast pssms results
    :return: 0 if results were already computed, None otherwise
    '''
    msaFname = None
    __, chainType, chainId = self.splitExtendedPrefix(prefixExtended)[:3]
    # repeat as psiBlastManager can modify seqs
    seqStr, __ = seqStructMap.getSeq(chainType, chainId)
    seqStructMap.setCurrentSeq(seqStr, chainType, chainId)

    if self.checkAlreayComputed(prefixExtended):
        print("Al2co already computed for %s" % prefixExtended)
        return 0

    fNames = self.getFNames(prefixExtended)
    print("launching al2co over %s" % prefixExtended)
    procFname = fNames[0]
    rawFname = os.path.join(self.al2coOutPath, prefixExtended + ".fasta.csv")
    try:
        if not os.path.isfile(psiblastOutName):
            print("Error computing al2co. Psiout does not exists for %s" % (prefixExtended))
            # processAl2co is told that there are no raw results
            rawFname = None
        else:
            alignedSeqs = parsePsiBlast(inputSeq=seqStr, psiBlastOut=psiblastOutName)
            filteredFname = self.runCdHit(alignedSeqs, inputSeq=seqStr,
                                          psiBlastOut=psiblastOutName)
            msaFname = self.runClustalW(filteredFname, psiBlastOut=psiblastOutName)
            al2coOptions = (
                ("-i", msaFname), ("-m", "0"), ("-f", "2"), ("-a", "F"), ("-b", "50"),
                ("-g", "0.50"), ("-w", "1"), ("-c", "0"), ("-o", rawFname), ("-t", procFname),
            )
            cmd = [self.al2coBin]
            for flag, value in al2coOptions:
                cmd.extend((flag, value))
            print(" ".join(cmd))
            stdoutData, stderrData = Popen(cmd, stdout=PIPE, stderr=PIPE).communicate()
            if len(stderrData) > 0:
                print("Error computing al2co. Caught stdin/stderr:\n", stdoutData, stderrData)

        dataList = self.processAl2co(seqStr, seqStructMap, prefixExtended, rawFname, procFname)
        if self.winSize:
            self.makeWindowed(dataList, ["al2co", "al2coNorm"],
                              [Al2coManager.BAD_SCORE_CONSERVATION] * 2, [None] * 2, fNames[1])
    except (Exception, KeyboardInterrupt):
        self.tryToRemoveAllFnames(prefixExtended)
        raise
    finally:
        # The multiple sequence alignment is just a temporary file
        if msaFname:
            tryToRemove(msaFname)
def contactMapOneComplex(self):
    '''
    Computes the contact map of a complex. Initial input for complex codification. Contact map is a file written at
    self.computedFeatsRootDir/common/contactMaps/ with name prefix.cMap.tab where prefix is either the common name of
    ligand and receptor pdb files or the concatenation of ligand and receptor names.
    1A2K_l_u.pdb and 1A2K_r_u.pdb  --> 1A2K.cMap.tab
    1A2K_l_u.pdb and 1A22.pdb      --> 1A2K-1A22.cMap.tab

    One row is written for each ligand residue - receptor residue pair. Its category is 1 for
    pairs in contact, -1 for the others and nan if self.isForPrediction.

    :return: 0 if the contact map was already computed, None otherwise
    :raise BadNumberOfResidues: if the number of residues of any of the partners is not
                                between self.minNumResiduesPartner and self.maxNumResiduesPartner
    '''
    outName = self.outName
    print(outName)
    if os.path.isfile(outName):
        print('Already computed contact map')
        return 0

    lStructId = self.prefixL + "_l_u.pdb"
    rStructId = self.prefixR + "_r_u.pdb"
    structureL_u = self.parser.get_structure(lStructId, self.lFname)
    structureR_u = self.parser.get_structure(rStructId, self.rFname)
    if self.boundAvailable == False or self.isForPrediction:
        structureL_b = None
        structureR_b = None
    else:
        try:
            lStructId_b = self.prefix + "_l_b.pdb"
            rStructId_b = self.prefix + "_r_b.pdb"
            lFname_b = os.path.join(os.path.split(self.lFname)[0], lStructId_b)
            rFname_b = os.path.join(os.path.split(self.rFname)[0], rStructId_b)
            structureL_b = self.parser.get_structure(lStructId_b, lFname_b)
            structureR_b = self.parser.get_structure(rStructId_b, rFname_b)
        except IOError:
            # in this case there are just unbound pdbs available
            structureL_b = None
            structureR_b = None

    if self.isForPrediction:
        positiveContacts = None
        chainsNotContactR = set([])
        chainsNotContactL = set([])
    elif structureL_b is None or structureR_b is None:
        # Bound structures are not available: compute contacts in unbound structures
        positiveContacts, chainsNotContactL, chainsNotContactR = self.getPairsOfResiduesInContact(
            structureL_u, structureR_u)
    else:
        # Compute contacts in bound structures
        positiveContacts, chainsNotContactL, chainsNotContactR = self.getPairsOfResiduesInContact(
            structureL_b, structureR_b)
    if JUST_INTERACTING_CHAINS == False:
        chainsNotContactR = set([])
        chainsNotContactL = set([])

    rResDict = self.mapBoundToUnbound(structureR_u, structureR_b, skipBoundChainsIds=chainsNotContactR)
    lResDict = self.mapBoundToUnbound(structureL_u, structureL_b, skipBoundChainsIds=chainsNotContactL)
    nResiduesL = len(lResDict)
    nResiduesR = len(rResDict)
    if not (self.minNumResiduesPartner < nResiduesL < self.maxNumResiduesPartner):
        raise BadNumberOfResidues(nResiduesL, "1")
    if not (self.minNumResiduesPartner < nResiduesR < self.maxNumResiduesPartner):
        raise BadNumberOfResidues(nResiduesR, "2")

    try:
        # with-statement guarantees that the file is closed before any clean-up is done
        with open(outName, "w") as outFile:
            outFile.write("chainIdL structResIdL resNameL chainIdR structResIdR resNameR categ\n")

            # Receptor fields do not depend on the ligand residue: compute them just once
            rRows = []
            for resR_bound in sorted(rResDict, key=lambda x: x.get_full_id()):
                resR_unbound = rResDict[resR_bound]
                pdbIdR, modelR, chainIdR, resIdR = resR_unbound.get_full_id()
                try:
                    letraR = three_to_one(resR_unbound.resname)
                    if letraR != three_to_one(resR_bound.resname):
                        continue
                except KeyError:
                    continue
                if chainIdR == " ":
                    chainIdR = "*"
                rRows.append((resR_bound.get_full_id(), chainIdR, self.makeStrResId(resIdR), letraR))

            for resL_bound in sorted(lResDict, key=lambda x: x.get_full_id()):
                resL_unbound = lResDict[resL_bound]
                pdbIdL, modelL, chainIdL, resIdL = resL_unbound.get_full_id()
                resIdL = self.makeStrResId(resIdL)
                try:
                    letraL = three_to_one(resL_unbound.resname)
                    if letraL != three_to_one(resL_bound.resname):
                        continue
                except KeyError:
                    continue
                if chainIdL == " ":
                    chainIdL = "*"
                fullIdL = resL_bound.get_full_id()
                for fullIdR, chainIdR, resIdR, letraR in rRows:
                    if self.isForPrediction:
                        categ = np.nan
                    elif (fullIdL, fullIdR) in positiveContacts:
                        categ = 1
                    else:
                        categ = -1
                    outFile.write("%s %s %s %s %s %s %s\n" %
                                  (chainIdL, resIdL, letraL, chainIdR, resIdR, letraR, categ))
    except (KeyboardInterrupt, Exception):
        print("Exception happend computing %s" % outName)
        tryToRemove(outName)
        raise
def processAl2co(self, seq, seqStructMap, prefixExtended, al2coRaw, al2coProc):
    '''
    Reads al2co output file and writes another one with tabulated format, headers and some error checking.
    If al2coRaw is None or it cannot be read, Al2coManager.BAD_SCORE_CONSERVATION is used as
    the score of all residues. The raw al2co file, if any, is always removed at the end.

    :param seq: str. Sequence of the chain
    :param seqStructMap: object used to translate sequence indices to structure residue ids
                         (through seqToStructIndex)
    :param prefixExtended: str. unambiguous id of the sequence that will be the prefix of output names
    :param al2coRaw: str. Path to al2co results. None if they are not available
    :param al2coProc: str. Path where formatted results will be saved.
    :return: list of ((chainId, structIndex, letter), ([consVal], [str(consValNormalized)]))
             An empty list if there is no data to process
    :raise ValueError: if al2co sequence and seq cannot be matched
    '''
    if al2coRaw is None:
        conserData = [(letter, Al2coManager.BAD_SCORE_CONSERVATION) for letter in seq]
    else:
        try:
            conserData = self.loadRawAl2co(al2coRaw)
        except IOError:
            conserData = [(letter, Al2coManager.BAD_SCORE_CONSERVATION) for letter in seq]
    prefix, chainType, chainId = self.splitExtendedPrefix(prefixExtended)[:3]
    try:
        alcoIx = 0
        seqIx = 0
        seqLen = len(seq)
        alcoLen = len(conserData)
        conserVals = [float(consVal) for __, consVal in conserData]
        if conserVals:
            mean_val = np.mean(conserVals)
            std_val = np.std(conserVals)
        else:
            # Nothing to normalize; the loop bellow will not be executed
            mean_val, std_val = 0.0, 0.0
        dataList = []
        listOfRowsToPrint = []
        while seqIx < seqLen and alcoIx < alcoLen:
            letter = seq[seqIx]
            letterAl2co, consVal = conserData[alcoIx]
            if letterAl2co == letter or (letterAl2co == "-" and letter == "X"):
                structIndex = seqStructMap.seqToStructIndex(chainType, chainId, seqIx, asString=True)
                if structIndex:
                    if self.filterOutLabels and structIndex[-1].isalpha():
                        # Residue is skipped, but both indices have to be advanced,
                        # otherwise the loop would never finish
                        alcoIx += 1
                        seqIx += 1
                        continue
                else:
                    structIndex = str(seqIx) + "?"
                if std_val != 0:
                    consValNormalized = (float(consVal) - mean_val) / std_val
                else:
                    consValNormalized = float(consVal)
                dataList.append(((chainId, structIndex, letter),
                                 ([consVal], [str(consValNormalized)],)))
                listOfRowsToPrint.append("%s %s %s %s %s" %
                                         (chainId, structIndex, letter, consVal, consValNormalized))
                alcoIx += 1
                seqIx += 1
            elif letter not in AA_STANDARD and letterAl2co == "-":
                alcoIx += 1
                seqIx += 1
            elif letterAl2co == "-":
                alcoIx += 1
            else:
                print(conserData)
                print(alcoIx, seqIx)
                raise ValueError("Al2co mismatch %s %s " % (letterAl2co, letter))
        self.writeResultsFromDataDictSingleChain({chainId: listOfRowsToPrint}, outName=al2coProc)
        return dataList
    except (KeyboardInterrupt, Exception):
        print("Exception happend computing %s" % al2coProc)
        tryToRemove(al2coProc)
        raise
    finally:
        if al2coRaw is not None:
            tryToRemove(al2coRaw)
def computeFromSeqStructMapper(self, seqStructMap, extendedPrefix, HHBlitsFnamesDict):
    '''
    Computes corrMut for the Multiple Sequence aligment hhBlitsOut after pairing it by taxa. If more than 2 sequences
    are found for one taxa, just best match is choosen. All ligand chain - receptor chain
    pairs are processed.

    :param seqStructMap: computeFeatures.seqStep.seqToolManagers.seqExtraction.SeqStructMapper
    :param extendedPrefix: str. unambiguous id from which the prefix of output names is obtained
    :param HHBlitsFnamesDict: {"l":{"A":"1A2K_F0_l_C_.a3m"},
                               "r":{"B":"1A2K_F0_r_A_.a3m", "C":"1A2K_F0_r_B_.a3m"}}
    '''
    self.chainsL = set(HHBlitsFnamesDict["l"].keys())
    self.chainsR = set(HHBlitsFnamesDict["r"].keys())
    if self.checkAlreayComputed(extendedPrefix):
        print("%s already computed correlated mutations for " % extendedPrefix)
        return

    prefix, __, chainType, chainId = self.splitExtendedPrefix(extendedPrefix)
    aligsDict = {
        chainType_: {
            chainId_: self.loadOneAligFile(HHBlitsFnamesDict[chainType_][chainId_])
            for chainId_ in HHBlitsFnamesDict[chainType_]
        }
        for chainType_ in HHBlitsFnamesDict
    }
    for chainIdL in aligsDict["l"]:
        for chainIdR in aligsDict["r"]:
            aligFormatedName = os.path.join(
                self.corrMutOutPath,
                "tmp_" + prefix + "_l-" + chainIdL + "-r-" + chainIdR + "_" + "u.ali")
            # Reset for each pair. Otherwise, if the name cannot be generated, the clean-up
            # would fail or would remove the results of the previous pair
            corrMutOutName = None
            try:
                corrMutOutName = self.generateOutName(prefix, chainIdL, chainIdR)
                if (os.path.isfile(corrMutOutName) and
                        self.getNLines(corrMutOutName) > self.minNumResiduesPartner):
                    print("%s already computed" % corrMutOutName)
                    continue
                print("launching corrMut over chains %s - %s" % (chainIdL, chainIdR))
                aligOut = self.createPairedAlignmet(aligsDict["l"][chainIdL],
                                                    aligsDict["r"][chainIdR], aligFormatedName)
                if aligOut:
                    __, __, nAlig, seqL, seqR = aligOut
                else:
                    nAlig = 0
                if nAlig > CorrMutGeneric.MIN_N_SEQS_MSA:
                    startTime = time.time()
                    iterOfCorrelatedRows = self.lauchCorrMutProgram(aligFormatedName)
                    print("Time CorrMut", time.time() - startTime)
                else:
                    # not enough sequences in MSA
                    iterOfCorrelatedRows = None
                self.processResults(iterOfCorrelatedRows, seqStructMap, chainIdL, chainIdR,
                                    corrMutOutName, nAlig)
            except (KeyboardInterrupt, Exception):
                print("Exception happend computing corrMut for %s over chains %s - %s" %
                      (prefix, chainIdL, chainIdR))
                if corrMutOutName is not None:
                    tryToRemove(corrMutOutName)
                raise
            finally:
                # The paired alignment is just a temporary file
                tryToRemove(aligFormatedName)