예제 #1
0
    def computeFromSeqStructMapper(self, seqStructMap, prefixExtended,
                                   pssmOutNameRaw):
        '''
      Runs spider2 for the sequence identified by prefixExtended and hands the
      raw output to self.processSpider2 (plus self.makeWindowed when
      self.winSize is set).

      :param seqStructMap: object providing getSeq(chainType, chainId) and
                           setCurrentSeq(seqStr, chainType, chainId)
      :param prefixExtended: str. unambiguous id of the sequence that will be the prefix of output names
      :param pssmOutNameRaw: str or None. Path to psiblast pssms results. If the
                             path does not exist, the same path with ".gz"
                             appended is tried. When no file is found, spider2
                             is not launched and processSpider2 receives None
                             as raw file name.
      :return: 0 if results were already computed, None otherwise
      :raises: any exception raised while computing is re-raised after calling
               self.tryToRemoveAllFnames(prefixExtended)
    '''
        uncompressFileName = None
        if pssmOutNameRaw is not None:
            if not os.path.isfile(pssmOutNameRaw):
                pssmOutNameRaw = pssmOutNameRaw + ".gz"
            if os.path.isfile(pssmOutNameRaw):
                uncompressFileName = self.uncompressFile(
                    pssmOutNameRaw, self.tmp)
        try:
            __, chainType, chainId = self.splitExtendedPrefix(
                prefixExtended)[:3]
            seqStr, __ = seqStructMap.getSeq(
                chainType,
                chainId)  # repeat as psiBlastManager can modify seqs
            seqStructMap.setCurrentSeq(seqStr, chainType, chainId)
            if self.checkAlreayComputed(prefixExtended):
                print("spyder2 already computed for %s" % prefixExtended)
                return 0
            fNames = self.getFNames(prefixExtended)
            spider2ProcName = fNames[0]
            spider2RawName = os.path.join(self.spider2OutPath,
                                          prefixExtended + ".spd3")
            print("launching spyder2 over %s" % prefixExtended)
            curWd = os.getcwd()
            os.chdir(self.spider2OutPath)
            try:
                if uncompressFileName is not None:
                    cmd = ["python", self.spider2PyScript, uncompressFileName]
                    process = Popen(cmd, stdout=PIPE, stderr=PIPE)
                    processOut = process.communicate()
                    if len(processOut[1]) > 0:
                        print(
                            "Error computing spider2. Caught stdin/stderr:\n",
                            processOut[0], processOut[1])
                else:
                    # No pssm available: there is no raw spider2 output to parse
                    spider2RawName = None
            finally:
                # Always go back to the original working directory, also when
                # spider2 was not launched or when launching it failed
                os.chdir(curWd)
            dataList = self.processSpider2(seqStr, seqStructMap,
                                           prefixExtended, spider2RawName,
                                           spider2ProcName)

            if self.winSize:
                self.makeWindowed(dataList, ["asa", "P_C", "P_E", "P_H"],
                                  Spider2Manager.BAD_SCORE_PREDS, [None] * 4,
                                  fNames[1])
        except (Exception, KeyboardInterrupt):
            self.tryToRemoveAllFnames(prefixExtended)
            raise
        finally:
            if uncompressFileName is not None:
                tryToRemove(uncompressFileName)
예제 #2
0
    def processPSAIA(self, prefixAndChainTypeId):
        '''
      Parses raw output from PSAIA and creates results in tab format, one
      output file per chain id found in the raw table.

      Only the first ".tbl" file of self.outPathRaw (in os.listdir order) whose
      name starts with prefixAndChainTypeId is parsed. Its first 8 lines are
      skipped and blank lines are ignored.

      @param prefixAndChainTypeId: str. fname prefix of raw psaia results (not taking into account
                                  the full path but the name itself)
      @raise: any exception raised while writing is re-raised after trying to
              remove the output files created so far
    '''
        header = (
            "chainId structResId resName total_ASA b-bone_ASA s-chain_ASA polar_ASA n-polar_ASA total_RASA "
            +
            "b-bone_RASA s-chain_RASA polar_RASA n-polar_RASA average_DPX s_avg_DPX s-ch_avg_DPX s-ch_s_avg_DPX "
            "max_DPX min_DPX average_CX s_avg_CX s-ch_avg_CX s-ch_s_avg_CX max_CX min_CX Hydrophobicity\n"
        )

        # chainId -> list of tab-joined rows
        stringDict = {}
        for fname in os.listdir(self.outPathRaw):
            if not (fname.endswith(".tbl")
                    and fname.startswith(prefixAndChainTypeId)):
                continue
            with open(os.path.join(self.outPathRaw, fname)) as f:
                for __ in range(8):  # skip the 8 leading lines of the table
                    f.readline()
                for line in f:
                    arrayLine = line.split()
                    if not arrayLine:  # blank line, nothing to parse
                        continue
                    out = "\t".join(
                        [arrayLine[0], arrayLine[6],
                         self.threeLetterAA_to_one(arrayLine[7])] +
                        arrayLine[8:])
                    stringDict.setdefault(arrayLine[0], []).append(out)
            break  # just the first matching table is used

        splitName = prefixAndChainTypeId.split("_")
        outNames = []
        try:
            for chainId in stringDict:
                if len(splitName) == 3:
                    # prefix_chainType_u --> prefix_chainType_chainId_u
                    baseName = (splitName[0] + "_" + splitName[1] + "_" +
                                chainId + "_u.psaia.tab")
                else:
                    baseName = (prefixAndChainTypeId + "_" + chainId +
                                "_u.psaia.tab")
                outName = os.path.join(self.outPathProc, baseName)
                outNames.append(outName)
                with open(outName, "w") as outFile:
                    outFile.write(header)
                    outFile.write("\n".join(stringDict[chainId]))
        except (KeyboardInterrupt, Exception):
            for outName in outNames:
                print("Exception happend computing %s" % outName)
                tryToRemove(outName)
            raise
예제 #3
0
  def saveProcResults(self, seqL, seqR, corrMutOutName, iterOfCorrelatedRows, chainIdL, chainIdR, nAlig):
    '''
      Writes correlated-mutation scores for every ligand/receptor residue pair in a
      space separated file. Pairs contained in iterOfCorrelatedRows are written with
      their score; all remaining pairs are written with score 0.0. Pairs whose
      structural index is None (or ends with a letter when self.filterOutLabels is
      set) are skipped.
      @param: seqL: str. Sequence of the ligand chain
      @param: seqR: str. Sequence of the receptor chain
      @param corrMutOutName: str. Fname where formated results will be saved.
      @param iterOfCorrelatedRows: iterator of elements as [res_i, res_j, corrMuScore]; res_i and res_j are 0 based
                                   indices over the concatenation seqL + separator + seqR. If None,
                                   self.makeFakeFile is used instead.
      @param chainIdL:str. The chain Id for the ligand
      @param chainIdR:str. The chain Id for the receptor
      @param nAlig: int. The number of rows of MSA
      @return 1 if a fake file was written (iterOfCorrelatedRows is None), 0 otherwise
      @raise: any exception raised while writing is re-raised after trying to remove corrMutOutName
    '''

    corrMutQuality=  float(nAlig)/ (len(seqL)+len(seqR))
    # "is None" instead of "== None": equality is not safe for array-like inputs
    if iterOfCorrelatedRows is None:
      self.makeFakeFile( seqL, seqR, corrMutOutName, corrMutQuality, chainIdL, chainIdR)
      return 1

    def isFilteredOut(structIndex):
      # residues with no structural counterpart or (optionally) with insertion-code like labels
      return structIndex is None or (self.filterOutLabels and structIndex[-1].isalpha())

    rowTemplate= "%s %s %s %s %s %s %f %f\n"
    try:
      with open(corrMutOutName,"w") as outFile:
        self.writeHeader(outFile)
        lenSeqL= len(seqL)
        lenSeparator= len(CorrMutGeneric.SEQUENCES_SEPARATOR)
        addedI_J= set()
        for i, j, score in iterOfCorrelatedRows:
          # keep only inter-chain pairs: i within ligand, j within receptor
          if i>=lenSeqL or j <(lenSeqL+lenSeparator): continue
          j= j-lenSeqL-lenSeparator
          addedI_J.add((i,j))
          letterL= seqL[i]
          letterR= seqR[j]
          score= float(score)
          structIndexL= self.seqsManager.seqToStructIndex("l", chainIdL, i, asString= True)
          structIndexR= self.seqsManager.seqToStructIndex("r", chainIdR, j, asString= True)
          if isFilteredOut(structIndexR) or isFilteredOut(structIndexL): continue
          outFile.write(rowTemplate%(chainIdL, structIndexL, letterL, chainIdR, structIndexR, letterR,
                                     score, corrMutQuality))
        # pairs that were not reported get score 0.0
        for i, letterL in enumerate(seqL):
          # NOTE(review): lookup hoisted out of the inner loop; assumes seqToStructIndex has no side effects
          structIndexL= self.seqsManager.seqToStructIndex("l", chainIdL, i, asString= True)
          if isFilteredOut(structIndexL): continue
          for j, letterR in enumerate(seqR):
            if (i,j) in addedI_J: continue
            structIndexR= self.seqsManager.seqToStructIndex("r", chainIdR, j, asString= True)
            if isFilteredOut(structIndexR): continue
            outFile.write(rowTemplate%(chainIdL, structIndexL, letterL, chainIdR, structIndexR, letterR,
                                       0.0, corrMutQuality))
        return 0
    except (KeyboardInterrupt, Exception) as e:
      print(e)
      print("Exception happend computing %s"%corrMutOutName)
      tryToRemove(corrMutOutName)
      raise
예제 #4
0
 def tryToRemoveAllFnames(self, prefixExtended):
     '''
 Cleans up the output files associated to prefixExtended: every name returned
 by self.getFNames(prefixExtended) that is an existing regular file is passed
 to tryToRemove. Useful to clean up after an exception.
 :param prefixExtended: prefix for output fnames.
 '''
     existingFiles = (candidate
                      for candidate in self.getFNames(prefixExtended)
                      if os.path.isfile(candidate))
     for candidate in existingFiles:
         tryToRemove(candidate)
예제 #5
0
    def fromDfToComplexCodif(self,
                             prefix,
                             pairsCodified,
                             prefixesInvolvedInCoding,
                             isSeqOnly=""):
        '''
      Wraps already codified pairs in a complex object and, depending on the
      configured output paths, dumps it (and a sampled version of it) to disk
      with joblib.

      @param prefix: str. Identifier of the complex; used to build output names
      @param pairsCodified: codified pairs passed to the complex object
      @param prefixesInvolvedInCoding: forwarded untouched to the complex object
      @param isSeqOnly: str. "" builds a ComplexCodified. Otherwise a
                        ComplexSeqStructCodified is built, where pairsCodified
                        goes to the ligand slot if "l" or to the receptor slot
                        if "r" (the other slot is None)
      @return wholeComplexObject: the (not sampled) complex object
      @raise: any exception raised while dumping is re-raised after trying to
              remove the partially written file
    '''
        seqOnly = isSeqOnly != ""
        if seqOnly:
            ligandPairs = pairsCodified if isSeqOnly == "l" else None
            receptorPairs = pairsCodified if isSeqOnly == "r" else None
            wholeComplexObject = ComplexSeqStructCodified(
                prefix, ligandPairs, receptorPairs, prefixesInvolvedInCoding)
        else:
            wholeComplexObject = ComplexCodified(prefix, pairsCodified,
                                                 prefixesInvolvedInCoding)

        def buildOutName(outDir, taggedPattern, plainSuffix):
            # The "#s<l|r>" tag is only added when the prefix does not carry it yet
            alreadyTagged = prefix.split("@")[0][-3:] in ["#sl", "#sr"]
            if seqOnly and not alreadyTagged:
                return os.path.join(outDir, prefix + taggedPattern % isSeqOnly)
            return os.path.join(outDir, prefix + plainSuffix)

        def dumpOrCleanUp(complexObject, outName, announce):
            try:
                if announce and self.verbose:
                    print("Writing results to disk")
                joblib.dump(complexObject, outName, compress=5, protocol=2)
            except (KeyboardInterrupt, Exception):
                print("Exception happened computing %s" % outName)
                tryToRemove(outName)
                raise

        if self.sampledOutPath is not None:
            if self.verbose:
                print("Sampling %s" % prefix)
            sampledComplexObject = wholeComplexObject.getSampledVersion(
                self.samplingFold)
            trainName = buildOutName(self.sampledOutPath,
                                     "#s%s.train.pkl.gz", ".train.pkl.gz")
            dumpOrCleanUp(sampledComplexObject, trainName, False)

        if self.wholeComplexOutPath is not None:
            predictName = buildOutName(self.wholeComplexOutPath,
                                       "#s%s.predict.pkl.gz",
                                       ".predict.pkl.gz")
            dumpOrCleanUp(wholeComplexObject, predictName, True)
        return wholeComplexObject
예제 #6
0
    def codifyComplex(self, prefix):
        '''
      Codifies the complex identified by prefix using self.CodProtocol and,
      depending on the configured output paths, dumps the resulting object
      (and a sampled version of it) to disk with joblib.

      @param prefix: str. Identifier of the complex. It may look like
            "receptorId<->ligandId" e.g. "1da2<->1lla"
      @return wholeComplexObject: ComplexCodified built from prefix and the
                                  output of self.CodProtocol.applyProtocol
      @raise: any exception raised while dumping is re-raised after trying to
              remove the partially written file
    '''

        if self.verbose:
            print("Codifying %s" % prefix)
        if "<->" in prefix:
            # NOTE(review): the two halves are never used, the receptor/ligand
            # prefixes below are always derived from the full prefix. The
            # unpacking only enforces a single "<->" separator -- confirm intent
            __, __ = prefix.split("<->")
        receptorPrefix = prefix + "_r"
        ligandPrefix = prefix + "_l"

        pairsCodifiedDir = self.CodProtocol.applyProtocol(
            prefix, ligandPrefix, receptorPrefix)
        wholeComplexObject = ComplexCodified(prefix, pairsCodifiedDir)

        def dumpOrCleanUp(complexObject, outName, announce):
            try:
                if announce and self.verbose:
                    print("Writing results to disk")
                joblib.dump(complexObject, outName, compress=5, protocol=2)
            except (KeyboardInterrupt, Exception):
                print("Exception happened computing %s" % outName)
                tryToRemove(outName)
                raise

        if self.sampledOutPath is not None:
            if self.verbose:
                print("Sampling %s" % prefix)
            sampledComplexObject = wholeComplexObject.getSampledVersion(
                self.samplingFold)
            dumpOrCleanUp(
                sampledComplexObject,
                os.path.join(self.sampledOutPath, prefix + ".train.pkl.gz"),
                False)

        if self.wholeComplexOutPath is not None:
            dumpOrCleanUp(
                wholeComplexObject,
                os.path.join(self.wholeComplexOutPath,
                             prefix + ".predict.pkl.gz"), True)
        if self.verbose:
            print("%s succesfully codified" % (prefix))
        return wholeComplexObject
예제 #7
0
    def processPSSM(self, seq, prefixExtended, pssmNameRaw, pssmNameProc,
                    areSeqIdsMapped):
        '''
      Reads psiblast pssms output file and writes another one with tabulated format, headers and
      some error checking. If the raw file cannot be read (IOError), rows filled
      with PsiBlastManager.BAD_SCORE_CONSERVATION are written instead.
      Residues whose structural index is None (or ends with a letter when
      self.filterOutLabels is set) are not written.

      @param seq: str. Sequence of the chain
      @param prefixExtended: str. unambiguous id of the sequence that will be the prefix of output names.
                             Must consist of exactly 4 "_" separated fields:
                             prefix_chainType_chainId_suffix
      @param pssmNameRaw: str. Path to psiblast aligments results
      @param pssmNameProc: str. Path where formated results will be saved.
      @param areSeqIdsMapped: boolean. True if psiblast output is obtained from 3dConsDb and thus,
                structIds are included in first col of pssm files instead of seqIds, false if
                first column are seqIds.
      @return pssmSeq: str. The sequence read from the pssm file (seq if the file could not be read)
      @raise: any exception raised while writing is re-raised after trying to remove pssmNameProc
    '''
        __, chainType, chainId, __ = prefixExtended.split("_")
        try:
            pssmData, pssmResIds, pssmSeq = self.loadPSSM(pssmNameRaw)
            if areSeqIdsMapped:
                seq = pssmSeq
                self.seqsManager.addResiduesToSeqToStructMap(
                    chainType, chainId, pssmSeq, pssmResIds)
            else:
                assert pssmSeq == seq
        except IOError:
            print("Pssm was not computed. Default value inserted instead")
            pssmSeq = seq
            # 42 columns: 20 pssm + 20 psfm + 2 scores, as in the header below
            defaultRow = " ".join([PsiBlastManager.BAD_SCORE_CONSERVATION] *
                                  42)
            pssmData = [defaultRow] * len(seq)
        try:
            with open(pssmNameProc, "w") as outFile:
                outFile.write("chainId seqIndex structResId resName " +
                              "pssm " * 20 + "psfm " * 20 + "score " * 2 +
                              "\n")
                assert len(pssmData) == len(seq)
                for i, (pssmArrayJoined,
                        letter) in enumerate(zip(pssmData, seq)):
                    structIndex = self.seqsManager.seqToStructIndex(
                        chainType, chainId, i, asString=True)
                    # Residues with no structural counterpart cannot be
                    # reported (same guard used when writing corrMut results)
                    if structIndex is None:
                        continue
                    if self.filterOutLabels and structIndex[-1].isalpha():
                        continue
                    outFile.write("%s %d %s %s " %
                                  (chainId, i, structIndex, letter) +
                                  pssmArrayJoined + "\n")
        except (KeyboardInterrupt, Exception):
            print("Exception happend computing %s" % pssmNameProc)
            tryToRemove(pssmNameProc)
            raise
        return pssmSeq
예제 #8
0
    def computeOneFile(self, fileName):
        '''
      Computes, for each ordered pair of standard aminoacids of the first model
      of a pdb file, the values returned by self.getMagnitude and writes them
      to self.outPath/<pdb base name>.distMat. Nothing is computed if that
      file already exists.

      @param fileName: str. fname to pdb file
      @return 0
      @raise: any exception raised while writing is re-raised after trying to
              remove the output file
    '''
        prefixAndChainTypeId = os.path.basename(fileName).split(".pdb")[0]
        outName = os.path.join(self.outPath, prefixAndChainTypeId + ".distMat")
        if os.path.isfile(outName):
            print("Already computed Distance Maps")
            return 0
        structure = self.parser.get_structure(prefixAndChainTypeId, fileName)
        structCenterMass = self.getStructCenterMass(structure)

        # Collect residues and their printable ids once, instead of
        # re-formatting them for every pair inside the double loop
        residues = []
        for res in structure[0].get_residues():
            if not is_aa(res, standard=True):
                continue
            __, __, chainId, resId = res.get_full_id()
            # residue number followed by the remaining id fields
            resIdStr = str(resId[1]) + "".join(resId[2:])
            if chainId == " ":
                chainId = "*"
            residues.append((res, chainId, resIdStr))

        try:
            with open(outName, "w") as outFile:
                outFile.write(
                    "chainId1 structResId1 chainId2 structResId2 distance angle_to_protCM\n"
                )
                for res1, chainId1, resId1 in residues:
                    for res2, chainId2, resId2 in residues:
                        magnitude = self.getMagnitude(res1, res2,
                                                      structCenterMass)
                        outFile.write(chainId1 + " " + resId1 + " " +
                                      chainId2 + " " + resId2 + " " +
                                      " ".join([str(val)
                                                for val in magnitude]) + "\n")
        except (KeyboardInterrupt, Exception):
            print("Exception happend computing %s" % outName)
            tryToRemove(outName)
            raise
        return 0
예제 #9
0
 def makeWindowedPSSM(self, pssmNameProc, winPssmOutName):
     '''
   Builds a WindowPSSM of size self.winSize and runs its compute method to turn
   a processed pssm file into its sliding-window version.
   @param pssmNameProc: str. Path to processed pssm file (my format)
   @param winPssmOutName: str. Path to windowed results.
   @raise: any exception is re-raised after trying to remove winPssmOutName
 '''
     try:
         windowComputer = WindowPSSM(self.winSize, True, INCLUDE_PSSM,
                                     INCLUDE_PSFM)
         windowComputer.compute(pssmNameProc, winPssmOutName)
     except (KeyboardInterrupt, Exception):
         # do not leave half-written results behind
         print("Exception happend computing %s" % winPssmOutName)
         tryToRemove(winPssmOutName)
         raise
예제 #10
0
 def makeWindowedPSSMHhblits(self, profileNameProc, winProfileOutName):
     '''
   Builds a WindowHHblits of size self.winSize and runs its compute method to
   turn a processed hhblits profile into its sliding-window version.
   @param profileNameProc: str. Path to processed hhblits profile file
   @param winProfileOutName: str. Path to windowed results.
   @raise: any exception is re-raised after trying to remove winProfileOutName
 '''
     try:
         windowComputer = WindowHHblits(self.winSize)
         windowComputer.compute(profileNameProc, winProfileOutName)
     except (KeyboardInterrupt, Exception):
         # do not leave half-written results behind
         print("Exception happend computing %s" % winProfileOutName)
         tryToRemove(winProfileOutName)
         raise
예제 #11
0
 def runClustalW(self,
                 filteredSeqsFname,
                 psiBlastOut,
                 clustalWOutName=None):
     '''
   Launches self.clustalW over filteredSeqsFname.
   The input file, its ".clstr" companion and the ".dnd" file are always
   removed afterwards (tryToRemove).

   @param filteredSeqsFname: str. Path to the sequences file used as -infile
   @param psiBlastOut: str. Only used in error messages
   @param clustalWOutName: str or None. Path for clustalw output. If None,
                           filteredSeqsFname with its last extension replaced
                           by ".clustalw" is used
   @return clustalWOutName: str. Path to the clustalw output
   @raise FeatureComputerException: if clustalw wrote anything to stderr or
          its stdout contains the known error message. clustalWOutName is
          removed on any exception
 '''
     tmpFnameCommon = ".".join(filteredSeqsFname.split(".")[:-1])
     if clustalWOutName is None:
         clustalWOutName = tmpFnameCommon + ".clustalw"
     clustalCommand = [
         self.clustalW,
         "-infile=%s" % filteredSeqsFname,
         "-outfile=%s" % clustalWOutName, "-outorder=INPUT"
     ]
     print(" ".join(clustalCommand))
     try:
         # universal_newlines=True makes communicate() return str (not bytes)
         # also in python3, so the comparisons against str below are valid
         proc = Popen(clustalCommand,
                      stdin=PIPE,
                      stdout=PIPE,
                      stderr=PIPE,
                      universal_newlines=True)
         output = proc.communicate()
         stdoutData, stderrData = output
         if stderrData != "" or "There was an error parsing psiblast, clustalw" in stdoutData:
             print(output)
             print("Error when clustalw %s for al2Co" % psiBlastOut)
             raise FeatureComputerException(
                 "Error when clustalw %s for al2Co" % psiBlastOut)
         return clustalWOutName
     except (Exception, KeyboardInterrupt):
         tryToRemove(clustalWOutName)
         raise
     finally:
         tryToRemove(filteredSeqsFname)
         tryToRemove(filteredSeqsFname + ".clstr")
         tryToRemove(tmpFnameCommon + ".dnd")
예제 #12
0
 def makeFakeFile(self, seqL, seqR, corrMutOutName, corrMutQuality, chainIdL, chainIdR):
   '''
     Writes a corrMut results file in which every ligand/receptor residue pair gets
     CorrMutGeneric.BAD_SCORE_CONSERVATION as score. Residues whose structural index is
     None (or ends with a letter when self.filterOutLabels is set) are skipped.
     @param seqL: str. Sequence of the ligand chain
     @param seqR: str. Sequence of the receptor chain
     @param corrMutOutName: str. Fname where results will be saved.
     @param corrMutQuality: float. Value written in the last column of every row
     @param chainIdL: str. The chain Id for the ligand
     @param chainIdR: str. The chain Id for the receptor
     @raise: any exception is re-raised after trying to remove corrMutOutName
   '''
   def lookUp(chainType, chainId, seqIx):
     return self.seqsManager.seqToStructIndex(chainType, chainId, seqIx, asString= True)

   def mustSkip(structIndex):
     if structIndex is None:
       return True
     return self.filterOutLabels and structIndex[-1].isalpha()

   try:
     with open(corrMutOutName,"w") as outFile:
       self.writeHeader(outFile)
       for ixL in range(len(seqL)):
         structIndexL= lookUp("l", chainIdL, ixL)
         if mustSkip(structIndexL):
           continue
         for ixR in range(len(seqR)):
           structIndexR= lookUp("r", chainIdR, ixR)
           if mustSkip(structIndexR):
             continue
           rowFields= (chainIdL, structIndexL, seqL[ixL], chainIdR, structIndexR, seqR[ixR],
                       CorrMutGeneric.BAD_SCORE_CONSERVATION, corrMutQuality)
           outFile.write("%s %s %s %s %s %s %f %f\n"%rowFields)
   except (KeyboardInterrupt, Exception):
     print("Exception happend computing %s"%corrMutOutName)
     tryToRemove(corrMutOutName)
     raise
예제 #13
0
    def runCdHit(self, allHits, inputSeq, psiBlastOut, pairSeqIdThr=0.95):
        '''
      Clusters the sequences of allHits with cd-hit (self.cdHitBin) and
      prepends inputSeq (record "> InputSeq") to the cd-hit output file.
      Temporary files are created at self.tmp; the cd-hit input file is
      always removed afterwards.

      @param allHits: iterable of dicts with keys "target_full_id" and
                      "targetSeq" (gaps "-" are removed before writing)
      @param inputSeq: str. Query sequence (gaps "-" are removed)
      @param psiBlastOut: str. Its base name up to the first "." names the
                          temporary files; also used in error messages
      @param pairSeqIdThr: float. cd-hit identity threshold (-c). Must be in
                           [0.4, 1.0]; it determines the word size (-n)
      @return cdhitOutName: str. Path to the cd-hit output file
      @raise ValueError: if pairSeqIdThr is out of range
      @raise FeatureComputerException: if cd-hit wrote anything to stderr or
             its stdout contains the known error message. The output file is
             removed on any exception
    '''
        tmpName = os.path.basename(psiBlastOut).split(".")[0]
        tmpName = os.path.join(self.tmp, tmpName)
        cdhitInName = tmpName + ".in-cdhit"
        cdhitOutName = tmpName + ".out-cdhit"
        try:
            with open(cdhitInName, "w") as f:
                for hit in allHits:
                    f.write("> %s\n" % (hit["target_full_id"]))
                    f.write("%s\n" % (hit["targetSeq"].replace("-", "")))

            # word size allowed by cd-hit for each identity threshold range
            if .70 < pairSeqIdThr <= 1.00:
                n = 5
            elif .55 <= pairSeqIdThr <= .70:
                n = 4
            elif .50 <= pairSeqIdThr < .55:
                n = 3
            elif .40 <= pairSeqIdThr < .50:
                n = 2
            else:
                raise ValueError("Error, just .4<=pairSeqIdThr<=1.00 allowed")

            cdhitCmd = [
                self.cdHitBin, "-i", cdhitInName, "-o", cdhitOutName, "-n",
                str(n), "-c",
                str(pairSeqIdThr), "-T",
                str(self.psiBlastNThrs)
            ]
            print(" ".join(cdhitCmd))
            # universal_newlines=True makes communicate() return str (not
            # bytes) also in python3, so the comparisons below are valid
            proc = Popen(cdhitCmd,
                         stdin=PIPE,
                         stdout=PIPE,
                         stderr=PIPE,
                         universal_newlines=True)
            output = proc.communicate()
            stdoutData, stderrData = output
            if stderrData != "" or "There was an error cd-hit psiblast" in stdoutData:
                print(output)
                print("Error when parsing %s for al2Co" % psiBlastOut)
                raise FeatureComputerException(
                    "Error when cd-hit %s for al2Co" % psiBlastOut)

            # Rewrite the output in place so that the query is the first record
            with open(cdhitOutName, "r+") as f:
                fileData = f.read()
                f.seek(0, 0)
                f.write("> InputSeq\n")
                f.write("%s\n" % (inputSeq.replace("-", "")))
                f.write(fileData + "\n")
            return cdhitOutName
        except (Exception, KeyboardInterrupt):
            tryToRemove(cdhitOutName)
            raise
        finally:
            tryToRemove(cdhitInName)
예제 #14
0
  def compute(self, HHBlitsFnamesDict, prefix):
    '''
      Computes corrMut for every ligand chain - receptor chain combination. For each pair, a
      paired alignment is created (self.createPairedAlignmet), the corrMut program is launched
      if the alignment has more than CorrMutGeneric.MIN_N_SEQS_MSA rows and results are saved
      with self.saveProcResults. Pairs already computed are skipped. The temporary paired
      alignment file is always removed.
      @param HHBlitsFnamesDict: {"l":{"A":"1A2K_l_A_u.a3m"}, "r":{"B":"1A2K_r_B_u.a3m", "C":"1A2K_r_C_u.a3m"}}
      @param prefix: str. The prefix of the complex, p.e. 1A2K
      @raise: any exception is re-raised after trying to remove the results file of the pair
    '''

    # chainType -> chainId -> loaded alignment
    aligsDict= {}
    for chainType in HHBlitsFnamesDict:
      loadedChains= {}
      for chainId in HHBlitsFnamesDict[chainType]:
        loadedChains[chainId]= self.loadOneAligFile(HHBlitsFnamesDict[chainType][chainId])
      aligsDict[chainType]= loadedChains

    for chainIdL in aligsDict["l"]:
      for chainIdR in aligsDict["r"]:
        print("launching corrMut over chains %s - %s"%(chainIdL, chainIdR))
        tmpBaseName= "tmp_"+prefix+"_l-"+chainIdL+"-r-"+chainIdR+"_u.ali"
        aligFormatedName= os.path.join(self.corrMutOutPath, tmpBaseName)
        try:
          outBaseName= prefix+"_l-"+chainIdL+"_r-"+chainIdR+"_u.corrMut"
          corrMutOutName= os.path.join(self.corrMutOutPath, outBaseName)
          if self.checkAlreayComputed(corrMutOutName):
            print("%s already computed"%corrMutOutName)
            continue
          aligOut= self.createPairedAlignmet(aligsDict["l"][chainIdL], aligsDict["r"][chainIdR],
                                             aligFormatedName)
          if not aligOut:
            # no paired alignment available: fall back to the stored sequences
            nAlig=0
            seqL, __= self.seqsManager.getSeq("l", chainIdL)
            seqR, __= self.seqsManager.getSeq("r", chainIdR)
          else:
            __, __, nAlig, seqL, seqR= aligOut

          iterOfCorrelatedRows= None
          if nAlig> CorrMutGeneric.MIN_N_SEQS_MSA:
            startTime= time.time()
            iterOfCorrelatedRows= self.lauchCorrMutProgram(aligFormatedName)
            print("Time CorrMut", time.time()- startTime)
          self.saveProcResults(seqL, seqR, corrMutOutName, iterOfCorrelatedRows, chainIdL, chainIdR, nAlig)
        except (KeyboardInterrupt, Exception):
          print("Exception happend computing corrMut for %s over chains %s - %s"%(prefix, chainIdL, chainIdR))
          tryToRemove(corrMutOutName)
          raise
        finally:
          tryToRemove(aligFormatedName)
예제 #15
0
  def contactMapOneComplex(self):
    '''
      Writes the contact map file of a complex to self.outName. One row is written for
      each pair of residues (ligand residue, receptor residue) read from the fasta files
      self.lFname and self.rFname, skipping residues not contained in d1_to_index.
      Chain ids are always "L" and "R", residue ids are the 0-based sequence positions and
      the category is always nan.
      Nothing is computed if self.outName already exists.

      @return 0 if the contact map was already computed, None otherwise
      @raise BadNumberOfResidues: if the number of residues of a partner is not strictly
             between self.minNumResiduesPartner and self.maxNumResiduesPartner
      @raise: any exception raised while writing is re-raised after trying to remove self.outName
    '''
    outName= self.outName
    print (outName)
    if os.path.isfile(outName):
      print ('Already computed contact map')
      return 0

    seqL =  self.parseFasta(self.lFname, inputNumber="1")
    seqR =  self.parseFasta( self.rFname, inputNumber="2")
    nResiduesL= len(seqL)
    nResiduesR= len(seqR)
    if not (self.minNumResiduesPartner< nResiduesL < self.maxNumResiduesPartner):
      raise BadNumberOfResidues(nResiduesL, "1")
    if not (self.minNumResiduesPartner< nResiduesR < self.maxNumResiduesPartner):
      # report the size of the partner that actually failed the check (receptor)
      raise BadNumberOfResidues(nResiduesR, "2")

    chainIdL= "L"
    chainIdR= "R"
    categ= np.nan
    try:
      # the file is closed (with) before any attempt to remove it on error
      with open(outName,"w") as outFile:
        outFile.write("chainIdL structResIdL resNameL chainIdR structResIdR resNameR categ\n")
        # receptor residues are the same for every ligand residue: filter them just once
        validResiduesR= [ (str(ixR), resnameR) for ixR, resnameR in enumerate(seqR)
                                                if resnameR in d1_to_index ]
        for ixL, resnameL in enumerate(seqL):
          if resnameL not in d1_to_index: continue
          resIdL= str(ixL)
          for resIdR, resnameR in validResiduesR:
            outFile.write("%s %s %s %s %s %s %s\n" %(chainIdL, resIdL, resnameL, chainIdR, resIdR, resnameR, categ))
    except (KeyboardInterrupt, Exception):
      print("Exception happend computing %s"%outName)
      tryToRemove(outName)
      raise
예제 #16
0
    def computeOneFile(self, fileName):
        '''
      Computes PSAIA for a given pdb file: removes raw results of previous
      runs, writes the list and config files required by the "psa" binary
      located at self.psaiaRootDir, launches it and processes its output with
      self.processPSAIA. The list and config files are always removed
      afterwards.

      @param fileName: str. fname to pdb file
      @return 0
      @raise FeatureComputerException: if psa wrote anything to stderr or its
             stdout contains the known error message
    '''
        prefixAndChainTypeId = os.path.basename(fileName).split(".pdb")[0]
        listFileName = self.listFileNameForPSAIA % prefixAndChainTypeId
        configFileName = self.configFileName % prefixAndChainTypeId
        try:
            for fname in os.listdir(self.outPathRaw):  #remove old psaia runs
                if fname.startswith(prefixAndChainTypeId):
                    os.remove(os.path.join(self.outPathRaw, fname))
            if self.checkIfAlreadyComputed(prefixAndChainTypeId):
                print("PSAIA already computed")
                return 0
            with open(listFileName, "w") as f:
                f.write(fileName)

            with open(configFileName, "w") as f:
                f.write(self.configForPSAIATemplate)

            # universal_newlines=True: pipes work with str (not bytes) also in
            # python3, as required by the str input and comparisons below
            proc = Popen([
                os.path.join(self.psaiaRootDir, "psa"), configFileName,
                listFileName
            ],
                         stdin=PIPE,
                         stdout=PIPE,
                         stderr=PIPE,
                         universal_newlines=True)
            output = proc.communicate(input="y\n")
            stdoutData, stderrData = output
            if stderrData != "" or "There was an error in PDB" in stdoutData:
                print(output)
                print("Error when computing PSAIA for %s" % fileName)
                raise FeatureComputerException(
                    "Error when computing PSAIA for %s" % fileName)
            self.processPSAIA(prefixAndChainTypeId)
            return 0
        finally:
            tryToRemove(listFileName)
            tryToRemove(configFileName)
예제 #17
0
def getClusterRepresentatives(prefix,
                              filesPattern,
                              outPath,
                              maxPerClus=MAX_ELEMS_PER_CLUS):
    '''
    Merges the pdb files selected by filesPattern into one file and clusters it.

    :param prefix: str. Used to name the merged pdb file (~/tmp/<prefix>.merged.pdb)
    :param filesPattern: str. Pattern forwarded to mergePDBs; "~" is expanded when its
                         directory is inspected
    :param outPath: str. Directory forwarded to clusterize; created through myMakeDir
    :param maxPerClus: int or None. Forwarded to clusterize. If None, half (integer division)
                       of the number of entries matching "*_T*" in the directory of
                       filesPattern is used
    '''
    import glob  # local import: the file-level import block is not visible from here

    outPath = myMakeDir(os.path.expanduser(outPath))
    if maxPerClus is None:
        path = os.path.dirname(os.path.expanduser(filesPattern))
        # Counted in-process instead of `ls ... | wc -l` through a shell: no
        # quoting problems with unusual paths and no dependency on ls output.
        nPos = len(glob.glob(os.path.join(path, "*_T*")))
        # Floor division keeps maxPerClus an int on Python 3 as well.
        maxPerClus = nPos // 2

    mergedFileName = os.path.join(os.path.expanduser("~/tmp"),
                                  prefix + ".merged.pdb")
    try:
        fileNames = mergePDBs(filesPattern, mergedFileName)
    except (Exception, KeyboardInterrupt) as e:
        # Do not leave a partially written merged file behind.
        print("exception:", e)
        tryToRemove(mergedFileName)
        raise
    print(fileNames)
    clusterize(mergedFileName, prefix, fileNames, outPath, maxPerClus)
예제 #18
0
    def writeGzResults(self, outName, headerStr, listOfRecords):
        '''
    Used to write a list of records as a .gz file

    The data is first written to "tmp-<basename>" in the same directory and then moved
    to outName with tryToMove. On any error both files are removed and the exception
    is re-raised.

    :param outName: str. path where results will be saved
    :param headerStr: str. string that will be written at the beginning of the file.
                      No separator is added between it and the first record
    :param listOfRecords: a list of str that represents the rows of the dataframe
                 e.g [ "A 123 L B 2 I 0.1 0.8 -1", ...]. Rows are joined with "\\n"
    '''
        dirName, baseName = os.path.split(outName)
        tmpOutName = os.path.join(dirName, "tmp-" + baseName)
        try:
            payload = headerStr + "\n".join(listOfRecords)
            # The gzip stream is binary: text must be encoded before writing
            # (mandatory on Python 3; Python 2 `str` is already bytes).
            if not isinstance(payload, bytes):
                payload = payload.encode("utf-8")
            with gzip.open(tmpOutName, "wb") as outFile:
                outFile.write(payload)
            tryToMove(tmpOutName, outName)
        except (KeyboardInterrupt, Exception):
            tryToRemove(outName)
            tryToRemove(tmpOutName)
            raise
예제 #19
0
def clusterize(mergedFileName, prefix, fileNames, outPath, maxPerClus):
    '''
    Runs `gmx cluster` over the merged pdb file (unless its log already exists) and
    calls processOneCluster once per cluster found in the log.

    :param mergedFileName: str. Path of the merged pdb file given to gmx (-f and -s)
    :param prefix: str. Used to name the log file (~/tmp/<prefix>.clusters.log)
    :param fileNames: forwarded untouched to processOneCluster
    :param outPath: forwarded untouched to processOneCluster
    :param maxPerClus: forwarded untouched to processOneCluster
    '''
    tmpDir = os.path.expanduser("~/tmp")
    logsClusters = os.path.join(tmpDir, prefix + ".clusters.log")
    cmd = (
        "echo 5 | %(gmx)s cluster -f %(mergedFileName)s -s %(mergedFileName)s "
        + "-cutoff %(distCut)f -g %(logsClusters)s -nofit -method gromos"
    ) % {
        "gmx": GMX_PATH,
        "mergedFileName": mergedFileName,
        "distCut": DIST_CUT,
        "logsClusters": logsClusters,
    }

    if not os.path.isfile(logsClusters):
        print(cmd)
        cwd = os.getcwd()
        # The command is executed from ~/tmp; the previous working directory
        # is restored whatever happens so the caller's cwd is never changed.
        os.chdir(tmpDir)
        try:
            check_output(cmd, shell=True)
        except (Exception, KeyboardInterrupt):
            # A partial log would be taken for a finished run next time.
            tryToRemove(logsClusters)
            raise
        finally:
            os.chdir(cwd)

    centroid = None
    members = []
    with open(logsClusters) as f:
        # Skip everything up to (and including) the table header.
        for line in f:
            if line.startswith("cl."):
                break
        for line in f:
            lineArray = line.split("|")
            print(lineArray)
            if lineArray[0].strip().isdigit():
                # First row of a new cluster: flush the previous one.
                if centroid is not None and members:
                    processOneCluster(fileNames, centroid, members, outPath,
                                      maxPerClus)
                centroid = lineArray[2].split(".")[0]
                # presumably the third column holds "<centroid> <rmsd>", so
                # the text before "." may end with the first rmsd digit,
                # which is dropped here -- verify against gmx log format
                if centroid[-1].isdigit():
                    centroid = centroid[:-1]
                centroid = int(centroid)
                members = [int(elem) for elem in lineArray[-1].split()]
            elif centroid is not None:
                # Continuation row: more members of the current cluster.
                members += [int(elem) for elem in lineArray[-1].split()]
    if centroid is not None:
        processOneCluster(fileNames, centroid, members, outPath, maxPerClus)
예제 #20
0
 def processHhblits(self, seq, prefixExtended, profileNameRaw,
                    profileNameProc):
     '''
   Reads hhblits profile output file and writes another one with tabulated format, headers and
   some error checking.

   If the raw profile cannot be read (IOError), every residue gets 31 copies of
   HHBlitsManager.BAD_SCORE_CONSERVATION. If writing fails, profileNameProc is removed and
   the exception is re-raised.

   @param seq: str. Sequence of the chain
   @param prefixExtended: str. unambiguous id of the sequence that will be the prefix of output names.
                          Must consist of exactly four "_"-separated fields
   @param profileNameRaw: str.  Path to profiles results
   @param profileNameProc: str. Path where formated results will be saved.
 '''
     try:
         hhBlitsData = self.loadHhblits(profileNameRaw)
     except IOError:
         badRow = " ".join([HHBlitsManager.BAD_SCORE_CONSERVATION] * 31)
         hhBlitsData = [badRow for _ in seq]
     prefix, chainType, chainId, __ = prefixExtended.split("_")
     try:
         # `with` closes the handle even when a row fails, so the partial file
         # can be removed afterwards without leaking the descriptor.
         with open(profileNameProc, "w") as outFile:
             outFile.write("chainId seqIndex structResId resName " +
                           "hhblits " * 31 + "\n")
             for i, (hhBlitsArrayJoined,
                     letter) in enumerate(zip(hhBlitsData, seq)):
                 structIndex = self.seqsManager.seqToStructIndex(
                     chainType, chainId, i, asString=True)
                 # Optionally skip residues whose structural id ends in a
                 # letter.
                 if self.filterOutLabels and structIndex[-1].isalpha():
                     continue
                 outFile.write("%s %d %s %s " %
                               (chainId, i, structIndex, letter) +
                               hhBlitsArrayJoined + "\n")
     except (KeyboardInterrupt, Exception):
         print("Exception happend computing %s" % profileNameProc)
         tryToRemove(profileNameProc)
         raise
예제 #21
0
    def createFileForError(self, pdbStruct, outName):
        '''
      Creates a fake DSSP raw output generated when DSSP fails. All residues will be assigned secStruc= Z

      The file starts with DsspComputer.DSSP_HEADER followed by one line per amino acid
      residue of the first model. If anything fails, outName is removed and the
      exception is re-raised.

      @param pdbStruct: Bio.PDB.Structure. Structure of the pdb that is being analyzed
      @param outName: str. output fname
      @return: int. 0 on success
      @raise NoValidPDBFile: if pdbStruct contains no models
    '''
        oneResLine = "%5d%5d%2s%2s %2s\n"
        fakeSecStruct = "Z"
        try:
            # `with` ensures the handle is closed before the cleanup below.
            with open(outName, "w") as f:
                f.write(DsspComputer.DSSP_HEADER)
                if len(pdbStruct) == 0:
                    raise NoValidPDBFile(
                        "No valid pdb File. There are no models contained")
                for chain in pdbStruct[0]:
                    for i, res in enumerate(chain):
                        if not is_aa(res):
                            continue
                        # seqIndex is the 1-based position within the chain,
                        # counting the skipped non amino acid entries too.
                        seqIndex = i + 1
                        structIndex = res.get_id()[1]
                        letter = self.threeLetterAA_to_one(res.resname)
                        f.write(oneResLine %
                                (seqIndex, structIndex, chain.get_id(),
                                 letter, fakeSecStruct))
        except (KeyboardInterrupt, Exception):
            print("Exception happend computing %s" % outName)
            tryToRemove(outName)
            raise
        return 0
예제 #22
0
    def computeOneFile(self, pdbFname, struct):
        '''
      Launches the PSAIA `psa` binary over one pdb file and post-processes its output.

      Nothing is done when self.checkAlreayComputed reports the prefix as available.
      Temporary list/config files, the uncompressed pdb copy and the raw output directory
      are cleaned up in every case; on failure self.tryToRemoveAllFnames is called too.

      :param pdbFname: str. fname to pdb file
      :param struct: ignored
      :return: int. 0 when results already existed or were computed
      :raise FeatureComputerException: if psa wrote to stderr or reported an error in the PDB
    '''
        assert isinstance(
            pdbFname, str
        ), "Error, PSAIA computeOneFile first argument is a path to pdb file (str). given %s" % pdbFname
        extPrefix = self.getExtendedPrefix(pdbFname)
        # The values are not used afterwards; the unpacking is kept because it
        # fails early on a malformed prefix.
        _prefix, _chainType = self.splitExtendedPrefix(extPrefix)[:2]

        if self.checkAlreayComputed(extPrefix):
            print("Psaia already computed for %s" % extPrefix)
            return 0
        print("launching PSAIA over %s" % extPrefix)

        plainPdbFname = self.uncompressFile(pdbFname, self.tmp)
        try:
            listFname = self.listFileNameForPSAIA % extPrefix
            with open(listFname, "w") as listHandle:
                listHandle.write(plainPdbFname)

            cfgFname = self.configFileName % extPrefix
            with open(cfgFname, "w") as cfgHandle:
                cfgHandle.write(self.configForPSAIATemplate)

            psaArgs = [
                os.path.join(self.psaiaRootDir, "psa"), cfgFname, listFname
            ]
            psaProc = Popen(psaArgs, stdin=PIPE, stdout=PIPE, stderr=PIPE)
            # "y" is sent to psa's stdin
            result = psaProc.communicate(input="y\n")
            failed = (result is None or result[1] != ""
                      or "There was an error in PDB" in result[0])
            if not failed:
                self.processPSAIA(extPrefix)
                return 0
            print(result)
            print("Error when computing PSAIA for %s" % pdbFname)
            raise FeatureComputerException(
                "Error when computing PSAIA for %s" % pdbFname)
        except (Exception, KeyboardInterrupt):
            self.tryToRemoveAllFnames(extPrefix)
            raise
        finally:
            tryToRemove(self.listFileNameForPSAIA % extPrefix)
            tryToRemove(self.configFileName % extPrefix)
            tryToRemove(plainPdbFname)
            tryToCleanDir(self.outPathRaw,
                          extPrefix,
                          rootDataDir=self.computedFeatsRootDir)
예제 #23
0
def trainAndTestOneFold(trainData, testPrefixes, trainSubsetN, testPath, outputPath, verbose=False, ncpu=1):
  '''
    Trains and tests one fold

     Complexes whose results file already exists are loaded from disk instead of being
     predicted again. The classifier is cached in a temporary file while the fold is
     processed; that file is removed before returning.

     :param trainData: a numpy array for training with first column labels and the others are features
     :param testPrefixes: str[]. A list that contains prefixes for all complexes to be tested
     :param trainSubsetN: int Tuple. The numerical ids of the training split.
     :param testPath: str. Path to a dir where testing data files are stored
     :param outputPath: str. Path to a dir where predictions will be stored. None if results will not be saved
     :param verbose: boolean. Whether or not print to stdout info
     :param ncpu: int. Number of cpu's to use in parallel
     :return: (finalResults, modelo). finalResults holds the second element of every
              (prefix, result) pair that was predicted or loaded ([] if there were none);
              modelo is the classifier used, or None if none was loaded or trained
  '''

  testPrefixesNotEvaluated = []
  alreadyComputedPrefixes_and_outnames = []
  originalTestPrefixToNewPrefix, __ = getOriginalToActualPrefixs(testPrefixes)
  for testPrefix in originalTestPrefixToNewPrefix:
    if outputPath is None:
      testPrefixesNotEvaluated.append((testPrefix, None))
      continue
    outName = getResultsOutname(outputPath, testPrefix, trainSubsetN)
    # Whether existing results are reused must not depend on verbosity:
    # `verbose` only controls the message.
    if os.path.isfile(outName):
      if verbose:
        print("Complex already computed: %s" % (outName))
      alreadyComputedPrefixes_and_outnames.append((testPrefix, outName))
    else:
      testPrefixesNotEvaluated.append((testPrefix, outName))

  modelo = None

  from Config import Configuration
  conf = Configuration()
  # md5 needs bytes on Python 3; for ASCII prefixes the digest is the same
  # one Python 2 computes from the plain str.
  prefixesDigest = hashlib.md5("".join(sorted(testPrefixes)).encode("utf-8")).hexdigest()
  modelFname = os.path.join(conf.tmp, prefixesDigest + str(trainSubsetN) + "bipspi2.pckl")

  resultsForEvaluation_list = []
  if len(testPrefixesNotEvaluated) > 0 or len(testPrefixes) == 0:
    if verbose:
      print("Testing:", [x[0] for x in testPrefixesNotEvaluated])
      verboseLevel = 1
    else:
      verboseLevel = 0

    if os.path.exists(modelFname):
      print("Loading classifier")
      modelo = joblib_load(modelFname)
    else:
      print("Training classifier")
      modelo = trainMethod(trainData[:, 1:], trainData[:, 0], verboseLevel=verboseLevel, ncpu=ncpu)
      joblib_save(modelo, modelFname)
    # Drop the local reference to the training data before predicting.
    del trainData
    gc.collect()
    if verbose: print("Classifier fitted.")

    # The number of workers is bounded by cpus, by estimated memory per
    # complex and by the number of complexes.
    expectedSize = estimateRequiredMemoryPerComplex(testPrefixesNotEvaluated, testPath)
    freeMem = checkFreeMemory()
    nJobs = int(max(1, min(ncpu, freeMem / expectedSize, len(testPrefixesNotEvaluated))))
    print("Free memory for predictOnePrefix: %s GB. Njobs: %s (%s expected size)" % (freeMem, nJobs, expectedSize))

    resultsForEvaluation_list = Parallel(n_jobs=nJobs)(
        delayed(predictOnePrefix)(originalTestPrefixToNewPrefix[testPrefix], modelo, outName, testPath)
        for testPrefix, outName in testPrefixesNotEvaluated)
    gc.collect()

  expectedSize = estimateRequiredMemoryPerComplex(alreadyComputedPrefixes_and_outnames, testPath)
  freeMem = checkFreeMemory()
  nJobs = int(max(1, min(ncpu, freeMem / expectedSize, len(alreadyComputedPrefixes_and_outnames))))
  resultsForEvaluation_list += Parallel(n_jobs=nJobs)(
      delayed(loadExistingResults)(testPrefix, outName)
      for testPrefix, outName in alreadyComputedPrefixes_and_outnames)

  if len(resultsForEvaluation_list) > 0:
    freeMem = checkFreeMemory()
    totMem = getTotalMemory()
    usedMem = totMem - freeMem
    nJobs = int(max(1, min(ncpu, freeMem / (usedMem / (1 + len(resultsForEvaluation_list))))))
    print("Free memory for evaluateOneResultObj: %s GB. Njobs: %s" % (freeMem, nJobs))
    Parallel(n_jobs=nJobs)(delayed(evaluateOneResultObj)(testPrefix, resultObj, False)
                           for testPrefix, resultObj in resultsForEvaluation_list)
    # Same tuple `zip(*pairs)[1]` produced on Python 2, built without
    # subscripting a zip object (not possible on Python 3).
    finalResults = tuple(pair[1] for pair in resultsForEvaluation_list)
  else:
    finalResults = []
  del resultsForEvaluation_list
  tryToRemove(modelFname)
  return finalResults, modelo
예제 #24
0
  def processspider2(self, seq, prefixExtended, spider2Raw, spider2Proc):
    '''
      Reads spider2 output file and writes another one with tabulated format, headers and
      some error checking.

      The sequence and the predictions are walked in parallel: "-" entries in the
      predictions are skipped (consuming an "X" of the sequence when there is one) and any
      other disagreement raises ValueError. If the raw file cannot be read (IOError), every
      residue gets four copies of Spider2Manager.BAD_SCORE_CONSERVATION. spider2Raw is always
      removed at the end; spider2Proc is removed if writing fails.

      @param seq: str. Sequence of the chain
      @param prefixExtended: str. unambiguous id of the sequence that will be the prefix of output names.
                             Must consist of exactly four "_"-separated fields
      @param spider2Raw: str. Path to spider2 results
      @param spider2Proc: str. Path where formated results will be saved.
        head spider2Proc
        chainId seqIndex structResId resName score_asa score_Pc score_Pe score_Ph
        A 0 6 E 146.4 0.982 0.011 0.006
        A 1 7 P 100.2 0.977 0.012 0.012
      @raise ValueError: if a prediction letter does not match the sequence
    '''
    try:
      predictionsData = self.loadspider2(spider2Raw)
    except IOError:
      predictionsData = [(letter, (tuple([Spider2Manager.BAD_SCORE_CONSERVATION] * 4))) for letter in seq]
    prefix, chainType, chainId, __ = prefixExtended.split("_")
    try:
      # `with` closes the handle before the partial file is removed on error.
      with open(spider2Proc, "w") as outFile:
        outFile.write("chainId seqIndex structResId resName score_asa score_Pc score_Pe score_Ph\n")

        predsIx = 0
        seqIx = 0
        seqLen = len(seq)
        alcoLen = len(predictionsData)
        while seqIx < seqLen and predsIx < alcoLen:
          letter = seq[seqIx]
          letterspider2, consValTuple = predictionsData[predsIx]
          if letterspider2 == letter:
            structIndex = self.seqsManager.seqToStructIndex(chainType, chainId, seqIx, asString=True)
            # A filtered residue is simply not written. Both cursors must
            # still advance, otherwise the loop would never terminate.
            if not (self.filterOutLabels and structIndex[-1].isalpha()):
              outFile.write("%s %d %s %s %s %s %s %s\n" % ((chainId, seqIx, structIndex, letter) + consValTuple))
            predsIx += 1
            seqIx += 1
          elif letter == "X" and letterspider2 == "-":
            predsIx += 1
            seqIx += 1
          elif letterspider2 == "-":
            predsIx += 1
          else:
            print(predictionsData)
            print(seq)
            print(predsIx, seqIx)
            raise ValueError("spider2 mismatch %s %s " % (letterspider2, letter))
    except (KeyboardInterrupt, Exception):
      print("Exception happend computing %s" % spider2Proc)
      tryToRemove(spider2Proc)
      raise
    finally:
      tryToRemove(spider2Raw)
예제 #25
0
    def computeOneFile(self, fileName):
        '''
      Computes half sphere exposure features (HSExposureCA, HSExposureCB and ExposureCN)
      for a given pdb file and writes one "<prefix>_<chainId>_u.hse" file per chain.

      Chains with 5 or fewer amino acid residues, or whose output file already exists, are
      skipped. Missing values are written as -1. If writing fails, all the output files of
      this call are removed and the exception is re-raised.

      @param fileName: str. fname to pdb file
      @return: int. 0 when the files were written or there was nothing to compute
    '''

        prefixAndChainTypeId = (fileName.split("/")[-1]).split(".pdb")[0]
        prefixAndChainTypeId = "_".join(prefixAndChainTypeId.split("_")[:2])
        structure = self.parser.get_structure(prefixAndChainTypeId, fileName)
        model = structure[0]
        outNames = {}
        for chain in model:
            chainId = chain.get_id()
            nResidues = sum(1 for res in chain if is_aa(res))
            if chainId == " ":
                chainId = "*"
            outName = os.path.join(
                self.outPath, prefixAndChainTypeId + "_" + chainId + "_u.hse")
            if nResidues > 5 and not os.path.isfile(outName):
                outNames[chainId] = outName
        if len(outNames) == 0:
            print("HalfSphere already computed")
            return 0

        def _noneToBad(values):
            # -1 marks a value that could not be computed
            return [elem if elem is not None else -1 for elem in values]

        featuresCA = {aa: _noneToBad(feat) for aa, feat in HSExposureCA(model)}
        featuresCB = {aa: _noneToBad(feat) for aa, feat in HSExposureCB(model)}
        # Always a one-element list, so it can be concatenated below (a bare
        # -1 here would make `list + int` fail).
        featuresCN = {
            aa: [feat if feat is not None else -1]
            for aa, feat in ExposureCN(model)
        }

        # Defaults for residues missing from one of the three calculations.
        badHSE = [-1, -1, -1.0]
        badCN = [-1]
        featuresDict = {}
        for aa in set(featuresCA) | set(featuresCB) | set(featuresCN):
            featuresDict[aa] = (featuresCA.get(aa, badHSE) +
                                featuresCB.get(aa, badHSE) +
                                featuresCN.get(aa, badCN))

        header = (
            "chainId structResId resName HSExposureCA1 HSExposureCA2 HSExposureCA3"
            + " HSExposureCB1 HSExposureCB2 HSExposureCB3 ExposureCN\n")
        resuisduesList = [
            res for chain in model for res in chain if is_aa(res)
        ]
        # Row used for residues without features; 7 matches the number of
        # feature columns in the header when no residue has features at all.
        if featuresDict:
            nFeatures = len(next(iter(featuresDict.values())))
        else:
            nFeatures = 7
        badExample = [-1] * nFeatures

        filesHandlers = {}
        try:
            try:
                for chainId, outName in outNames.items():
                    filesHandlers[chainId] = open(outName, "w")
                    filesHandlers[chainId].write(header)
                for res in resuisduesList:
                    structId, modelId, chainId, resId = res.get_full_id()
                    # The first field of the residue id is dropped, the second
                    # one is stringified and joined with the remaining ones.
                    resId = list(resId)
                    resId[1] = str(resId[1])
                    resId = "".join(resId[1:])
                    resName = self.threeLetterAA_to_one(res.resname)

                    if chainId == " ":
                        chainId = "*"
                    if chainId not in filesHandlers:
                        continue  # small chains
                    valuesForRes = featuresDict.get(res, badExample)
                    filesHandlers[chainId].write(
                        chainId + " " + resId + " " + resName + " " +
                        " ".join([str(val) for val in valuesForRes]) + "\n")
            finally:
                # Close every opened handle, also on error, before any
                # removal of the files takes place.
                for fHand in filesHandlers.values():
                    fHand.close()
        except (KeyboardInterrupt, Exception):
            for outName in outNames.values():
                print("Exception happend computing %s" % outName)
                tryToRemove(outName)
            raise

        return 0
예제 #26
0
    def processDSSP(self, prefixAndChainTypeId, pdbStruct, dsspFName):
        '''
      Parses raw output from DSSP and creates results in tab format, one file per chain.

      Standard amino acid residues of the structure that do not appear in the DSSP output
      are appended to their own chain with secondary structure "Z". A blank secondary
      structure in the DSSP output is also written as "Z" and a blank chain id as "*".

      @param prefixAndChainTypeId: str. prefix used to build the output names. If it has exactly
                                  three "_"-separated fields only the first two are kept
      @param pdbStruct: Bio.PDB.Structure. Structure of the pdb that is being analyzed
      @param dsspFName: str. fname of raw Dssp output
      @raise Exception: if the residues table header is not found in dsspFName
    '''
        # chainId -> set of (resId, resLetter) not yet seen in the DSSP output
        resDict = {}
        for chain in pdbStruct[0]:
            chainId = chain.get_id()
            if chainId == " ":
                chainId = "*"
            resDict[chainId] = set()
            for res in chain:
                if not is_aa(res, standard=True):
                    continue
                resId = str(res.get_id()[1])
                if res.get_id()[2] != " ":
                    resId += res.get_id()[2]
                try:
                    resDict[chainId].add(
                        (resId, self.threeLetterAA_to_one(res.resname)))
                except KeyError:
                    continue

        header = ("#Levels: 3:H;B;E;G;I;T;S;Z\n" +
                  "chainId structResId resName 2ndStruct\n")

        # chainId -> list of output rows
        stringDict = {}
        with open(dsspFName) as f:
            # Skip everything up to the header of the residues table.
            headerFound = False
            for line in f:
                if line.startswith('  #  RESIDUE '):
                    headerFound = True
                    break
            if not headerFound:
                raise Exception("Error processing DSSP file " + dsspFName)

            for line in f:
                secStruc = line[16]
                if secStruc == " ":
                    secStruc = "Z"
                lineArray = line.split()
                dsspChain = line[11]
                resLetter = line[13]
                # lower case residue letters are written as "C"
                if resLetter.islower():
                    resLetter = "C"
                resInd = lineArray[1]
                if dsspChain == " ":
                    dsspChain = "*"
                if resInd.startswith("!"):  # Chain changed
                    continue
                resDict[dsspChain].discard((resInd, resLetter))
                out = "\t".join([dsspChain, resInd, resLetter, secStruc])
                stringDict.setdefault(dsspChain, []).append(out)

        # To fill residues for which we do not have enough information.
        # Rows go to the chain the residue belongs to, not to whichever chain
        # was parsed last.
        for chainId in resDict:
            for resId, resLetter in sorted(resDict[chainId]):
                out = "\t".join([chainId, resId, resLetter, "Z"])
                stringDict.setdefault(chainId, []).append(out)

        splitName = prefixAndChainTypeId.split("_")
        for chainId in stringDict:
            if len(splitName) == 3:
                prefix, chainType, unbound = splitName
                outName = os.path.join(
                    self.outPathProc,
                    prefix + "_" + chainType + "_" + chainId + "_u.dssp.tab")
            else:
                outName = os.path.join(
                    self.outPathProc,
                    prefixAndChainTypeId + "_" + chainId + "_u.dssp.tab")

            try:
                with open(outName, "w") as outFile:
                    outFile.write(header)
                    outFile.write("\n".join(stringDict[chainId]))
            except (KeyboardInterrupt, Exception):
                print("Exception happend computing %s" % outName)
                tryToRemove(outName)
                raise
예제 #27
0
    def computeFromSeqStructMapper(self, seqStructMap, prefixExtended,
                                   psiblastOutName, pssmOutNameRaw):
        '''
      Runs al2co for the sequence identified by prefixExtended and post-processes the result.

      When the psiblast alignment file exists, the aligned sequences are parsed, filtered with
      self.runCdHit, aligned with self.runClustalW and passed to the al2co binary. Otherwise
      self.processAl2co is called with no raw file. The intermediate msa file is always removed;
      on failure self.tryToRemoveAllFnames is called and the exception is re-raised.

      :param seqStructMap: computeFeatures.seqStep.seqToolManagers.seqExtraction.SeqStructMapper
      :param prefixExtended: str. unambiguous id of the sequence that will be the prefix of output names
      :param psiblastOutName: str. Path to psiblast aligments results
      :param pssmOutNameRaw: str. Path to psiblast pssms results
      :return: 0 if results were already computed, None otherwise
    '''
        _prefix, chainType, chainId = self.splitExtendedPrefix(
            prefixExtended)[:3]
        # repeat as psiBlastManager can modify seqs
        seqStr, _fastaFname = seqStructMap.getSeq(chainType, chainId)
        seqStructMap.setCurrentSeq(seqStr, chainType, chainId)
        if self.checkAlreayComputed(prefixExtended):
            print("Al2co already computed for %s" % prefixExtended)
            return 0

        outFnames = self.getFNames(prefixExtended)
        print("launching al2co over %s" % prefixExtended)
        procName = outFnames[0]
        rawName = os.path.join(self.al2coOutPath,
                               prefixExtended + ".fasta.csv")
        alignmentFname = None
        try:
            if not os.path.isfile(psiblastOutName):
                print("Error computing al2co. Psiout does not exists for %s" %
                      (prefixExtended))
                # no raw file will be available for processAl2co
                rawName = None
            else:
                hitsDict = parsePsiBlast(inputSeq=seqStr,
                                         psiBlastOut=psiblastOutName)
                nonRedundantFname = self.runCdHit(hitsDict,
                                                  inputSeq=seqStr,
                                                  psiBlastOut=psiblastOutName)
                alignmentFname = self.runClustalW(nonRedundantFname,
                                                  psiBlastOut=psiblastOutName)

                fixedOptions = "-m 0 -f 2 -a F -b 50 -g 0.50 -w 1 -c 0"
                al2coArgs = ([self.al2coBin, "-i", alignmentFname] +
                             fixedOptions.split() +
                             ["-o", rawName, "-t", procName])
                print(" ".join(al2coArgs))
                streams = Popen(al2coArgs, stdout=PIPE,
                                stderr=PIPE).communicate()
                if streams[1]:
                    print("Error computing al2co. Caught stdin/stderr:\n",
                          streams[0], streams[1])

            rows = self.processAl2co(seqStr, seqStructMap, prefixExtended,
                                     rawName, procName)
            if self.winSize:
                badValues = [Al2coManager.BAD_SCORE_CONSERVATION] * 2
                self.makeWindowed(rows, ["al2co", "al2coNorm"], badValues,
                                  [None] * 2, outFnames[1])
        except (Exception, KeyboardInterrupt):
            self.tryToRemoveAllFnames(prefixExtended)
            raise
        finally:
            if alignmentFname:
                tryToRemove(alignmentFname)
예제 #28
0
    def contactMapOneComplex(self):
        '''
      Computes the contact map of a complex. Initial input for complex codification. Contact map is a file written at
      self.computedFeatsRootDir/common/contactMaps/ with name prefix.cMap.tab where prefix is either the common name of
      ligand and receptor pdb files or the concatenation of ligand and receptor names.
      1A2K_l_u.pdb and 1A2K_r_u.pdb  --> 1A2K.cMap.tab
      1A2K_l_u.pdb and 1A22.pdb  --> 1A2K-1A22.cMap.tab

      One row is written per (ligand residue, receptor residue) pair with categ 1 for pairs in
      contact, -1 otherwise, and nan when self.isForPrediction is set. Residues whose name cannot
      be converted to one letter, or differs between the two mapped structures, are skipped.
      If writing fails, the output file is removed and the exception is re-raised.

      @return: 0 if the contact map already existed, None otherwise
      @raise BadNumberOfResidues: if the number of mapped residues of a partner is not strictly
                                  between self.minNumResiduesPartner and self.maxNumResiduesPartner
    '''
        outName = self.outName
        print(outName)
        if os.path.isfile(outName):
            print('Already computed contact map')
            return 0
        lStructId = self.prefixL + "_l_u.pdb"
        rStructId = self.prefixR + "_r_u.pdb"
        structureL_u = self.parser.get_structure(lStructId, self.lFname)
        structureR_u = self.parser.get_structure(rStructId, self.rFname)

        structureL_b = None
        structureR_b = None
        # The explicit `== False` is preserved so that non-bool values of
        # boundAvailable behave exactly as before.
        if not (self.boundAvailable == False or self.isForPrediction):
            try:
                lStructId_b = self.prefix + "_l_b.pdb"
                rStructId_b = self.prefix + "_r_b.pdb"
                lFname_b = os.path.join(
                    os.path.split(self.lFname)[0], lStructId_b)
                rFname_b = os.path.join(
                    os.path.split(self.rFname)[0], rStructId_b)
                structureL_b = self.parser.get_structure(lStructId_b, lFname_b)
                structureR_b = self.parser.get_structure(rStructId_b, rFname_b)
            except IOError:  # in this case there are just unbound pdbs available
                structureL_b = None
                structureR_b = None

        if self.isForPrediction:
            positiveContacts = None
            chainsNotContactR = set()
            chainsNotContactL = set()
        elif structureL_b is None or structureR_b is None:
            # No bound structures: contacts are computed on the unbound ones.
            positiveContacts, chainsNotContactL, chainsNotContactR = self.getPairsOfResiduesInContact(
                structureL_u, structureR_u)
        else:
            # Bound structures available: contacts are computed on them.
            positiveContacts, chainsNotContactL, chainsNotContactR = self.getPairsOfResiduesInContact(
                structureL_b, structureR_b)

        if JUST_INTERACTING_CHAINS == False:
            chainsNotContactR = set()
            chainsNotContactL = set()

        rResDict = self.mapBoundToUnbound(structureR_u,
                                          structureR_b,
                                          skipBoundChainsIds=chainsNotContactR)
        lResDict = self.mapBoundToUnbound(structureL_u,
                                          structureL_b,
                                          skipBoundChainsIds=chainsNotContactL)
        nResiduesL = len(lResDict)
        nResiduesR = len(rResDict)
        if not (self.minNumResiduesPartner < nResiduesL <
                self.maxNumResiduesPartner):
            raise BadNumberOfResidues(nResiduesL, "1")
        if not (self.minNumResiduesPartner < nResiduesR <
                self.maxNumResiduesPartner):
            raise BadNumberOfResidues(nResiduesR, "2")

        try:
            # `with` closes the handle before the partial file is removed.
            with open(outName, "w") as outFile:
                outFile.write(
                    "chainIdL structResIdL resNameL chainIdR structResIdR resNameR categ\n"
                )

                # Receptor columns do not depend on the ligand residue, so
                # they are prepared once instead of once per ligand residue.
                receptorRows = []
                for resR_bound in sorted(rResDict,
                                         key=lambda x: x.get_full_id()):
                    resR_unbound = rResDict[resR_bound]
                    pdbIdR, modelR, chainIdR, resIdR = resR_unbound.get_full_id(
                    )
                    try:
                        letraR = three_to_one(resR_unbound.resname)
                        if letraR != three_to_one(resR_bound.resname):
                            continue
                    except KeyError:
                        continue
                    if chainIdR == " ":
                        chainIdR = "*"
                    receptorRows.append(
                        (resR_bound.get_full_id(), chainIdR,
                         self.makeStrResId(resIdR), letraR))

                for resL_bound in sorted(lResDict,
                                         key=lambda x: x.get_full_id()):
                    resL_unbound = lResDict[resL_bound]
                    pdbIdL, modelL, chainIdL, resIdL = resL_unbound.get_full_id(
                    )
                    resIdL = self.makeStrResId(resIdL)
                    try:
                        letraL = three_to_one(resL_unbound.resname)
                        if letraL != three_to_one(resL_bound.resname):
                            continue
                    except KeyError:
                        continue
                    if chainIdL == " ":
                        chainIdL = "*"
                    fullIdL = resL_bound.get_full_id()

                    for fullIdR, chainIdR, resIdR, letraR in receptorRows:
                        # Contacts are looked up with the ids of the keys of
                        # the residue maps; the ids written come from the
                        # mapped values.
                        if self.isForPrediction:
                            categ = np.nan
                        elif (fullIdL, fullIdR) in positiveContacts:
                            categ = 1
                        else:
                            categ = -1
                        outFile.write("%s %s %s %s %s %s %s\n" %
                                      (chainIdL, resIdL, letraL, chainIdR,
                                       resIdR, letraR, categ))
        except (KeyboardInterrupt, Exception):
            print("Exception happend computing %s" % outName)
            tryToRemove(outName)
            raise
예제 #29
0
    def processAl2co(self, seq, seqStructMap, prefixExtended, al2coRaw,
                     al2coProc):
        '''
        Reads an al2co output file and writes another one with tabulated format,
        headers and some error checking.

        If al2coRaw is None, or loading it raises IOError, every residue of seq is
        given Al2coManager.BAD_SCORE_CONSERVATION as its conservation score.

        :param seq: str. Sequence of the chain
        :param seqStructMap: object providing seqToStructIndex(chainType, chainId,
                             seqIx, asString=True), used to map sequence positions
                             to structure residue ids
        :param prefixExtended: str. unambiguous id of the sequence that will be the
                               prefix of output names
        :param al2coRaw: str or None. Path to al2co results. It is removed when this
                         method finishes, whether it succeeded or not
        :param al2coProc: str. Path where formatted results will be saved. It is
                          removed if an exception happens
        :return: list of ((chainId, structIndex, letter),
                          ([consVal], [str(consValNormalized)])), one item per
                 residue that was written
        :raises ValueError: if seq and the al2co letters cannot be aligned
    '''
        if al2coRaw is None:
            conserData = [(letter, Al2coManager.BAD_SCORE_CONSERVATION)
                          for letter in seq]
        else:
            try:
                conserData = self.loadRawAl2co(al2coRaw)
            except IOError:
                conserData = [(letter, Al2coManager.BAD_SCORE_CONSERVATION)
                              for letter in seq]

        __, chainType, chainId = self.splitExtendedPrefix(prefixExtended)[:3]
        try:
            alcoIx = 0
            seqIx = 0
            seqLen = len(seq)
            alcoLen = len(conserData)
            __, conserVals = zip(*conserData)
            conserVals = [float(elem) for elem in conserVals]
            # Statistics over the whole al2co profile, used for z-score normalization
            mean_val = np.mean(conserVals)
            std_val = np.std(conserVals)
            dataList = []
            listOfRowsToPrint = []

            # Walk seq and the al2co profile in lockstep
            while seqIx < seqLen and alcoIx < alcoLen:
                letter = seq[seqIx]
                letterAl2co, consVal = conserData[alcoIx]
                if letterAl2co == letter or (letterAl2co == "-"
                                             and letter == "X"):
                    structIndex = seqStructMap.seqToStructIndex(chainType,
                                                                chainId,
                                                                seqIx,
                                                                asString=True)
                    # Residues whose structure id ends with a letter are dropped
                    # when filterOutLabels is set. This must be decided before
                    # the "?" fallback below replaces a missing structIndex.
                    skipResidue = (structIndex and self.filterOutLabels
                                   and structIndex[-1].isalpha())
                    if not structIndex:
                        # No structure residue for this position
                        structIndex = str(seqIx) + "?"
                    if not skipResidue:
                        if std_val != 0:
                            consValNormalized = (float(consVal) -
                                                 mean_val) / std_val
                        else:
                            # Constant profile: normalization is undefined
                            consValNormalized = float(consVal)
                        dataList.append(((chainId, structIndex, letter), (
                            [consVal],
                            [str(consValNormalized)],
                        )))
                        listOfRowsToPrint.append("%s %s %s %s %s" %
                                                 (chainId, structIndex, letter,
                                                  consVal, consValNormalized))
                    # Always advance both cursors, also for skipped residues;
                    # skipping without advancing would never terminate.
                    alcoIx += 1
                    seqIx += 1
                elif not letter in AA_STANDARD and letterAl2co == "-":
                    # Non standard residue aligned to a gap: skip it in both
                    alcoIx += 1
                    seqIx += 1
                elif letterAl2co == "-":
                    # Gap only in the al2co profile
                    alcoIx += 1
                else:
                    print(conserData)
                    print(alcoIx, seqIx)
                    raise ValueError("Al2co mismatch %s %s " %
                                     (letterAl2co, letter))

            self.writeResultsFromDataDictSingleChain(
                {chainId: listOfRowsToPrint}, outName=al2coProc)
            return dataList
        except (KeyboardInterrupt, Exception):
            # Do not leave a partially written output file behind
            print("Exception happend computing %s" % al2coProc)
            tryToRemove(al2coProc)
            raise
        finally:
            if al2coRaw is not None:
                tryToRemove(al2coRaw)
예제 #30
0
    def computeFromSeqStructMapper(self, seqStructMap, extendedPrefix,
                                   HHBlitsFnamesDict):
        '''
        Computes corrMut for every ligand-receptor chain pair, using the Multiple
        Sequence Alignments produced by hhblits after pairing them (by taxa; if
        more than 2 sequences are found for one taxa, just the best match is
        chosen).

        Sets self.chainsL and self.chainsR to the chain ids found in
        HHBlitsFnamesDict. Nothing else is done if checkAlreayComputed() is true
        for extendedPrefix. A chain pair is skipped if its output file already
        exists and has more than self.minNumResiduesPartner lines. The corrMut
        program is launched only if the paired alignment has more than
        CorrMutGeneric.MIN_N_SEQS_MSA sequences; otherwise processResults() is
        called with None instead of the correlated rows.

        :param seqStructMap: computeFeatures.seqStep.seqToolManagers.seqExtraction.SeqStructMapper
        :param extendedPrefix: str. id that splitExtendedPrefix() splits in 4 fields,
                               the first one being the prefix of output names
        :param HHBlitsFnamesDict: {"l":{"A":"1A2K_F0_l_C_.a3m"}, "r":{"B":"1A2K_F0_r_A_.a3m", "C":"1A2K_F0_r_B_.a3m"}}
        :return: None
    '''
        self.chainsL = set(HHBlitsFnamesDict["l"].keys())
        self.chainsR = set(HHBlitsFnamesDict["r"].keys())
        if self.checkAlreayComputed(extendedPrefix):
            print("%s already computed correlated mutations for " %
                  extendedPrefix)
            return

        # Only the prefix is needed, but 4 fields are expected
        prefix, __, __, __ = self.splitExtendedPrefix(extendedPrefix)
        aligsDict = {
            chainType_: {
                chainId_:
                self.loadOneAligFile(HHBlitsFnamesDict[chainType_][chainId_])
                for chainId_ in HHBlitsFnamesDict[chainType_]
            }
            for chainType_ in HHBlitsFnamesDict
        }

        for chainIdL in aligsDict["l"]:
            for chainIdR in aligsDict["r"]:
                aligFormatedName = os.path.join(
                    self.corrMutOutPath, "tmp_" + prefix + "_l-" + chainIdL +
                    "-r-" + chainIdR + "_" + "u.ali")
                # Reset for each pair so that the except clause never sees an
                # unbound name nor the output name of the previous pair.
                corrMutOutName = None
                try:
                    corrMutOutName = self.generateOutName(
                        prefix, chainIdL, chainIdR)
                    if os.path.isfile(corrMutOutName) and self.getNLines(
                            corrMutOutName) > self.minNumResiduesPartner:
                        print("%s already computed" % corrMutOutName)
                        continue
                    print("launching corrMut over chains %s - %s" %
                          (chainIdL, chainIdR))
                    aligOut = self.createPairedAlignmet(
                        aligsDict["l"][chainIdL], aligsDict["r"][chainIdR],
                        aligFormatedName)
                    if aligOut:
                        # Only the number of paired sequences is used here
                        __, __, nAlig, __, __ = aligOut
                    else:
                        nAlig = 0
                    if nAlig > CorrMutGeneric.MIN_N_SEQS_MSA:
                        startTime = time.time()
                        iterOfCorrelatedRows = self.lauchCorrMutProgram(
                            aligFormatedName)
                        print("Time CorrMut", time.time() - startTime)
                    else:
                        # Not enough sequences in MSA
                        iterOfCorrelatedRows = None
                    self.processResults(iterOfCorrelatedRows, seqStructMap,
                                        chainIdL, chainIdR, corrMutOutName,
                                        nAlig)
                except (KeyboardInterrupt, Exception):
                    print(
                        "Exception happend computing corrMut for %s over chains %s - %s"
                        % (prefix, chainIdL, chainIdR))
                    # Do not leave a partially written output file behind
                    if corrMutOutName is not None:
                        tryToRemove(corrMutOutName)
                    raise
                finally:
                    # The temporary paired alignment is never kept
                    tryToRemove(aligFormatedName)