class ExplorePrediction:
    """Draws the figures that illustrate one derived-node / PDB prediction.

    Constructing an instance does all of the work: it parses the report,
    tree and scoring-matrix files found in ``Directory``, renders the
    "TreeAndStates" and "Alignment" figures to PNG through cairosvg and
    writes the coloured structure file into the output directory.
    """

    def __init__(self, Directory, DerivedoI, PDBoI):
        """Parse the inputs and write every figure.

        Args:
            Directory (String): directory holding Report.xml, ModdedTree.nwk
                and ScoringMatrix.xml; a trailing "/" is added when missing.
            DerivedoI (String): derived node of interest.
            PDBoI (String): PDB ID the figures are drawn for.

        Main attributes set here:
            Figures_L (List): names of the figure types.
            FigureSVG_D (Dict): figure name -> list of SVG lines.
            DerivedoInterest / PDBoInterest (String): the two arguments.
            OutputDirectory (String): "<Directory>Figures/<node>-<pdb>/".
            NodeToSeq_D (Dict): node name -> sequence, from Report.xml.
            BranchToAlgorithm_D (Dict): branch name -> ScopeAlgorithm.
            TreeCoords_D (Dict): node name -> [x, y] SVG coordinates.
        """
        # initial setup of what figures will be created
        self.Figures_L = [
            "TreeAndStates", "Alignment", "Structurecartoon",
            "Structuresurface"
        ]
        self.FigureSVG_D = {Key: [] for Key in self.Figures_L}

        self.Directory = Directory
        if not self.Directory.endswith("/"):
            self.Directory = self.Directory + "/"
        self.DerivedoInterest = DerivedoI
        self.PDBoInterest = PDBoI
        print(self.Directory)
        print(self.DerivedoInterest)
        print(self.PDBoInterest)

        # output directory where files will be written; os.makedirs also
        # creates the intermediate "Figures/" directory and, unlike a shell
        # "mkdir" command string, is safe for paths with spaces or shell
        # metacharacters
        self.OutputDirectory = "%sFigures/%s-%s/" % (
            self.Directory, self.DerivedoInterest, self.PDBoInterest)
        if not os.path.exists(self.OutputDirectory):
            os.makedirs(self.OutputDirectory)

        # paths to relevant input files
        self.ReportPATH = self.Directory + "Report.xml"
        self.TreePATH = self.Directory + "ModdedTree.nwk"
        self.MatrixPATH = self.Directory + "ScoringMatrix.xml"

        # parses the report file (read once) for sequences and branches
        with open(self.ReportPATH, "r") as ReportFIn:
            ReportText = ReportFIn.read()
        self.NodeToSeq_D = {
            re.search("<H>(.+?)</H>", Seq).group(1):
            re.search("<S>(.+?)</S>", Seq).group(1)
            for Seq in re.findall("<Seq>.+?</Seq>", ReportText)
        }
        self.BranchToAlgorithm_D = {
            re.search("<Branch_name>(.+?)</Branch_name>", Branch).group(1):
            ScopeAlgorithm(Branch)
            for Branch in re.findall("<Branch>.+?</Branch>", ReportText,
                                     re.DOTALL)
        }
        self.RectCount = 0

        # dimensions
        self.TreeFigWIDTH = 750
        self.TreeFigHEIGHT = 500
        self.TreeFigXOffset = 25
        self.TreeFigYOffset = 50

        # loads and parses tree, gets evolutionary distances for proper
        # branch lengths
        self.CogentTree = LoadTree(self.TreePATH)
        self.FastMLTree = FastMLTree(self.TreePATH, False)
        self.FastMLTree.setBranchLengths()
        self.LongestDistance = self.getLongestEvoDistance()
        self.EvoDistance_D = {
            Key: self.getEvoDistance(Key)
            for Key in self.NodeToSeq_D.keys()
            if Key != self.FastMLTree.TopKey
        }
        self.EvoDistance_D[self.FastMLTree.TopKey] = 0.0
        self.ModdedEvoDistance_D = self.modEvoDistance()
        self.TreeCoords_D = self.setTreeCoords()

        # finds the leaf whose label reaches furthest to the right
        # (12 units are budgeted per label character)
        FurthestPosition = 0.0
        FurthestClade = ""
        for Key in self.FastMLTree.LeafKey_L:
            Val = self.TreeCoords_D[Key][0] + (12 * len(Key))
            if Val > FurthestPosition:
                FurthestPosition = Val
                FurthestClade = Key

        # the branch that ends at the derived node of interest; when several
        # branch keys match, the last one wins
        self.BranchoInterest = ""
        for Key in self.FastMLTree.BranchKey_L:
            if Key.split(">>")[1] == self.DerivedoInterest:
                self.BranchoInterest = Key

        # gets all relevant information for the states portion of the figure
        MutationKeys = self.BranchToAlgorithm_D[
            self.BranchoInterest].getAllMutationXMLKeysAccordingToAccession(
                self.PDBoInterest)
        # keys are converted to 0-based indices into the node sequences
        self.StateIndices_L = [int(X) - 1 for X in MutationKeys]
        self.LeafStates_D = {
            Key: [self.NodeToSeq_D[Key][state]
                  for state in self.StateIndices_L]
            for Key in self.FastMLTree.LeafKey_L
        }
        self.StateColour_D = self.getStateToHex()
        self.StateInc = 25.0
        self.StateFigHEIGHT = 500
        self.StateFigWIDTH = self.StateInc * (len(self.StateIndices_L)) + 50
        self.StateFigXOffset = self.TreeFigXOffset + self.TreeFigWIDTH + (
            12 * len(FurthestClade)) + 25
        self.StateFigYOffset = 50

        # creates the states and tree figure
        self.FigureSVG_D["TreeAndStates"].append(
            self.getSVGHeader(
                self.TreeFigHEIGHT + (self.TreeFigYOffset * 2),
                self.StateFigXOffset + self.StateFigWIDTH +
                self.TreeFigXOffset))
        self.makeTreeFig()
        self.makeStatesFig()
        self.FigureSVG_D["TreeAndStates"].append("</svg>")
        self.TreeAndStatesFOutPATH = self.OutputDirectory + "TreeAndStates.png"
        self._writePNG("TreeAndStates", self.TreeAndStatesFOutPATH)

        LongestCladeName = ""
        for Key in self.FastMLTree.LeafKey_L:
            if len(Key) > len(LongestCladeName):
                LongestCladeName = Key

        # gets all relevant information for the alignment cartoon portion of
        # the figure
        self.MatrixInfo = self.parseScoringMatrix()
        self.AlnInc = 11.0
        self.AlignmentFigWIDTH = self.AlnInc * len(
            self.MatrixInfo["Sseq"]) + self.AlnInc + (
                8 * len(LongestCladeName))
        self.AlignmentFigHEIGHT = self.AlnInc * (
            len(self.FastMLTree.LeafKey_L) + 1) + self.AlnInc
        self.AlignmentFigXOffset = self.AlnInc
        self.AlignmentFigYOffset = self.AlnInc
        self.FigureSVG_D["Alignment"].append(
            self.getSVGHeader(self.AlignmentFigHEIGHT,
                              self.AlignmentFigWIDTH))
        self.makeAlignmentFig()
        self.FigureSVG_D["Alignment"].append("</svg>")
        self.AlignmentFOutPATH = self.OutputDirectory + "Alignment.png"
        self._writePNG("Alignment", self.AlignmentFOutPATH)

        # relevant information for the structure file in PDB format
        self.ColouredStructureFile = self.getColoredStructureFile()
        self.StructureFOutPATH = self.OutputDirectory + "Structure.pdb"
        with open(self.StructureFOutPATH, "w") as StructureFOut:
            StructureFOut.write(self.ColouredStructureFile.read())

        # combined figure referencing both PNGs; it is only assembled in
        # memory here and not written anywhere by this constructor
        self.TotalFigWIDTH = 1000
        self.TotalFigHEIGHT = 600
        self.TotalElement_L = [
            self.getSVGHeader(self.TotalFigHEIGHT, self.TotalFigWIDTH)
        ]
        self.TotalElement_L.append(
            '''\t<image x="0" y="0" width="1000" height="500" xlink:href="file://%s"/>'''
            % (self.TreeAndStatesFOutPATH))
        self.TotalElement_L.append(
            '''\t<image x="0" y="500" width="1000" height="100" xlink:href="file://%s"/>'''
            % (self.AlignmentFOutPATH))
        self.TotalElement_L.append("</svg>")

    def _writePNG(self, FigureKey, FOutPATH):
        """Render the SVG lines stored under FigureKey to a PNG at FOutPATH."""
        # PNG is binary data, so the handle must be opened in binary mode
        with open(FOutPATH, "wb") as FOut:
            cairosvg.svg2png(
                bytestring="\n".join(self.FigureSVG_D[FigureKey]),
                write_to=FOut)

    def getSVGHeader(self, FrameHEIGHT, FrameWIDTH):
        """Return the header for any SVG format file.

        Note the argument order: height first, then width.
        """
        # NOTE(review): the whitespace inside this literal is reproduced as it
        # appears in the reviewed text; any original line breaks between the
        # three declarations could not be recovered (XML treats them alike).
        return """<?xml version="1.0" standalone="no"?> <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> <svg xmlns:xlink="http://www.w3.org/1999/xlink" xmlns='http://www.w3.org/2000/svg' version='1.1' width='%s' height='%s'> """ % (str(FrameWIDTH), str(FrameHEIGHT))

    def getStateToHex(self):
        """Return a dict mapping each amino acid character to its background
        colour (hex RGB without the leading '#')."""
        return {
            "A": "80B3E6", "C": "E68080", "D": "CC4DCC", "E": "CC4DCC",
            "F": "80B3E6", "G": "E6994D", "H": "1AB3B3", "I": "80B3E6",
            "K": "E6331A", "L": "80B3E6", "M": "80B3E6", "N": "1ACC1A",
            "P": "CCCC00", "Q": "1ACC1A", "R": "E6331A", "S": "1ACC1A",
            "T": "1ACC1A", "V": "80B3E6", "W": "80B3E6", "Y": "1AB3B3",
            "-": "FFFFFF", "X": "FFFFFF",
        }

    def getEvoDistance(self, startingToNodeKey):
        """Return the total evolutionary distance from the origin to a node.

        Walks up the tree from ``startingToNodeKey`` summing
        ``FastMLTree.BranchLength_D`` until ``FastMLTree.TopKey`` is reached.

        Raises:
            ValueError: a node other than the top node has no incoming
                branch (previously this case never terminated).
        """
        Tree = self.FastMLTree
        distance = 0.0
        ToNodeKey = startingToNodeKey
        while True:
            distance += Tree.BranchLength_D[ToNodeKey]
            # Literal suffix comparison: node names may contain characters
            # such as "." that a regular expression would treat as wildcards.
            Suffix = ">>" + ToNodeKey
            ParentKey = None
            for BranchKey in Tree.BranchKey_L:
                if BranchKey.endswith(Suffix):
                    ParentKey = BranchKey.split(">>")[0]
                    break
            if ParentKey is None:
                if ToNodeKey == Tree.TopKey:
                    return distance
                raise ValueError(
                    "no branch leads to node %r; cannot reach the top node"
                    % (ToNodeKey,))
            ToNodeKey = ParentKey
            if ToNodeKey == Tree.TopKey:
                return distance

    def getLongestEvoDistance(self):
        """Return the largest leaf-to-origin evolutionary distance
        (0.0 when there are no leaves)."""
        return max([0.0] + [
            self.getEvoDistance(LeafKey)
            for LeafKey in self.FastMLTree.LeafKey_L
        ])

    def modEvoDistance(self):
        """Return a copy of EvoDistance_D.

        Every branch of the original conditional stored the distance
        unchanged, so this is a plain shallow copy.
        """
        return dict(self.EvoDistance_D)

    def setTreeCoords(self):
        """Set tree node coordinates (horizontal and vertical) for the SVG.

        The vertical position of a node is the index of the line on which its
        name appears in the tree's ASCII art; the horizontal position is its
        evolutionary distance scaled to the figure width.

        Returns:
            Dict: node name -> [x, y].
        """
        Name_RE = re.compile(r"[a-zA-Z0-9_\.@]+")
        MaxVert = 0
        VertCoord_D = {}
        # NOTE(review): MaxVert is taken as the index of the last line that
        # holds a name; the source indentation was lost, confirm this nesting.
        for i, Line in enumerate(self.CogentTree.asciiArt().split("\n")):
            for Leaf in Name_RE.findall(Line):
                VertCoord_D[Leaf] = i
                MaxVert = i
        TreeCoords_D = {}
        for Key in self.NodeToSeq_D.keys():
            X = (self.ModdedEvoDistance_D[Key] / self.LongestDistance
                 ) * self.TreeFigWIDTH + self.TreeFigXOffset
            Y = float(float(VertCoord_D[Key]) / float(MaxVert)
                      ) * self.TreeFigHEIGHT + self.TreeFigYOffset
            TreeCoords_D[Key] = [X, Y]
        return TreeCoords_D

    def addNodeNamesAtNodePoints(self):
        """Add the name of every leaf at that leaf's coordinates."""
        for Key in self.FastMLTree.LeafKey_L:
            xy = self.TreeCoords_D[Key]
            self.FigureSVG_D["TreeAndStates"].append(
                '''\t<text x='%s' y='%s' text-anchor='left' font-size='20' font-family='Courier' style="fill: #000000;" >%s</text>'''
                % (str(xy[0]), str(xy[1]), Key))

    def _addBranchLine(self, branchKey, x1, y1, x2, y2):
        """Append one tree line; the branch of interest is red, others black."""
        if branchKey == self.BranchoInterest:
            RGB = "255,0,0"
        else:
            RGB = "0,0,0"
        self.FigureSVG_D["TreeAndStates"].append(
            '''\t<line class='axis' x1='%s' y1='%s' x2='%s' y2='%s' style="stroke:rgb(%s);stroke-width:1 " />'''
            % (str(x1), str(y1), str(x2), str(y2), RGB))

    def addVerticalLines(self):
        """Add the vertical lines of the tree image.

        Each line runs at the parent's x from the parent's y to the child's y.
        """
        for branchKey in self.FastMLTree.BranchKey_L:
            fro, to = branchKey.split(">>")[0], branchKey.split(">>")[1]
            froXY = self.TreeCoords_D[fro]
            toXY = self.TreeCoords_D[to]
            self._addBranchLine(branchKey, froXY[0], froXY[1], froXY[0],
                                toXY[1])

    def addHorizontalLines(self):
        """Add the horizontal lines of the tree image.

        Each line runs at the child's y from the parent's x to the child's x.
        """
        for branchKey in self.FastMLTree.BranchKey_L:
            fro, to = branchKey.split(">>")[0], branchKey.split(">>")[1]
            froXY = self.TreeCoords_D[fro]
            toXY = self.TreeCoords_D[to]
            self._addBranchLine(branchKey, froXY[0], toXY[1], toXY[0],
                                toXY[1])

    def makeTreeFig(self):
        """Run all methods necessary to make the tree image."""
        self.addNodeNamesAtNodePoints()
        self.addVerticalLines()
        self.addHorizontalLines()

    def addStateRows(self):
        """Add the column labels and one row of coloured state cells per leaf
        for the mutated sites."""
        inc = self.StateInc
        vertInc = float(self.StateFigHEIGHT / float(len(self.LeafStates_D)))
        # smallest y of any tree node (infinity when there are none)
        lowestY = min([float("inf")] +
                      [XY[1] for XY in self.TreeCoords_D.values()])
        # column labels (1-based site numbers) sit above the topmost node
        stateY = lowestY - (1.5 * vertInc)
        stateX = 0.0 + self.StateFigXOffset
        for i in self.StateIndices_L:
            self.FigureSVG_D["TreeAndStates"].append(
                '''\t<text x='%s' y='%s' text-anchor='middle' font-size='16' font-family='Courier' transform="rotate(90, %s, %s)" style="fill: #000000;" >%s</text>'''
                % (str(stateX), str(stateY), str(stateX), str(stateY),
                   str(i + 1)))
            stateX += inc
        for Key in self.LeafStates_D.keys():
            X = 0.0 + self.StateFigXOffset
            Y = self.TreeCoords_D[Key][1]
            for State in self.LeafStates_D[Key]:
                RectX = X - (float(inc / 2.0))
                RectY = Y - (float(vertInc / 2.0)) - 5.0
                self.FigureSVG_D["TreeAndStates"].append(
                    '''\t<rect class='r%s' x='%s' y='%s' width='%s' height='%s' style="fill:#%s" />'''
                    % (str(self.RectCount), str(RectX), str(RectY),
                       str(inc), str(vertInc), self.StateColour_D[State]))
                self.FigureSVG_D["TreeAndStates"].append(
                    '''\t<text x='%s' y='%s' font-size='20' font-family='Courier' text-anchor='middle' style="fill: #000000;" >%s</text>'''
                    % (str(X), str(Y), State))
                X += inc

    def makeStatesFig(self):
        """Execute the method that makes the states figure."""
        self.addStateRows()

    def parseScoringMatrix(self):
        """Parse the scoring matrix for the alignment to the PDB sequence.

        Uses the first <PDB_alignment> whose ID (the part of <PDB_id> before
        "|") equals the upper-cased PDB of interest, and stores the
        lower-cased chain in ``self.ChainoInterest``.

        Returns:
            Dict: "Qstart", "Qend", "Sstart", "Send" (the file's values
            minus one) and "Sseq" (the aligned subject sequence).

        Raises:
            ValueError: no alignment for the PDB of interest was found.
        """
        with open(self.MatrixPATH, "r") as MatrixFIn:
            allAlignments_L = re.findall(
                "<PDB_alignment>.+?</PDB_alignment>", MatrixFIn.read(),
                re.DOTALL)
        KeyAln = None
        for Alignment in allAlignments_L:
            IDParts = re.search("<PDB_id>(.+?)</PDB_id>",
                                Alignment).group(1).split("|")
            if self.PDBoInterest.upper() == IDParts[0]:
                KeyAln = Alignment
                self.ChainoInterest = IDParts[1].lower()
                break
        if KeyAln is None:
            raise ValueError("no <PDB_alignment> for %r in %s" %
                             (self.PDBoInterest, self.MatrixPATH))

        def tagText(Tag):
            return re.search("<%s>(.+?)</%s>" % (Tag, Tag), KeyAln).group(1)

        return {
            "Qstart": int(tagText("Alignment_start_query")) - 1,
            "Qend": int(tagText("Alignment_end_query")) - 1,
            "Sstart": int(tagText("Alignment_start_subject")) - 1,
            "Send": int(tagText("Alignment_end_subject")) - 1,
            "Sseq": tagText("Aligned_subject_sequence"),
        }

    def makeAlignmentFig(self):
        """Make the cartoon of all aligned sequences in the protein family.

        The first row is the aligned PDB sequence; each following row is a
        leaf sequence sliced from "Qstart" for the length of the PDB row.
        The row label is drawn to the right of the cells.
        """
        Start = self.MatrixInfo["Qstart"]
        End = Start + len(self.MatrixInfo["Sseq"])
        AllSeqs_L = [self.MatrixInfo["Sseq"]] + [
            self.NodeToSeq_D[Key][Start:End]
            for Key in self.FastMLTree.LeafKey_L
        ]
        AllHeaders_L = [self.PDBoInterest] + self.FastMLTree.LeafKey_L
        xinc = self.AlnInc
        yinc = self.AlnInc
        Y = self.AlignmentFigYOffset
        for Header, Seq in zip(AllHeaders_L, AllSeqs_L):
            X = 0.0 + self.AlignmentFigXOffset
            for State in Seq:
                RectX = X - (float(xinc / 2.0))
                RectY = Y - (float(yinc / 2.0)) - 5.0
                self.FigureSVG_D["Alignment"].append(
                    '''\t<rect class='r%s' x='%s' y='%s' width='%s' height='%s' style="fill:#%s" />'''
                    % (str(self.RectCount), str(RectX), str(RectY),
                       str(xinc), str(yinc), self.StateColour_D[State]))
                self.FigureSVG_D["Alignment"].append(
                    '''\t<text x='%s' y='%s' text-anchor='middle' font-size='10' font-family='Courier' style="fill: #000000;" >%s</text>'''
                    % (str(X), str(Y), State))
                X += xinc
            self.FigureSVG_D["Alignment"].append(
                '''\t<text x='%s' y='%s' text-anchor='left' font-size='10' font-family='Courier' style="fill: #000000;" >%s</text>'''
                % (str(X + self.AlnInc), str(Y), Header))
            Y += yinc

    def getColoredStructureFile(self):
        """Get a PDB format file coloured to reflect mutated sites.

        Returns:
            The handle from getOutputTempFile() after the ScopeAlgorithm of
            the branch of interest has written to ``handle.name``.
        """
        # the last branch whose derived node matches wins
        DesiredBranchKey = ""
        for BranchKey in self.FastMLTree.BranchKey_L:
            if BranchKey.split(">>")[1] == self.DerivedoInterest:
                DesiredBranchKey = BranchKey
        PDBAndPDBXMLContents = getAllPDBFileDicts([self.PDBoInterest])
        SA = self.BranchToAlgorithm_D[DesiredBranchKey]
        SA.PDBContents_D = PDBAndPDBXMLContents[0]
        SA.PDBXMLContents_D = PDBAndPDBXMLContents[1]
        FH = getOutputTempFile()
        SA.createPDBColoredFile(self.PDBoInterest, FH.name)
        return FH
class ScopeAlgorithmTreeSet:
    """Computes P-values for every branch / PDB alignment of one tree.

    Constructing an instance runs the whole pipeline: it parses the report,
    tree and scoring-matrix files, builds the random distributions, derives
    the SAS and distance P-values and writes them to PValues.txt.
    """

    # number of random draws averaged into each random distribution
    N_RANDOM_DRAWS = 10000

    def __init__(self, Directory, DataDIR):
        """Run the full analysis for one protein family directory.

        Args:
            Directory (String): directory to read the report files from and
                to write the final PValue file to.
            DataDIR (String): directory where the main program is held; it
                is concatenated directly with "misc/...", so it is expected
                to end with "/".

        Class attributes:
            DataDIR (String): see above
            Directory (String): see above, without a trailing "/"
            ProteinFamilyName (String): last path component of Directory
            ScopeXMLFile (String): Path to report (mutation mapping) file
            ModdedTreeFile (String): Path to newick syntax tree file
            ScoringMatrixXMLFile (String): Path to the scoring matrix XML file
            PDBToEvalue_D (Dict): Key is "<pdb id>|<chain>", Value is E-value
            HydroPATH (String): Path to hydropathyindex file
            MassPATH (String): Path to sidechainmass file
            Hydro_D (Dict): Key is first column of HydroPATH, Value is the
                second column as a float
            Mass_D (Dict): same as Hydro_D, read from MassPATH
            Tree (FastMLTree obj): Tree object built from ModdedTreeFile
            NodeSequenceKey_L (List): names of all nodes
            NodeToSequence_D (Dict): Key is the node name, Value is an
                FASequence built from that node's sequence
            BranchToAlgorithm_D (Dict): Key is the Branch key name, Value is
                a ScopeAlgorithm instance for that branch segment
            PDBContents_D / PDBXMLContents_D (Dict): the two results of
                getAllPDBFileDicts for every PDB ID named by any branch
            ScoringMatrixCoverageKeys_D (Dict): Key is the coverage ID,
                Value is its list of covered keys
            ScoringMatrixPDBXMLMatchedKeys_D (Dict): covered keys that are
                also present in the PDBXML residue dictionary
            AccsToMutationCount_D (Dict): Key is PDB ID, Value is the list
                of distinct per-branch mutation counts (each at least 2)
            MassChanges_L (List): squared mass changes of all mutations
            HydroChanges_L (List): squared hydropathy changes of all mutations
            RandomDistributions_D (Dict): criterion -> PDB ID -> str(count)
                -> random distribution (or None when it could not be built)
            BranchToPValues_D (Dict): branch -> PDB ID -> {"SAS", "Dist"}
        """
        self.DataDIR = DataDIR

        # gets input/output directory and protein family name
        self.Directory = Directory
        if self.Directory.endswith("/"):
            self.Directory = self.Directory[:-1]
        # with no "/" present split() yields the whole string
        self.ProteinFamilyName = self.Directory.split("/")[-1]

        # gets all path information for the relevant input files
        self.ScopeXMLFile = self.Directory + "/" + "Report.xml"
        self.ModdedTreeFile = self.Directory + "/" + "ModdedTree.nwk"
        self.ScoringMatrixXMLFile = self.Directory + "/" + "ScoringMatrix.xml"
        self.PDBToEvalue_D = self.getPDBToEvalue_D()

        # paths to more input files
        self.HydroPATH = self.DataDIR + "misc/hydropathyindex"
        self.MassPATH = self.DataDIR + "misc/sidechainmass"

        # makes a dictionary out of hydropathy index and mass input files
        self.Hydro_D = self._readTwoColumnTable(self.HydroPATH)
        self.Mass_D = self._readTwoColumnTable(self.MassPATH)

        # create the FastMLTree object and set branch lengths
        self.Tree = FastMLTree(self.ModdedTreeFile, False)
        self.Tree.setBranchLengths()

        # parse out sequence information
        NodeToSequence_LD = self.getNodeToSequence_LD()
        self.NodeSequenceKey_L = NodeToSequence_LD[0]
        self.NodeToSequence_D = NodeToSequence_LD[1]

        # parse out report XML file and create ScopeAlgorithm instances
        self.BranchToAlgorithm_D = self.getBranchToAlgorithm_D()

        # collect every PDB ID named by any branch, keeping first-seen order
        PDB_L = []
        SeenPDBs = set()
        PDBs_RE = re.compile("<PDBs>(.+?)</PDBs>")
        for BranchKey in self.BranchToAlgorithm_D.keys():
            AccKeysSearch = PDBs_RE.search(
                self.BranchToAlgorithm_D[BranchKey].alignmentSet)
            if AccKeysSearch:
                for AccKey in AccKeysSearch.group(1).split(";"):
                    if AccKey not in SeenPDBs:
                        SeenPDBs.add(AccKey)
                        PDB_L.append(AccKey)

        # set PDB content dictionary and PDBXML content dictionary
        PDBAndPDBXMLContents_Dicts = getAllPDBFileDicts(PDB_L)
        self.PDBContents_D = PDBAndPDBXMLContents_Dicts[0]
        self.PDBXMLContents_D = PDBAndPDBXMLContents_Dicts[1]
        for Key in self.BranchToAlgorithm_D.keys():
            self.setPDBAndPDBXMLContentDictionaries(
                self.BranchToAlgorithm_D[Key])

        # parses out ScoringMatrixCoverage file
        self.ScoringMatrixCoverageKeys_D = self.getScoringMatrixCoverageKeys()
        self.ScoringMatrixPDBXMLMatchedKeys_D = \
            self.getScoringMatrixPDBXMLMatchedKeys_D()

        # gets the indices for PDB IDs to be used in SAS and distance random
        # distribution generation
        self.AccsToMutationCount_D = self.getNCoveredMutations_D()

        # get list of mass and hydropathy index change values
        BranchSegmentMutations_L = self.getAllBranchSegmentMutations()
        self.MassChanges_L = BranchSegmentMutations_L[0]
        self.HydroChanges_L = BranchSegmentMutations_L[1]

        # create all random distributions
        self.RandomDistributions_D = self.getRandomDistributions_D()
        # get all PValues
        self.BranchToPValues_D = self.getAllBranchSegmentPValues()
        # output to PValue file
        self.output()

    def _readTwoColumnTable(self, Path):
        """Read a whitespace-separated file into {first column: float(second)}.

        Blank lines are skipped (they previously raised IndexError).
        """
        Table = {}
        with open(Path, "r") as FIn:
            for line in FIn:
                Fields = line.split()
                if not Fields:
                    continue
                Table[Fields[0]] = float(Fields[1])
        return Table

    def getPDBToEvalue_D(self):
        """Return {"<lower-cased pdb id>|<chain>": E-value} for every
        <PDB_alignment> in the scoring matrix file."""
        Ret = {}
        with open(self.ScoringMatrixXMLFile, "r") as FIn:
            allAlignments_L = re.findall(
                "<PDB_alignment>.+?</PDB_alignment>", FIn.read(), re.DOTALL)
        for aln in allAlignments_L:
            pdbid = re.search("<PDB_id>(.+?)</PDB_id>", aln).group(1)
            e = re.search("<E_value>(.+?)</E_value>", aln).group(1)
            acc = pdbid.split("|")[0].lower()
            chain = pdbid.split("|")[1]
            Ret[acc + "|" + chain] = float(e)
        return Ret

    def getNodeToSequence_LD(self):
        """Get list and dictionary of node keys to sequences.

        Returns:
            List: [list of node keys in file order, {node key: FASequence}].
        """
        ret = {}
        retKey_L = []
        # parses all Seq entries in the xml report file, and makes a
        # dictionary and list entry for each one
        with open(self.ScopeXMLFile, "r") as FIn:
            AllSequences = re.findall("<Seq>.+?</Seq>", FIn.read())
        for Seq in AllSequences:
            SeqKey = re.search("<H>(.+?)</H>", Seq).group(1)
            ret[SeqKey] = FASequence(
                SeqKey, re.search("<S>(.+?)</S>", Seq).group(1))
            retKey_L.append(SeqKey)
        return [retKey_L, ret]

    def getBranchToAlgorithm_D(self):
        """Get dictionary of branch keys to the ScopeAlgorithm objects
        representing those branches."""
        ret = {}
        # finds all branches, parses out the branch name and uses it as key
        with open(self.ScopeXMLFile, "r") as FIn:
            AllBranches = re.findall("<Branch>.+?</Branch>", FIn.read(),
                                     re.DOTALL)
        for Branch in AllBranches:
            BranchKey = re.search("<Branch_name>(.+?)</Branch_name>",
                                  Branch).group(1)
            ret[BranchKey] = ScopeAlgorithm(Branch)
        return ret

    def setPDBAndPDBXMLContentDictionaries(self, SA):
        """Set the ScopeAlgorithm instance's PDBContents_D and
        PDBXMLContents_D to this object's dictionaries (shared, not copied)."""
        SA.PDBContents_D = self.PDBContents_D
        SA.PDBXMLContents_D = self.PDBXMLContents_D

    def getScoringMatrixCoverageKeys(self):
        """Get dictionary of covered residues in each PDB accession.

        Returns:
            Dict: <ID> text -> list made by splitting <Keys> on ",", for
            every <Coverage> inside the file's <Coverages> element.
        """
        ret = {}
        with open(self.ScoringMatrixXMLFile, "r") as FIn:
            Coverages = re.compile("(<Coverages>.+?</Coverages>)",
                                   re.DOTALL).search(FIn.read()).group(1)
        for CoverageKeySet in re.findall("<Coverage>.+?</Coverage>",
                                         Coverages):
            ID = re.search("<ID>(.+?)</ID>", CoverageKeySet).group(1)
            ret[ID] = re.search("<Keys>(.+?)</Keys>",
                                CoverageKeySet).group(1).split(",")
        return ret

    def getScoringMatrixPDBXMLMatchedKeys_D(self):
        """Get dictionary of residues that are covered by the alignment and
        also present in the parsed PDBXML residue dictionary."""
        Ret = {}
        for AccKey in self.PDBXMLContents_D.keys():
            # build the lookup set once per accession rather than once per
            # coverage key
            ResidueKeys = set(
                self.PDBXMLContents_D[AccKey]["XMLResidue_D"].keys())
            Ret[AccKey] = [
                PosKey for PosKey in self.ScoringMatrixCoverageKeys_D[AccKey]
                if PosKey in ResidueKeys
            ]
        return Ret

    def getNCoveredMutations_D(self):
        """Get dictionary of PDB IDs to the mutation counts used to draw
        random distributions.

        Side effects: for every branch with mutations present, sets
        ``SA.AccsToMutationCount`` ({PDB ID: covered mutation count}) and
        ``SA.AccsToKey_D`` ({PDB ID: list of positions}).

        Returns:
            Dict: PDB ID -> list of distinct per-branch counts; a count is
            only recorded when it is at least 2.
        """
        AccsToMutationCount_D = {}
        for BranchKey in self.BranchToAlgorithm_D.keys():
            SA = self.BranchToAlgorithm_D[BranchKey]
            if not SA.mutationsPresent:
                continue
            AccsToMutationCountForSingleSA_D = {}
            AccsToKey_D = {}
            for MutationXMLKey in SA.mutationsXMLkey_L:
                # only mutations with coverage are counted
                if not SA.mutationScore_D[MutationXMLKey]["Coverage"]:
                    continue
                AllAccPos = SA.getAccessionPosition_L(
                    SA.mutationsXML_D[MutationXMLKey])
                for AccPos in AllAccPos:
                    Acc = AccPos[0]
                    Pos = AccPos[1]
                    if Acc not in AccsToMutationCount_D:
                        AccsToMutationCount_D[Acc] = []
                    if Acc not in AccsToMutationCountForSingleSA_D:
                        AccsToMutationCountForSingleSA_D[Acc] = 0
                        AccsToKey_D[Acc] = []
                    # adds one count to that PDB ID
                    AccsToMutationCountForSingleSA_D[Acc] += 1
                    AccsToKey_D[Acc].append(Pos)
            SA.AccsToMutationCount = AccsToMutationCountForSingleSA_D
            SA.AccsToKey_D = AccsToKey_D
            # adds the count to the overall dictionary if it is new and
            # represents at least 2 mutations
            for Key in AccsToMutationCountForSingleSA_D.keys():
                Count = AccsToMutationCountForSingleSA_D[Key]
                if Count >= 2 and Count not in AccsToMutationCount_D[Key]:
                    AccsToMutationCount_D[Key].append(Count)
        return AccsToMutationCount_D

    def getAllBranchSegmentMutations(self):
        """Get mass and hydropathy index change lists for all covered
        mutations that have occurred in the tree.

        Returns:
            List: [mass changes, hydropathy changes].
        """
        # gets all covered mutations from all branches with mutations
        Mut_L = []
        for SA in self.BranchToAlgorithm_D.values():
            if not SA.mutationsPresent:
                continue
            for MutationXMLKey in SA.mutationsXMLkey_L:
                if SA.mutationScore_D[MutationXMLKey]["Coverage"]:
                    Mut_L.append(
                        SA.getMutationType(SA.mutationsXML_D[MutationXMLKey]))
        HydroRet = [self.getHydroDif(Mut[0], Mut[1]) for Mut in Mut_L]
        MassRet = [self.getMassDif(Mut[0], Mut[1]) for Mut in Mut_L]
        return [MassRet, HydroRet]

    ##########################################################################

    def getMassDif(self, StateA, StateB):
        """Get squared difference in Mass_D values between two states."""
        return math.pow(self.Mass_D[StateA] - self.Mass_D[StateB], 2)

    def getHydroDif(self, StateA, StateB):
        """Get squared difference in Hydro_D values between two states."""
        return math.pow(self.Hydro_D[StateA] - self.Hydro_D[StateB], 2)

    def getCombinatorialListOfPairwiseDistances(self, AccKey, AccsA_L):
        """Get distances for all unordered pairs of sites in AccsA_L.

        Pairs for which either PDBXML line or either alpha-carbon point is
        falsy are skipped.
        """
        Distances_L = []
        # any ScopeAlgorithm instance will do; the first one is used
        SA = list(self.BranchToAlgorithm_D.values())[0]
        for i, AccA in enumerate(AccsA_L):
            for AccB in AccsA_L[i + 1:]:
                APDBXMLLine = SA.getPDBXMLLine(AccKey, AccA)
                BPDBXMLLine = SA.getPDBXMLLine(AccKey, AccB)
                if APDBXMLLine and BPDBXMLLine:
                    APoint = SA.getAlphaCarbonPoint(APDBXMLLine)
                    BPoint = SA.getAlphaCarbonPoint(BPDBXMLLine)
                    if APoint and BPoint:
                        Distances_L.append(
                            SA.getDistanceMagnitude(APoint, BPoint))
        return Distances_L

    def getAnyAverageRandomDist(self, AveragedNumbers_L):
        """Build a gaussian_kde from the non-NaN numbers in the list."""
        FinalNumbers_L = [
            AveragedNumber for AveragedNumber in AveragedNumbers_L
            if not math.isnan(AveragedNumber)
        ]
        return gaussian_kde(array(FinalNumbers_L))

    def getAveragedData(self, Numbers_L):
        """Get the mean of the numbers in a list, ignoring None entries."""
        return numpy.mean(
            [Number for Number in Numbers_L if Number is not None])

    def getRandomSampleOfIntegers(self, MaxLength, Index):
        """Get Index distinct random integers drawn from range(MaxLength)."""
        return random.sample(range(MaxLength), Index)

    def getRelativeSASRandDistForIndex(self, AccKey, Index):
        """Get SAS random distribution for a single PDB ID and index.

        Averages the relative global SAS of ``Index`` randomly chosen matched
        residues, N_RANDOM_DRAWS times.

        Returns:
            The distribution, or None when anything failed (the error is
            printed; this step is deliberately best-effort).
        """
        Ret = None
        try:
            # hoisted out of the draw loop: none of these change per draw
            SA = list(self.BranchToAlgorithm_D.values())[0]
            Residue_D = self.PDBXMLContents_D[AccKey]["XMLResidue_D"]
            MatchedKeys_L = self.ScoringMatrixPDBXMLMatchedKeys_D[AccKey]
            Averages_L = []
            for _ in range(0, self.N_RANDOM_DRAWS):
                Sample = self.getRandomSampleOfIntegers(
                    len(MatchedKeys_L), Index)
                Averages_L.append(self.getAveragedData([
                    SA.getRelativeGlobalSAS(Residue_D[MatchedKeys_L[xIndex]])
                    for xIndex in Sample
                ]))
            Ret = self.getAnyAverageRandomDist(Averages_L)
        except Exception as e:
            print(e)
        return Ret

    def getRelativeDistanceRandDistForIndex(self, AccKey, Index):
        """Get distance random distribution for a single PDB ID and index.

        Averages the pairwise distances among ``Index`` randomly chosen
        matched residues, N_RANDOM_DRAWS times.

        Returns:
            The distribution, or None when anything failed (the error is
            printed; this step is deliberately best-effort).
        """
        Ret = None
        try:
            MatchedKeys_L = self.ScoringMatrixPDBXMLMatchedKeys_D[AccKey]
            Averages_L = []
            for _ in range(0, self.N_RANDOM_DRAWS):
                Sample = self.getRandomSampleOfIntegers(
                    len(MatchedKeys_L), Index)
                Averages_L.append(self.getAveragedData(
                    self.getCombinatorialListOfPairwiseDistances(
                        AccKey,
                        [MatchedKeys_L[xIndex] for xIndex in Sample])))
            Ret = self.getAnyAverageRandomDist(Averages_L)
        except Exception as e:
            print(e)
        return Ret

    def getRelativeSASRandDistAllIndices(self, AccKey, Indices):
        """Get SAS random distributions for all indices for one PDB ID,
        keyed by str(index)."""
        return {
            str(Index): self.getRelativeSASRandDistForIndex(AccKey, Index)
            for Index in Indices
        }

    def getDistanceRandDistAllIndices(self, AccKey, Indices):
        """Get distance random distributions for all indices for one PDB ID,
        keyed by str(index)."""
        return {
            str(Index): self.getRelativeDistanceRandDistForIndex(AccKey, Index)
            for Index in Indices
        }

    def getRandomDistributions_D(self):
        """Get all random distributions for each criterion ("SAS", "Dist"),
        each PDB ID key and each index."""
        return {
            "SAS": {
                AccKey: self.getRelativeSASRandDistAllIndices(
                    AccKey, self.AccsToMutationCount_D[AccKey])
                for AccKey in self.AccsToMutationCount_D.keys()
            },
            "Dist": {
                AccKey: self.getDistanceRandDistAllIndices(
                    AccKey, self.AccsToMutationCount_D[AccKey])
                for AccKey in self.AccsToMutationCount_D.keys()
            },
        }

    ##########################################################################

    def getGeneralPValue(self, Criterion, Acc, Index, Point):
        """General method for retrieving a P-value.

        Returns:
            The distribution's integral from -infinity to Point, or None
            when the distribution's ``n`` is 1 (treated as faulty).
        """
        Distribution = self.RandomDistributions_D[Criterion][Acc][str(Index)]
        if Distribution.n == 1:
            return None
        return Distribution.integrate_box_1d(float("-inf"), Point)

    def getSASPValue(self, SA, AccKey):
        """Get SAS p-value for one ancestral, derived, PDB alignment triad.

        Returns:
            [average relative SAS, P-value], or None when no distribution
            exists for this mutation count, the average is NaN, or an error
            occurred (the error is printed).
        """
        Ret = None
        Count = SA.AccsToMutationCount[AccKey]
        if str(Count) in self.RandomDistributions_D["SAS"][AccKey].keys():
            try:
                RSAS_L = []
                for AccPos in SA.getAllPositionKeysAccordingToAccession(
                        AccKey):
                    pdbxmlLine = SA.getPDBXMLLine(AccPos[0], AccPos[1])
                    if pdbxmlLine:
                        RSAS_L.append(SA.getRelativeGlobalSAS(pdbxmlLine))
                Avg = self.getAveragedData(RSAS_L)
                # NOTE(review): source indentation was lost; the result is
                # assumed to be recorded only when the average is not NaN.
                if not numpy.isnan(Avg):
                    PVal = self.getGeneralPValue("SAS", AccKey, Count, Avg)
                    Ret = [Avg, PVal]
            except Exception as e:
                # reported the same way getDistPValue reports its failures
                print(e)
        return Ret

    def getDistPValue(self, SA, AccKey):
        """Get distance p-value for one ancestral, derived, PDB alignment
        triad.

        Returns:
            [average pairwise distance, P-value], or None when the PDB ID has
            no counts on this branch, no distribution exists for the count,
            the average is NaN, or an error occurred (the error is printed).
        """
        Ret = None
        if AccKey not in SA.AccsToMutationCount:
            return Ret
        Count = SA.AccsToMutationCount[AccKey]
        if str(Count) in self.RandomDistributions_D["Dist"][AccKey].keys():
            try:
                Pairwise_L = self.getCombinatorialListOfPairwiseDistances(
                    AccKey, SA.AccsToKey_D[AccKey])
                Avg = self.getAveragedData(Pairwise_L)
                # NOTE(review): source indentation was lost; the result is
                # assumed to be recorded only when the average is not NaN.
                if not numpy.isnan(Avg):
                    PVal = self.getGeneralPValue("Dist", AccKey, Count, Avg)
                    Ret = [Avg, PVal]
            except Exception as e:
                print(e)
        return Ret

    def getAllPValuesForAccession(self, SA, AccKey):
        """Get the "SAS" and "Dist" results for one ancestral, derived, PDB
        alignment triad."""
        return {
            "SAS": self.getSASPValue(SA, AccKey),
            "Dist": self.getDistPValue(SA, AccKey),
        }

    def getAllPValuesForBranchSegment(self, SA):
        """Get all PValues for one ancestral, derived alignment pair, for
        every PDB ID that has counts on this branch."""
        return {
            Acc: self.getAllPValuesForAccession(SA, Acc)
            for Acc in self.AccsToMutationCount_D.keys()
            if Acc in SA.AccsToMutationCount
        }

    def getAllBranchSegmentPValues(self):
        """Get all PValues for all branches that have mutations present."""
        return {
            BranchKey: self.getAllPValuesForBranchSegment(
                self.BranchToAlgorithm_D[BranchKey])
            for BranchKey in self.BranchToAlgorithm_D.keys()
            if self.BranchToAlgorithm_D[BranchKey].mutationsPresent
        }

    ##########################################################################

    def output(self):
        """Write PValue information to <Directory>/PValues.txt.

        A row is written for a branch / PDB pair only when both the SAS and
        the distance result exist, the average distance is not 0.0 and both
        P-values are available.
        """
        AllOutput_L = ["Branch PDB #M Msas Psas Mdis Pdis"]  # header line
        # for each ancestral, derived, PDB alignment triad
        for BranchKey in self.BranchToPValues_D.keys():
            for AccKey in self.BranchToPValues_D[BranchKey].keys():
                SAS = self.BranchToPValues_D[BranchKey][AccKey]["SAS"]
                Dist = self.BranchToPValues_D[BranchKey][AccKey]["Dist"]
                if not (SAS and Dist):
                    continue
                if Dist[0] == 0.0:
                    continue
                # getGeneralPValue may return None; formatting None with
                # '%.2E' would raise TypeError, so such rows are skipped
                if SAS[1] is None or Dist[1] is None:
                    continue
                NMutations = str(len(
                    self.BranchToAlgorithm_D[BranchKey]
                    .getAllMutationXMLAccordingToAccession(AccKey)))
                # ljust(n) pads exactly like text + " " * (n - len(text))
                AllOutput_L.append(" ".join([
                    BranchKey.ljust(20),
                    AccKey.ljust(5),
                    NMutations.ljust(4),
                    str(round(SAS[0], 3)).ljust(5),
                    ('%.2E' % SAS[1]).ljust(5),
                    str(round(Dist[0], 3)).ljust(5),
                    '%.2E' % Dist[1],
                ]))
        # writes PValues to output file and displays end prompt
        AllPath = "%s/PValues.txt" % (self.Directory)
        with open(AllPath, "w") as w:
            w.write("\n".join(AllOutput_L))
        print("Done.\nAll P-Values written to %s" % (AllPath))
class ExplorePrediction:
    """Draws the figures for one derived node / PDB structure prediction.

    Constructing an instance does all the work: it parses ``Report.xml``,
    ``ModdedTree.nwk`` and ``ScoringMatrix.xml`` found in ``Directory``,
    renders the tree-and-states figure and the alignment figure to PNG, and
    writes the coloured structure file, all into
    ``<Directory>Figures/<DerivedoI>-<PDBoI>/``.

    Class attributes:
        Figures_L (List): list of all the figure types that will be created
        FigureSVG_D (Dict): key is the type of figure, value is the list of
            SVG lines that will draw the figure
        DerivedoInterest (String): Derived node of interest that the figure
            will be based on
        PDBoInterest (String): PDB structure that the derived node sequence
            aligned to and achieved a significant hit on
    """

    # SVG element templates shared by the drawing methods. The stroke colour
    # of a line is the only part that differs between a highlighted and a
    # regular branch, so it is a placeholder.
    _HIGHLIGHT_RGB = "255,0,0"
    _DEFAULT_RGB = "0,0,0"
    _LINE_SVG = '''\t<line class='axis' x1='%s' y1='%s' x2='%s' y2='%s' style="stroke:rgb(%s);stroke-width:1 " />'''
    _RECT_SVG = '''\t<rect class='r%s' x='%s' y='%s' width='%s' height='%s' style="fill:#%s" />'''
    _LEAF_NAME_SVG = '''\t<text x='%s' y='%s' text-anchor='left' font-size='20' font-family='Courier' style="fill: #000000;" >%s</text>'''
    _STATE_INDEX_SVG = '''\t<text x='%s' y='%s' text-anchor='middle' font-size='16' font-family='Courier' transform="rotate(90, %s, %s)" style="fill: #000000;" >%s</text>'''
    _STATE_TEXT_SVG = '''\t<text x='%s' y='%s' font-size='20' font-family='Courier' text-anchor='middle' style="fill: #000000;" >%s</text>'''
    _ALN_TEXT_SVG = '''\t<text x='%s' y='%s' text-anchor='middle' font-size='10' font-family='Courier' style="fill: #000000;" >%s</text>'''
    _ALN_HEADER_SVG = '''\t<text x='%s' y='%s' text-anchor='left' font-size='10' font-family='Courier' style="fill: #000000;" >%s</text>'''

    def __init__(self, Directory, DerivedoI, PDBoI):
        """Parse the inputs in ``Directory`` and write every figure.

        Args:
            Directory: directory holding ``Report.xml``, ``ModdedTree.nwk``
                and ``ScoringMatrix.xml``; a trailing "/" is added if missing.
            DerivedoI: name of the derived node of interest.
            PDBoI: identifier of the PDB structure of interest.

        Raises:
            OSError: if the output directory cannot be created.
        """
        # initial setup of what figures will be created
        self.Figures_L = [
            "TreeAndStates", "Alignment", "Structurecartoon",
            "Structuresurface"
        ]
        self.FigureSVG_D = {Key: [] for Key in self.Figures_L}

        self.Directory = Directory
        if not self.Directory.endswith("/"):
            self.Directory = self.Directory + "/"
        self.DerivedoInterest = DerivedoI
        self.PDBoInterest = PDBoI
        print(self.Directory)
        print(self.DerivedoInterest)
        print(self.PDBoInterest)

        # output directory where files will be written; makedirs also creates
        # the intermediate "Figures/" directory and never goes through a shell
        self.OutputDirectory = "%sFigures/%s-%s/" % (
            self.Directory, self.DerivedoInterest, self.PDBoInterest)
        if not os.path.exists(self.OutputDirectory):
            os.makedirs(self.OutputDirectory)

        # paths to relevant input files
        self.ReportPATH = self.Directory + "Report.xml"
        self.TreePATH = self.Directory + "ModdedTree.nwk"
        self.MatrixPATH = self.Directory + "ScoringMatrix.xml"

        # parses the report file for sequences and branch relationships;
        # the report is read once and the handle is closed
        with open(self.ReportPATH, "r") as ReportFIn:
            ReportText = ReportFIn.read()
        self.NodeToSeq_D = {
            self._tagText("H", Seq): self._tagText("S", Seq)
            for Seq in re.findall("<Seq>.+?</Seq>", ReportText)
        }
        self.BranchToAlgorithm_D = {
            self._tagText("Branch_name", Branch): ScopeAlgorithm(Branch)
            for Branch in re.findall("<Branch>.+?</Branch>", ReportText,
                                     re.DOTALL)
        }

        self.RectCount = 0

        # dimensions
        self.TreeFigWIDTH = 750
        self.TreeFigHEIGHT = 500
        self.TreeFigXOffset = 25
        self.TreeFigYOffset = 50

        # loads and parses tree, gets evolutionary distances for proper
        # branch lengths
        self.CogentTree = LoadTree(self.TreePATH)
        self.FastMLTree = FastMLTree(self.TreePATH, False)
        self.FastMLTree.setBranchLengths()
        self.LongestDistance = self.getLongestEvoDistance()
        TopKey = self.FastMLTree.TopKey
        self.EvoDistance_D = {
            Key: self.getEvoDistance(Key)
            for Key in self.NodeToSeq_D if Key != TopKey
        }
        self.EvoDistance_D[TopKey] = 0.0
        self.ModdedEvoDistance_D = self.modEvoDistance()
        self.TreeCoords_D = self.setTreeCoords()

        # leaf whose label reaches furthest to the right (12 units per
        # character of the name); the states grid is placed beyond it
        FurthestPosition = 0.0
        FurthestClade = ""
        for Key in self.FastMLTree.LeafKey_L:
            Val = self.TreeCoords_D[Key][0] + (12 * len(Key))
            if Val > FurthestPosition:
                FurthestPosition = Val
                FurthestClade = Key

        self.BranchoInterest = self._findBranchOfInterest()

        # gets all relevant information for the states portion of the figure
        BranchAlgorithm = self.BranchToAlgorithm_D[self.BranchoInterest]
        self.StateIndices_L = [
            int(X) - 1 for X in BranchAlgorithm.
            getAllMutationXMLKeysAccordingToAccession(self.PDBoInterest)
        ]
        self.LeafStates_D = {
            Key: [self.NodeToSeq_D[Key][state] for state in self.StateIndices_L]
            for Key in self.FastMLTree.LeafKey_L
        }
        self.StateColour_D = self.getStateToHex()
        self.StateInc = 25.0
        self.StateFigHEIGHT = 500
        self.StateFigWIDTH = self.StateInc * (len(self.StateIndices_L)) + 50
        self.StateFigXOffset = self.TreeFigXOffset + self.TreeFigWIDTH + (
            12 * len(FurthestClade)) + 25
        self.StateFigYOffset = 50

        # creates the states and tree figure
        self.FigureSVG_D["TreeAndStates"].append(
            self.getSVGHeader(
                self.TreeFigHEIGHT + (self.TreeFigYOffset * 2),
                self.StateFigXOffset + self.StateFigWIDTH +
                self.TreeFigXOffset))
        self.makeTreeFig()
        self.makeStatesFig()
        self.FigureSVG_D["TreeAndStates"].append("</svg>")
        self.TreeAndStatesFOutPATH = self.OutputDirectory + "TreeAndStates.png"
        self._writePNG("TreeAndStates", self.TreeAndStatesFOutPATH)

        LongestCladeName = ""
        for Key in self.FastMLTree.LeafKey_L:
            if len(Key) > len(LongestCladeName):
                LongestCladeName = Key

        # gets all relevant information for the alignment cartoon portion of
        # the figure
        self.MatrixInfo = self.parseScoringMatrix()
        self.AlnInc = 11.0
        self.AlignmentFigWIDTH = self.AlnInc * len(
            self.MatrixInfo["Sseq"]) + self.AlnInc + (
                8 * len(LongestCladeName))
        self.AlignmentFigHEIGHT = self.AlnInc * (
            len(self.FastMLTree.LeafKey_L) + 1) + self.AlnInc
        self.AlignmentFigXOffset = self.AlnInc
        self.AlignmentFigYOffset = self.AlnInc
        self.FigureSVG_D["Alignment"].append(
            self.getSVGHeader(self.AlignmentFigHEIGHT, self.AlignmentFigWIDTH))
        self.makeAlignmentFig()
        self.FigureSVG_D["Alignment"].append("</svg>")
        self.AlignmentFOutPATH = self.OutputDirectory + "Alignment.png"
        self._writePNG("Alignment", self.AlignmentFOutPATH)

        # relevant information for the structure file in PDB format
        self.ColouredStructureFile = self.getColoredStructureFile()
        self.StructureFOutPATH = self.OutputDirectory + "Structure.pdb"
        with open(self.StructureFOutPATH, "w") as StructureFOut:
            StructureFOut.write(self.ColouredStructureFile.read())

        # composite figure that references both rendered PNG files
        self.TotalFigWIDTH = 1000
        self.TotalFigHEIGHT = 600
        self.TotalElement_L = [
            self.getSVGHeader(self.TotalFigHEIGHT, self.TotalFigWIDTH)
        ]
        self.TotalElement_L.append(
            '''\t<image x="0" y="0" width="1000" height="500" xlink:href="file://%s"/>'''
            % (self.TreeAndStatesFOutPATH))
        self.TotalElement_L.append(
            '''\t<image x="0" y="500" width="1000" height="100" xlink:href="file://%s"/>'''
            % (self.AlignmentFOutPATH))
        self.TotalElement_L.append("</svg>")

    @staticmethod
    def _tagText(Tag, Text):
        """Return the text of the first ``<Tag>...</Tag>`` element in Text.

        Raises AttributeError when the element is absent, because the failed
        search returns None.
        """
        return re.compile("<%s>(.+?)</%s>" % (Tag, Tag)).search(Text).group(1)

    def _findBranchOfInterest(self):
        """Return the branch key whose derived node is DerivedoInterest.

        Returns "" when no branch matches. There is deliberately no early
        exit: if several branches match, the last one wins.
        """
        Found = ""
        for Key in self.FastMLTree.BranchKey_L:
            if Key.split(">>")[1] == self.DerivedoInterest:
                Found = Key
        return Found

    def _writePNG(self, FigureKey, FOutPATH):
        """Render the SVG lines collected for FigureKey into a PNG file."""
        # PNG is binary data, so the file must not be opened in text mode
        with open(FOutPATH, "wb") as FOut:
            cairosvg.svg2png(
                bytestring="\n".join(self.FigureSVG_D[FigureKey]),
                write_to=FOut)

    def _appendBranchLine(self, branchKey, x1, y1, x2, y2):
        """Append one tree line; the branch of interest is drawn in red."""
        if branchKey == self.BranchoInterest:
            rgb = self._HIGHLIGHT_RGB
        else:
            rgb = self._DEFAULT_RGB
        self.FigureSVG_D["TreeAndStates"].append(
            self._LINE_SVG % (str(x1), str(y1), str(x2), str(y2), rgb))

    def _appendCell(self, FigureKey, X, Y, Width, Height, TextSVG, State):
        """Append a coloured rectangle centred on X plus its state letter."""
        RectX = X - (float(Width / 2.0))
        RectY = Y - (float(Height / 2.0)) - 5.0
        self.FigureSVG_D[FigureKey].append(
            self._RECT_SVG % (str(self.RectCount), str(RectX), str(RectY),
                              str(Width), str(Height),
                              self.StateColour_D[State]))
        self.FigureSVG_D[FigureKey].append(TextSVG % (str(X), str(Y), State))

    def getSVGHeader(self, FrameHEIGHT, FrameWIDTH):
        """Get the header for any SVG format file.

        Note the argument order: height first, width second.
        """
        return ('<?xml version="1.0" standalone="no"?> '
                '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" '
                '"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> '
                '<svg xmlns:xlink="http://www.w3.org/1999/xlink" '
                "xmlns='http://www.w3.org/2000/svg' version='1.1' "
                "width='%s' height='%s'> ") % (str(FrameWIDTH),
                                               str(FrameHEIGHT))

    def getStateToHex(self):
        """Return the dictionary mapping each amino acid character to the
        hex background colour it is drawn with ("-" and "X" are white)."""
        return {
            "A": "80B3E6", "C": "E68080", "D": "CC4DCC", "E": "CC4DCC",
            "F": "80B3E6", "G": "E6994D", "H": "1AB3B3", "I": "80B3E6",
            "K": "E6331A", "L": "80B3E6", "M": "80B3E6", "N": "1ACC1A",
            "P": "CCCC00", "Q": "1ACC1A", "R": "E6331A", "S": "1ACC1A",
            "T": "1ACC1A", "V": "80B3E6", "W": "80B3E6", "Y": "1AB3B3",
            "-": "FFFFFF", "X": "FFFFFF",
        }

    def getEvoDistance(self, startingToNodeKey):
        """Return the total evolutionary distance from the origin to a node.

        Walks from ``startingToNodeKey`` towards ``FastMLTree.TopKey``,
        summing ``FastMLTree.BranchLength_D`` of every node visited before
        the top node is reached.

        Raises:
            KeyError: if a visited node has no entry in BranchLength_D.
            ValueError: if a node other than the top node has no incoming
                branch (previously this looped forever).
        """
        Tree = self.FastMLTree
        distance = 0.0
        ToNodeKey = startingToNodeKey
        while True:
            distance += Tree.BranchLength_D[ToNodeKey]

            # first branch that ends in this node; compared literally so that
            # characters such as "." in node names are not regex wildcards
            suffix = ">>" + ToNodeKey
            parentKey = None
            for BranchKey in Tree.BranchKey_L:
                if BranchKey.endswith(suffix):
                    parentKey = BranchKey.split(">>")[0]
                    break

            if parentKey is None:
                if ToNodeKey == Tree.TopKey:
                    return distance
                raise ValueError(
                    "no branch leads to node %r; cannot reach the top node"
                    % (ToNodeKey, ))
            ToNodeKey = parentKey
            if ToNodeKey == Tree.TopKey:
                return distance

    def getLongestEvoDistance(self):
        """Return the largest evolutionary distance of any leaf from the
        origin (0.0 when there are no leaves)."""
        longestDistance = 0.0
        for LeafKey in self.FastMLTree.LeafKey_L:
            distance = self.getEvoDistance(LeafKey)
            if distance > longestDistance:
                longestDistance = distance
        return longestDistance

    def modEvoDistance(self):
        """Return the evolutionary distances in the form used for drawing.

        Every branch of the original implementation stored the distance
        unchanged, so this is a shallow copy of ``EvoDistance_D``.
        """
        return dict(self.EvoDistance_D)

    def setTreeCoords(self):
        """Set tree node coordinates (horizontal and vertical) for the SVG image.

        The vertical position of a node is the index of the line of the
        tree's ASCII art on which its name appears; the horizontal position
        is its evolutionary distance relative to the longest one.

        Returns:
            Dict mapping every key of ``NodeToSeq_D`` to ``[x, y]``.

        Raises:
            ZeroDivisionError: if ``LongestDistance`` is 0 or names only
                appear on the first line of the ASCII art.
        """
        NamePattern = re.compile(r"([a-zA-Z0-9_\.@]+)")
        MaxVert = 0
        VertCoord_D = {}
        for i, Line in enumerate(self.CogentTree.asciiArt().split("\n")):
            Names = NamePattern.findall(Line)
            if Names:
                for Name in Names:
                    VertCoord_D[Name] = i
                MaxVert = i

        TreeCoords_D = {}
        for Key in self.NodeToSeq_D.keys():
            XCoord = (self.ModdedEvoDistance_D[Key] / self.LongestDistance
                      ) * self.TreeFigWIDTH + self.TreeFigXOffset
            YCoord = float(float(VertCoord_D[Key]) / float(MaxVert)
                           ) * self.TreeFigHEIGHT + self.TreeFigYOffset
            TreeCoords_D[Key] = [XCoord, YCoord]
        return TreeCoords_D

    def addNodeNamesAtNodePoints(self):
        """Add the name of every leaf at its node vertex."""
        for Key in self.FastMLTree.LeafKey_L:
            xStart, yStart = self.TreeCoords_D[Key][0], self.TreeCoords_D[Key][1]
            self.FigureSVG_D["TreeAndStates"].append(
                self._LEAF_NAME_SVG % (str(xStart), str(yStart), Key))

    def addVerticalLines(self):
        """Add the vertical lines of the tree image.

        For each branch the line runs at the parent's x from the parent's y
        to the child's y.
        """
        for branchKey in self.FastMLTree.BranchKey_L:
            fro, to = branchKey.split(">>")[0], branchKey.split(">>")[1]
            froXY = self.TreeCoords_D[fro]
            toXY = self.TreeCoords_D[to]
            self._appendBranchLine(branchKey, froXY[0], froXY[1], froXY[0],
                                   toXY[1])

    def addHorizontalLines(self):
        """Add the horizontal lines of the tree image.

        For each branch the line runs at the child's y from the parent's x
        to the child's x.
        """
        for branchKey in self.FastMLTree.BranchKey_L:
            fro, to = branchKey.split(">>")[0], branchKey.split(">>")[1]
            froXY = self.TreeCoords_D[fro]
            toXY = self.TreeCoords_D[to]
            self._appendBranchLine(branchKey, froXY[0], toXY[1], toXY[0],
                                   toXY[1])

    def makeTreeFig(self):
        """Run all methods necessary to make the tree image."""
        self.addNodeNamesAtNodePoints()
        self.addVerticalLines()
        self.addHorizontalLines()

    def addStateRows(self):
        """Add the rows for the mutated states in each sequence.

        First writes the rotated 1-based site numbers above the topmost
        node, then one coloured cell per mutated site for every leaf, on the
        same y as the leaf in the tree.
        """
        inc = self.StateInc
        vertInc = float(self.StateFigHEIGHT / float(len(self.LeafStates_D)))

        # smallest y over all tree nodes, i.e. the top of the drawing
        lowestY = float("inf")
        for Coords in self.TreeCoords_D.values():
            if Coords[1] < lowestY:
                lowestY = Coords[1]

        stateY = lowestY - (1.5 * vertInc)
        stateX = 0.0 + self.StateFigXOffset
        for i in self.StateIndices_L:
            self.FigureSVG_D["TreeAndStates"].append(
                self._STATE_INDEX_SVG % (str(stateX), str(stateY), str(stateX),
                                         str(stateY), str(i + 1)))
            stateX += inc

        for Key in self.LeafStates_D.keys():
            X = 0.0 + self.StateFigXOffset
            Y = self.TreeCoords_D[Key][1]
            for State in self.LeafStates_D[Key]:
                self._appendCell("TreeAndStates", X, Y, inc, vertInc,
                                 self._STATE_TEXT_SVG, State)
                X += inc

    def makeStatesFig(self):
        """Execute the method to make the states figure."""
        self.addStateRows()

    def parseScoringMatrix(self):
        """Parse the scoring matrix for the alignment to the PDB of interest.

        Uses the first ``<PDB_alignment>`` whose id (the part of ``<PDB_id>``
        before "|") equals ``PDBoInterest`` in upper case, and stores the
        lower-cased part after "|" in ``self.ChainoInterest``.

        Returns:
            Dict with the 0-based integers "Qstart", "Qend", "Sstart", "Send"
            and the aligned subject sequence "Sseq".

        Raises:
            AttributeError: if no alignment matches, because the tag searches
                then run against an empty string.
        """
        with open(self.MatrixPATH, "r") as MatrixFIn:
            allAlignments_L = re.findall("<PDB_alignment>.+?</PDB_alignment>",
                                         MatrixFIn.read(), re.DOTALL)

        KeyAln = ""
        for Alignment in allAlignments_L:
            IdParts_L = self._tagText("PDB_id", Alignment).split("|")
            if self.PDBoInterest.upper() == IdParts_L[0]:
                KeyAln = Alignment
                self.ChainoInterest = IdParts_L[1].lower()
                break

        return {
            "Qstart": int(self._tagText("Alignment_start_query", KeyAln)) - 1,
            "Qend": int(self._tagText("Alignment_end_query", KeyAln)) - 1,
            "Sstart": int(self._tagText("Alignment_start_subject", KeyAln)) - 1,
            "Send": int(self._tagText("Alignment_end_subject", KeyAln)) - 1,
            "Sseq": self._tagText("Aligned_subject_sequence", KeyAln),
        }

    def makeAlignmentFig(self):
        """Make the cartoon of all aligned sequences in the protein family.

        The first row is the aligned PDB sequence; each following row is the
        slice of a leaf sequence that starts at "Qstart" and is as long as
        the aligned PDB sequence. The row name is written after the last
        cell of the row.
        """
        Qstart = self.MatrixInfo["Qstart"]
        Qstop = Qstart + len(self.MatrixInfo["Sseq"])
        AllSeqs_L = [self.MatrixInfo["Sseq"]] + [
            self.NodeToSeq_D[Key][Qstart:Qstop]
            for Key in self.FastMLTree.LeafKey_L
        ]
        AllHeaders_L = [self.PDBoInterest] + self.FastMLTree.LeafKey_L

        xinc = self.AlnInc
        yinc = self.AlnInc
        Y = self.AlignmentFigYOffset
        for Header, Seq in zip(AllHeaders_L, AllSeqs_L):
            X = 0.0 + self.AlignmentFigXOffset
            for State in Seq:
                self._appendCell("Alignment", X, Y, xinc, yinc,
                                 self._ALN_TEXT_SVG, State)
                X += xinc
            self.FigureSVG_D["Alignment"].append(
                self._ALN_HEADER_SVG % (str(X + self.AlnInc), str(Y), Header))
            Y += yinc

    def getColoredStructureFile(self):
        """Get a PDB format file coloured to reflect mutated sites.

        Loads the PDB contents for ``PDBoInterest``, attaches them to the
        algorithm object of the branch of interest and asks it to write the
        coloured file into a temporary output file.

        Returns:
            The handle returned by ``getOutputTempFile()``.
        """
        DesiredBranchKey = self._findBranchOfInterest()
        PDBAndPDBXMLContents = getAllPDBFileDicts([self.PDBoInterest])
        SA = self.BranchToAlgorithm_D[DesiredBranchKey]
        SA.PDBContents_D = PDBAndPDBXMLContents[0]
        SA.PDBXMLContents_D = PDBAndPDBXMLContents[1]
        FH = getOutputTempFile()
        SA.createPDBColoredFile(self.PDBoInterest, FH.name)
        return FH