def pdb_to_json(text, name, parser=None):
    '''
    Create a graph-layout displaying a pdb file which
    presumably contains some RNA

    The text is the contents of the pdb file. It is written to a temporary
    file, parsed, and every protein or RNA chain is turned into one molecule
    entry. Chains that are neither (e.g. hetatm chains in MMCIF files) are
    ignored.

    :param text: The text of the pdb file.
    :param name: The name of the pdb file. It prefixes every molecule header.
    :param parser: The PDB parser to use (Bio.PDB.PDBParser or
                   Bio.PDB.MMCIFParser). A new Bio.PDB.PDBParser is used
                   when it is omitted.
    :return: A dictionary with the keys "molecules" (one entry per protein
             or RNA chain) and "extra_links" (one entry per interchain
             contact between two of those chains).
    '''
    if parser is None:
        # parser is advertised as optional, so fall back to the plain PDB
        # parser rather than failing on None.get_structure
        import Bio.PDB
        parser = Bio.PDB.PDBParser()

    molecules = []
    proteins = set()
    rnas = set()
    cgs = dict()

    with fus.make_temp_directory() as output_dir:
        fname = op.join(output_dir, '{}.pdb'.format(name))

        # dump the pdb text to a temporary file; leaving the with block
        # closes the file, so everything is on disk before it is parsed
        with open(fname, 'w') as f:
            f.write(text)

        struct = parser.get_structure('temp', fname)

        for chain in struct.get_chains():
            # create a graph json for each structure in the pdb file
            header = "{}_{}".format(name, chain.id)

            if ftup.is_protein(chain):
                # sys.stderr.write works identically on Python 2 and 3,
                # unlike the "print >>" statement
                sys.stderr.write("protein {}\n".format(chain))
                proteins.add(chain.id)

                # a protein is represented by a single node, hence one uid
                molecules.append({
                    "type": "protein",
                    "header": header,
                    "seq": "",
                    "ss": "",
                    "size": len(chain.get_list()),
                    "uids": [uuid.uuid4().hex]
                })
            elif ftup.is_rna(chain):
                sys.stderr.write("rna {}\n".format(chain))
                rnas.add(chain.id)

                # process RNA molecules (hopefully)
                # the positions are computed from the structure without
                # pseudoknots, everything else from the one with them
                cg = ftmc.from_pdb(fname, chain_id=chain.id,
                                   remove_pseudoknots=True, parser=parser)
                positions = fasta_to_positions(cg.to_fasta_string())

                cg = ftmc.from_pdb(fname, chain_id=chain.id,
                                   remove_pseudoknots=False, parser=parser)
                cgs[chain.id] = cg

                molecules.append({
                    "type": "rna",
                    "header": header,
                    "seq": cg.seq,
                    "ss": cg.to_dotbracket_string(),
                    "size": cg.seq_length,
                    "uids": [uuid.uuid4().hex for _ in range(cg.seq_length)],
                    "positions": positions
                })
            # else: hetatm type chains which are present in MMCIF files

    # create a lookup table linking the id and residue number to the uid of
    # that nucleotide and residue number
    node_ids = dict()
    for m in molecules:
        for i, uid in enumerate(m['uids']):
            node_ids["{}_{}".format(m['header'], i + 1)] = uid

    def nucleotide_uid(atom):
        # uid of the nucleotide that atom belongs to, found through the
        # index of its residue in the secondary structure
        chain_id = atom.parent.parent.id
        sid = cgs[chain_id].seq_ids.index(atom.parent.id)
        return node_ids["{}_{}_{}".format(name, chain_id, sid + 1)]

    def protein_uid(chain_id):
        # uid of the single node representing a protein chain
        return node_ids["{}_{}_{}".format(name, chain_id, 1)]

    links = []
    for (a1, a2) in ftup.interchain_contacts(struct):
        if a1.parent.id[0] != ' ' or a2.parent.id[0] != ' ':
            # hetatm's will be ignored for now
            continue

        chain1 = a1.parent.parent.id
        chain2 = a2.parent.parent.id

        # links refer to their end points by uid; for protein-RNA contacts
        # the nucleotide is always the source and the protein the target
        if chain1 in proteins and chain2 in rnas:
            links.append({
                "source": nucleotide_uid(a2),
                "target": protein_uid(chain1),
                "link_type": "protein_chain",
                "value": 3
            })
        elif chain2 in proteins and chain1 in rnas:
            links.append({
                "source": nucleotide_uid(a1),
                "target": protein_uid(chain2),
                "link_type": "protein_chain",
                "value": 3
            })
        elif chain2 in rnas and chain1 in rnas:
            links.append({
                "source": nucleotide_uid(a1),
                "target": nucleotide_uid(a2),
                "link_type": "chain_chain",
                "value": 3
            })

    return {"molecules": molecules, "extra_links": links}
def plot_pdb(filename, ax=None):
    """
    Plot a pdb file.

    Every RNA chain is drawn with plot_rna. Interchain RNA-RNA contacts are
    drawn as semi-transparent black lines. Every protein that contacts at
    least one nucleotide is drawn as a grey circle at the centroid of those
    nucleotides, connected to each of them by a line.

    :param filename: The path of the pdb file to plot.
    :param ax: Optional. The axes to draw on. It is passed on to plot_rna.
    :return: An Axes object (ax). This is the value returned by the last
             plot_rna call, or the ax argument if the file contains no RNA.
    """
    import collections as col

    structure = Bio.PDB.PDBParser().get_structure('blah', filename)
    model = list(structure)[0]

    chain_coords = {}
    cgs = {}

    # store a list of RNA nucleotides that each protein interacts with
    protein_interactions = col.defaultdict(set)
    protein_circles = []

    for chain in model:
        # iterate over RNAs
        if ftup.is_rna(chain):
            # convert to cg and store so that we can convert pdb nucleotide
            # ids to secondary structure indexes later
            cg = ftmc.from_pdb(filename, chain_id=chain.id)
            cgs[chain.id] = cg

            # plot the structure and store the coordinates; the caller's ax
            # (if any) is handed to the first call and then reused
            (ax, coords) = plot_rna(cg, offset=True, ax=ax)
            chain_coords[chain.id] = coords

    for (a1, a2) in ftup.interchain_contacts(structure):
        # iterate over all the interactions in order to find out which
        # nucleotides this protein interacts with
        chain1 = a1.parent.parent
        chain2 = a2.parent.parent

        if ftup.is_protein(chain1) and ftup.is_rna(chain2):
            # collect all the RNA nucleotides that a protein interacts with
            sid = cgs[chain2.id].seq_ids.index(a2.parent.id)
            protein_interactions[chain1.id].add((chain2.id, sid))

        if ftup.is_rna(chain1) and ftup.is_rna(chain2):
            sid1 = cgs[chain1.id].seq_ids.index(a1.parent.id)
            sid2 = cgs[chain2.id].seq_ids.index(a2.parent.id)

            coord1 = chain_coords[chain1.id][sid1]
            coord2 = chain_coords[chain2.id][sid2]

            ax.plot([coord1[0], coord2[0]],
                    [coord1[1], coord2[1]], 'k-', alpha=0.5)

    for chain in model:
        # draw each protein and the links that it has to other nucleotides
        if not ftup.is_protein(chain):
            continue

        # .get() avoids inserting empty entries into the defaultdict
        contacts = protein_interactions.get(chain.id)
        if not contacts:
            # without any interacting nucleotide there is no centroid to
            # place the protein at, so it cannot be drawn
            continue

        # the protein will be positioned at the centroid of the nucleotides
        # that it interacts with
        interacting_coords = [np.array(chain_coords[chain_id][nuc_num])
                              for (chain_id, nuc_num) in contacts]
        centroid = np.sum(interacting_coords, axis=0) / len(interacting_coords)

        # the radius of the circle representing it grows with the square
        # root of the number of entries in the chain
        radius = 2 * math.sqrt(len(chain.get_list()))
        protein_circles.append([centroid[0], centroid[1], radius])

        # draw all of the interactions as lines
        for coord in interacting_coords:
            ax.plot([coord[0], centroid[0]],
                    [coord[1], centroid[1]], 'k-', alpha=0.5)

    protein_circles = np.array(protein_circles)
    if len(protein_circles) > 0:
        circles(protein_circles[:, 0], protein_circles[:, 1],
                protein_circles[:, 2], 'grey', alpha=0.5)

    return ax
def pdb_to_json(text, name):
    '''
    Create a graph-layout displaying a pdb file which
    presumably contains some RNA

    The text is the contents of the pdb file. It is written to a temporary
    file and parsed. Every protein chain becomes a json with a single node
    and every other chain is converted with bg_to_json.

    :param text: The text of the pdb file.
    :param name: The name of the pdb file. It prefixes the struct_name of
                 the protein nodes and the keys used to look up nodes.
    :return: A dictionary with the keys "jsons" and "extra_links". "jsons"
             holds one graph json per chain followed by one json without
             nodes that carries the interchain links; "extra_links" is that
             same list of links.
    '''
    jsons = []
    proteins = set()
    rnas = set()
    cgs = dict()

    with fus.make_temp_directory() as output_dir:
        fname = op.join(output_dir, '{}.pdb'.format(name))

        # dump the pdb text to a temporary file; leaving the with block
        # closes the file, so everything is on disk before it is parsed
        with open(fname, 'w') as f:
            f.write(text)

        struct = bpdb.PDBParser().get_structure('temp', fname)

        for chain in struct.get_chains():
            # create a graph json for each structure in the pdb file
            if ftup.is_protein(chain):
                proteins.add(chain.id)
                # process protein
                jsons.append({
                    "nodes": [{
                        "group": 2,
                        "struct_name": "{}_{}".format(name, chain.id),
                        "id": 1,
                        "size": len(chain.get_list()),
                        "name": chain.id,
                        "node_type": "protein"
                    }],
                    "links": []
                })
            else:
                rnas.add(chain.id)
                # process RNA molecules (hopefully)
                cg = ftmc.from_pdb(fname, chain_id=chain.id)
                cgs[chain.id] = cg
                jsons.append(bg_to_json(cg))

    # create a lookup table to find out the index of each node in the
    # what will eventually become the large list of nodes
    counter = 0
    node_ids = dict()
    for j in jsons:
        for n in j['nodes']:
            node_ids["{}_{}".format(n['struct_name'], n['id'])] = counter
            counter += 1

    # the source and target values below need to be reduced by the length of
    # the nodes array because when the jsons are added to the graph, the link
    # source and target are incremented so as to correspond to the new
    # indices of the nodes
    # so a link to a node at position 10, if there are 50 nodes, will have to
    # have a source value of -40
    def nucleotide_index(atom):
        # offset index of the node of the nucleotide that atom belongs to,
        # found through the index of its residue in the secondary structure
        chain_id = atom.parent.parent.id
        sid = cgs[chain_id].seq_ids.index(atom.parent.id)
        return node_ids["{}_{}_{}".format(name, chain_id, sid + 1)] - counter

    def protein_index(chain_id):
        # offset index of the single node representing a protein chain
        return node_ids["{}_{}_{}".format(name, chain_id, 1)] - counter

    links = []
    for (a1, a2) in ftup.interchain_contacts(struct):
        if a1.parent.id[0] != ' ' or a2.parent.id[0] != ' ':
            # hetatm's will be ignored for now
            continue

        chain1 = a1.parent.parent.id
        chain2 = a2.parent.parent.id

        # for protein-RNA contacts the nucleotide is always the source and
        # the protein the target
        if chain1 in proteins and chain2 in rnas:
            links.append({
                "source": nucleotide_index(a2),
                "target": protein_index(chain1),
                "link_type": "protein_chain",
                "value": 3
            })
        elif chain2 in proteins and chain1 in rnas:
            links.append({
                "source": nucleotide_index(a1),
                "target": protein_index(chain2),
                "link_type": "protein_chain",
                "value": 3
            })
        elif chain2 in rnas and chain1 in rnas:
            links.append({
                "source": nucleotide_index(a1),
                "target": nucleotide_index(a2),
                "link_type": "chain_chain",
                "value": 3
            })

    # the links travel both as a node-less json and as "extra_links"
    jsons.append({"nodes": [], "links": links})

    return {"jsons": jsons, "extra_links": links}
def test_interchain_contacts(self):
    # Smoke test: interchain_contacts only has to run on a parsed structure
    # without raising; its result is not inspected.
    pdb_parser = bpdb.PDBParser()
    structure = pdb_parser.get_structure(
        "temp", 'test/forgi/threedee/data/1MFQ.pdb')
    ftup.interchain_contacts(structure)
def test_interchain_contacts(self):
    """Run interchain_contacts on the 1MFQ structure; passes if nothing raises."""
    pdb_path = 'test/forgi/threedee/data/1MFQ.pdb'
    parsed = bpdb.PDBParser().get_structure("temp", pdb_path)
    ftup.interchain_contacts(parsed)
def test_interchain_contacts(self):
    """Run interchain_contacts on the 1MFQ structure; passes if nothing raises."""
    pdb_path = 'test/forgi/threedee/data/1MFQ.pdb'
    pdb_parser = bpdb.PDBParser()

    # warnings emitted while the file is parsed are silenced
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        parsed = pdb_parser.get_structure("temp", pdb_path)

    ftup.interchain_contacts(parsed)
def plot_pdb(filename, ax=None):
    """
    Plot the secondary structure of an RNA in a PDB file using the
    Graph Layout from the ViennaRNA package and indicate long-range
    interations and RNA-protein interactions.

    Interchain RNA-RNA interactions are shown as semi-transparent black
    lines. Proteins are shown as transparent gray circles with lines
    indicating the interacting residues. The circle radius grows with the
    square root of the length of the protein chain. Proteins without any
    contact to a plotted nucleotide are not drawn.

    :param filename: The path of the PDB file to plot.
    :param ax: Optional. An matplotlib axis object
    :return: An Axes object (ax). This is the value returned by the last
             plot_rna call, or the ax argument if no chain contains RNA.
    """
    import collections as col

    structure = Bio.PDB.PDBParser().get_structure('blah', filename)
    model = list(structure)[0]

    chain_coords = {}
    cgs = {}

    # store a list of RNA nucleotides that each protein interacts with
    protein_interactions = col.defaultdict(set)
    protein_circles = []

    for chain in model:
        # iterate over RNAs
        if ftup.contains_rna(chain):
            # convert to cg and store so that we can convert pdb nucleotide
            # ids to secondary structure indexes later
            cg, = ftmc.CoarseGrainRNA.from_pdb(filename,
                                               load_chains=chain.id)
            cgs[chain.id] = cg

            # plot the structure and store the coordinates
            (ax, coords) = plot_rna(cg, offset=True, ax=ax)
            chain_coords[chain.id] = coords

    def seq_index(chain, atom):
        # zero-based index in the chain's sequence of the residue that atom
        # belongs to; to_integer raises ValueError for unknown residues
        resid = fgr.RESID(chain.id, atom.parent.id)
        return cgs[chain.id].seq.to_integer(resid) - 1

    for (a1, a2) in ftup.interchain_contacts(structure):
        # iterate over all the interactions in order to find out which
        # nucleotides this protein interacts with
        chain1 = a1.parent.parent
        chain2 = a2.parent.parent

        if ftup.is_protein(chain1) and ftup.contains_rna(chain2):
            # collect all the RNA nucleotides that a protein interacts with
            try:
                sid = seq_index(chain2, a2)
            except ValueError:
                # same policy as for RNA-RNA contacts below: residues that
                # are not part of the sequence are ignored
                pass
            else:
                protein_interactions[chain1.id].add((chain2.id, sid))

        if ftup.contains_rna(chain1) and ftup.contains_rna(chain2):
            try:
                sid1 = seq_index(chain1, a1)
                sid2 = seq_index(chain2, a2)
            except ValueError:
                continue

            coord1 = chain_coords[chain1.id][sid1]
            coord2 = chain_coords[chain2.id][sid2]

            ax.plot([coord1[0], coord2[0]],
                    [coord1[1], coord2[1]], 'k-', alpha=0.5)

    for chain in model:
        # draw each protein and the links that it has to other nucleotides
        if not ftup.is_protein(chain):
            continue

        # .get() avoids inserting empty entries into the defaultdict
        contacts = protein_interactions.get(chain.id)
        if not contacts:
            # without any interacting nucleotide there is no centroid to
            # place the protein at, so it cannot be drawn
            continue

        # the protein will be positioned at the centroid of the nucleotides
        # that it interacts with
        interacting_coords = [np.array(chain_coords[chain_id][nuc_num])
                              for (chain_id, nuc_num) in contacts]
        centroid = np.sum(interacting_coords, axis=0) / \
            len(interacting_coords)

        # the radius of the circle representing it grows with the square
        # root of the number of entries in the chain
        radius = 2 * math.sqrt(len(chain.get_list()))
        protein_circles.append([centroid[0], centroid[1], radius])

        # draw all of the interactions as lines
        for coord in interacting_coords:
            ax.plot([coord[0], centroid[0]],
                    [coord[1], centroid[1]], 'k-', alpha=0.5)

    protein_circles = np.array(protein_circles)
    if len(protein_circles) > 0:
        circles(protein_circles[:, 0], protein_circles[:, 1],
                protein_circles[:, 2], 'grey', alpha=0.5)

    return ax