def write_backbone_angles(chain, region=None, offset=0, outfile=sys.stdout, header=False):
    """Print one tab-separated row of backbone angles per residue of ``chain``.

    Each row holds: chain id, residue number, the residue name passed through
    ``seq1``, residue number plus ``offset``, phi and psi.  Angles are
    multiplied by ``RAD_FACTOR`` (presumably a radians-to-degrees factor --
    confirm at its definition); a missing angle is written as ``NA``.

    :param chain: chain whose polypeptides are built with ``PPBuilder``.
    :param region: optional ``(first, last)`` inclusive residue-number window;
        ``None`` keeps every residue numbered 0 or above.
    :param offset: value added to the residue number in the fourth column.
    :param outfile: file-like object the rows are printed to.
    :param header: when true, ``HEADER`` is printed before the rows.
    """
    if region is None:
        lower, upper = 0, float('inf')
    else:
        lower, upper = region[0], region[1]

    peptides = PPBuilder().build_peptides(chain)

    if header:
        print(HEADER, file=outfile)

    for peptide in peptides:
        for residue, (phi, psi) in zip(peptide, peptide.get_phi_psi_list()):
            position = residue.get_id()[1]
            if not (lower <= position <= upper):
                continue
            phi_field = 'NA' if phi is None else phi * RAD_FACTOR
            psi_field = 'NA' if psi is None else psi * RAD_FACTOR
            print(chain.id, position, seq1(residue.get_resname()),
                  position + offset, phi_field, psi_field,
                  sep='\t', file=outfile)
def SplitChain(PDB_objects):
    """
    Splits a list of PDB files by chain creating one PDB and one FASTA file per chain.

    For every chain id not seen before in its PDB object, ``<pdb id>_<chain id>.pdb``
    is saved and, when the chain yields at least one polypeptide,
    ``<pdb id>_<chain id>.fa`` is written with a single header line followed by
    the concatenated sequences of that chain's polypeptide fragments.

    Arguments:
        PDB_objects: list of PDB objects (with many chains) generated by the PDB parser.

    Returns:
        List of ``<pdb id>_<chain id>`` prefixes, one per chain written.
    """
    File_prefix = []
    for pdb in PDB_objects:
        chain_names = set()
        io = PDBIO()
        for chain in pdb.get_chains():
            chain_id = chain.get_id()
            if chain_id in chain_names:
                # Only the first chain carrying a given id is exported.
                continue
            prefix = pdb.get_id() + "_" + chain_id

            # Creates a PDB file for each chain of the original file.
            io.set_structure(chain)
            io.save(prefix + ".pdb")
            File_prefix.append(prefix)

            # Creates a FASTA file for each chain of the original file.
            # Peptides are built from this chain only; building them from the
            # whole structure and reopening the file in "w" mode per peptide
            # left every FASTA file holding just the last peptide of the
            # structure.
            fragments = PPBuilder().build_peptides(chain)
            if fragments:
                with open(prefix + ".fa", "w") as fasta:
                    fasta.write(">" + prefix + "\n")
                    fasta.write("".join(str(pp.get_sequence()) for pp in fragments))

            chain_names.add(chain_id)
    return File_prefix
def CreateJoinedFastas(input_PDB_objects):
    """
    Joins many PDB objects and creates a FASTA file with all objects joined.

    The file name is every object id followed by ``_``, concatenated, plus
    ``.fa``.  Each object contributes one ``>id`` header line followed by the
    sequences of all its polypeptides written back to back; records are
    separated by a newline and the file does not end with one.

    Arguments:
        input_PDB_objects: list of PDB objects whose sequence will be added to the FASTA file.

    Returns:
        The name of the FASTA file that was written.
    """
    polipeptide = PPBuilder()

    # Build the output file name from all object ids.
    filename = "".join(obj.get_id() + "_" for obj in input_PDB_objects) + ".fa"

    # The context manager closes the file even if a write fails.
    with open(filename, 'w') as joined_fasta:
        for position, obj in enumerate(input_PDB_objects):
            if position:
                # Separator between consecutive records.
                joined_fasta.write("\n")
            joined_fasta.write(">" + obj.get_id() + "\n")
            for polipep in polipeptide.build_peptides(obj):
                joined_fasta.write(str(polipep.get_sequence()))
    return filename
def compute_secondary_structure(self, model):
    """
    Classify every residue of ``model`` by its position on the Ramachandran plot.

    Phi/psi are computed per chain and converted to degrees; residues lacking
    either angle get NaN for both.  If fewer residues than ``self._residues``
    were seen, NaN entries are padded under the pseudo-chain ``'Z'``.  Each
    entry is then matched against ``self._ranges``, whose items unpack as
    ``(x, y, width, height, label, color)``.

    :param model: one model (an iterable of chains)
    :return: list with one label per residue: ``'-'`` when the angles are
        unavailable, the label of the first matching range otherwise, or
        ``None`` when no range matches
    """
    builder = PPBuilder()
    # {chain id: [[residues], [phi in degrees], [psi in degrees]]}
    rama = {}
    residue_found = 0

    for chain in model:
        for peptide in builder.build_peptides(chain):
            angle_pairs = peptide.get_phi_psi_list()
            for index, residue in enumerate(peptide):
                phi, psi = angle_pairs[index]
                if phi is None or psi is None:
                    # Termini lack one of the two angles: store NaN for both.
                    phi_deg = psi_deg = math.nan
                else:
                    phi_deg = math.degrees(phi)
                    psi_deg = math.degrees(psi)
                columns = rama.setdefault(chain.id, [[], [], []])
                columns[0].append(residue)
                columns[1].append(phi_deg)
                columns[2].append(psi_deg)
                residue_found += 1

    # NaN padding for residues the polypeptide builder did not return.
    missing = self._residues - residue_found
    if missing > 0:
        padding = rama.setdefault('Z', [[], [], []])
        for _ in range(missing):
            padding[0].append(None)
            padding[1].append(math.nan)
            padding[2].append(math.nan)

    # Compare the angles with the Ramachandran regions.
    ss = []
    for _residues, phis, psis in rama.values():
        for phi, psi in zip(phis, psis):
            if math.isnan(phi) and math.isnan(psi):
                ss.append('-')
                continue
            label = None
            for x, y, width, height, region_label, _color in self._ranges:
                if x <= phi < x + width and y <= psi < y + height:
                    label = region_label
                    break
            ss.append(label)
    return ss
def get_secondary_structure(structure):
    """Label residues by the Ramachandran region their phi/psi angles fall in.

    The angles returned by ``get_phi_psi_list`` are converted from radians to
    degrees before being compared against the degree-based region table; the
    previous code compared the raw radian values, so almost every residue
    landed in the same region.

    :param structure: iterable of chains (each chain is passed to
        ``PPBuilder.build_peptides``).
    :return: list of length ``N`` (module-level name) holding a region label
        (``'E'``, ``'P'``, ``'H'`` or ``'L'``) or ``""`` where nothing was
        assigned.
    """
    import math  # local import: the block is self-sufficient for the conversion

    # (x, y, width, height, label, color); x/y/width/height are in degrees.
    rama_ss_ranges = [(-180, -180, 80, 60, 'E', 'blue'),
                      (-180, 50, 80, 130, 'E', 'blue'),
                      (-100, -180, 100, 60, 'P', 'green'),
                      (-100, 50, 100, 130, 'P', 'green'),
                      (-180, -120, 180, 170, 'H', 'red'),
                      (0, -180, 180, 360, 'L', 'yellow')]

    # Calculate PSI and PHI
    ppb = PPBuilder()  # PolyPeptideBuilder
    ss = ["" for x in range(N)]
    for chain in structure:
        for pp in ppb.build_peptides(chain):
            phi_psi = pp.get_phi_psi_list()  # [(phi_residue_1, psi_residue_1), ...]
            # NOTE(review): ``i`` restarts at 0 for every polypeptide, so later
            # polypeptides/chains overwrite earlier entries of ``ss`` -- confirm
            # whether a running residue index was intended.
            for i, residue in enumerate(pp):
                phi, psi = phi_psi[i]
                # First and last residues have a None angle: leave them unlabelled.
                if phi is None or psi is None:
                    continue
                phi_deg = math.degrees(phi)
                psi_deg = math.degrees(psi)
                for x, y, w, h, ss_c, color in rama_ss_ranges:
                    if x <= phi_deg < x + w and y <= psi_deg < y + h:
                        ss[i] = ss_c
                        break
    return ss
def test_ppbuilder_torsion(self):
    """Test phi/psi angles calculated with PPBuilder."""
    # Expected (phi, psi) in radians for the first three residues of the
    # first three polypeptides; the first residue of each has no phi.
    expected = (
        ((None, -0.46297171497725553),
         (-1.0873937604007962, 2.1337707832637109),
         (-2.4052232743651878, 2.3807316946081554)),
        ((None, -0.6810077089092923),
         (-1.2654003477656888, -0.58689987042756309),
         (-1.7467679151684763, -1.5655066256698336)),
        ((None, -0.73222884210889716),
         (-1.1044740234566259, -0.69681334592782884),
         (-1.8497413300164958, 0.34762889834809058)),
    )
    peptides = PPBuilder().build_peptides(self.structure)
    for peptide_index, rows in enumerate(expected):
        phi_psi = peptides[peptide_index].get_phi_psi_list()
        for residue_index, (phi, psi) in enumerate(rows):
            if phi is None:
                self.assertIsNone(phi_psi[residue_index][0])
            else:
                self.assertAlmostEqual(phi_psi[residue_index][0], phi, places=3)
            self.assertAlmostEqual(phi_psi[residue_index][1], psi, places=3)
def test_c_n(self):
    """Extract polypeptides using C-N."""
    peptides = PPBuilder().build_peptides(self.structure[1])
    self.assertEqual(len(peptides), 1)
    only_peptide = peptides[0]
    # The single polypeptide must span residues 2 through 86.
    first_number = only_peptide[0].get_id()[1]
    self.assertEqual(first_number, 2)
    last_number = only_peptide[-1].get_id()[1]
    self.assertEqual(last_number, 86)
def get_structure_sequence(struct):
    # type: (Structure) -> str
    """
    Concatenate the sequences of every polypeptide PPBuilder finds in ``struct``.

    :param struct: Structure object
    :return: the polypeptide sequences joined, in builder order, into one string
    """
    fragments = PPBuilder().build_peptides(struct)
    return ''.join(str(fragment.get_sequence()) for fragment in fragments)
def test_polypeptide(self):
    """Tests on polypetide class and methods."""
    expected_sequences = (
        "DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW",
        "TETLLVQNANPDCKTILKALGPGATLEE",
        "TACQG",
    )
    # Per polypeptide: psi of residue 0, then (phi, psi) of residues 1 and 2.
    expected_torsions = (
        (-0.46297171497725553,
         (-1.0873937604007962, 2.1337707832637109),
         (-2.4052232743651878, 2.3807316946081554)),
        (-0.6810077089092923,
         (-1.2654003477656888, -0.58689987042756309),
         (-1.7467679151684763, -1.5655066256698336)),
        (-0.73222884210889716,
         (-1.1044740234566259, -0.69681334592782884),
         (-1.8497413300164958, 0.34762889834809058)),
    )
    expected_ca_serials = [
        10, 18, 26, 37, 46, 50, 57, 66, 75, 82, 93, 104, 112, 124, 131, 139,
        150, 161, 173, 182, 189, 197, 208, 213, 222, 231, 236, 242, 251, 260,
        267, 276, 284,
    ]

    s = PDBParser(PERMISSIVE=True).get_structure("scr", "PDB/1A8O.pdb")

    # C-N distance based builder.
    pp = PPBuilder().build_peptides(s)
    for index, sequence in enumerate(expected_sequences):
        self.assertEqual(str(pp[index].get_sequence()), sequence)
    for index, (first_psi, second, third) in enumerate(expected_torsions):
        phi_psi = pp[index].get_phi_psi_list()
        self.assertEqual(phi_psi[0][0], None)
        self.assertAlmostEqual(phi_psi[0][1], first_psi, places=3)
        self.assertAlmostEqual(phi_psi[1][0], second[0], places=3)
        self.assertAlmostEqual(phi_psi[1][1], second[1], places=3)
        self.assertAlmostEqual(phi_psi[2][0], third[0], places=3)
        self.assertAlmostEqual(phi_psi[2][1], third[1], places=3)

    # CA-CA distance based builder.
    pp = CaPPBuilder().build_peptides(s)
    for index, sequence in enumerate(expected_sequences):
        self.assertEqual(str(pp[index].get_sequence()), sequence)
    self.assertEqual([ca.serial_number for ca in pp[0].get_ca_list()],
                     expected_ca_serials)

    taus = pp[1].get_tau_list()
    for index, tau in enumerate((0.3597907225123525,
                                 0.43239284636769254,
                                 0.99820157492712114)):
        self.assertAlmostEqual(taus[index], tau, places=3)

    thetas = pp[2].get_theta_list()
    for index, theta in enumerate((1.6610069445335354,
                                   1.7491703334817772,
                                   2.0702447422720143)):
        self.assertAlmostEqual(thetas[index], theta, places=3)
def is_protein(chain):
    """
    Check if chain is a protein.

    A chain counts as protein when PPBuilder yields at least one polypeptide
    with a non-empty sequence.

    :param chain: chain to inspect
    :return: True if such a polypeptide exists, False otherwise
    """
    peptides = PPBuilder().build_peptides(chain)
    return any(len(peptide.get_sequence()) > 0 for peptide in peptides)
def chain_to_one_pp(chain):
    """Return the chain's polypeptides merged into the first one.

    A warning is printed whenever the builder does not return exactly one
    polypeptide; every further fragment is appended in place to the first.
    Raises IndexError when the chain yields no polypeptide at all.
    """
    fragments = PPBuilder().build_peptides(chain)
    fragment_count = len(fragments)
    if fragment_count != 1:
        print('warning ', fragment_count, ' polypeptides from one chain, extending first pp')
    merged = fragments[0]
    for extra in fragments[1:]:
        merged.extend(extra)
    return merged
def structure_filtered_dca_get_sequence_from_structure(structure):
    """Return the sequences of all polypeptides in ``structure`` as one string.

    Polypeptides are built with ``PPBuilder(radius=10.0)`` and
    ``aa_only=False``; their sequences are joined in builder order with all
    newline characters removed.
    """
    from Bio.PDB import PPBuilder

    builder = PPBuilder(radius=10.0)
    peptides = builder.build_peptides(structure, aa_only=False)
    # One sequence per line first, then strip every newline.
    lines = ''.join('%s\n' % peptide.get_sequence() for peptide in peptides)
    return lines.replace('\n', '')
def run_test():
    """Print a summary of ``PDB/a_structure.pdb`` and run the neighbor search test.

    For every model, chain and residue the element counts are printed,
    together with notes on point mutations and disordered atoms.  Then the
    polypeptides of model 1 are printed, built first from C-N and then from
    CA-CA distances, and finally ``quick_neighbor_search_test`` is called.

    ``print`` is called as a function with a single pre-formatted argument,
    which produces the same output under Python 2 and Python 3; the former
    print statements were a syntax error under Python 3.
    """
    from Bio.PDB import PDBParser, PPBuilder, CaPPBuilder

    # first make a PDB parser object
    p = PDBParser(PERMISSIVE=1)

    # get the structure, call it "example"
    structure = p.get_structure("example", "PDB/a_structure.pdb")

    # now loop over content and print some info
    for model in structure.get_list():
        model_id = model.get_id()
        print("Model %i contains %i chains." % (model_id, len(model)))
        for chain in model.get_list():
            chain_id = chain.get_id()
            print("\tChain '%s' contains %i residues." % (chain_id, len(chain)))
            for residue in chain.get_list():
                residue_id = residue.get_id()
                hetfield, resseq, icode = residue_id
                print("\t\tResidue ('%s', %i, '%s') contains %i atoms." % (hetfield, resseq, icode, len(residue)))
                # check if there is disorder due to a point mutation --- this is rare
                if residue.is_disordered() == 2:
                    print("\t\t\tThere is a point mutation present in the crystal at this position.")
                    s = "\t\t\tResidues at this position are "
                    for resname in residue.disordered_get_id_list():
                        s = s + resname + " "
                    print(s[:-1] + ".")
                # count the number of disordered atoms
                if residue.is_disordered() == 1:
                    disordered_count = 0
                    for atom in residue.get_list():
                        if atom.is_disordered():
                            disordered_count = disordered_count + 1
                    if disordered_count > 0:
                        print("\t\t\tThe residue contains %i disordered atoms." % disordered_count)

    print("Polypeptides using C-N")
    ppb = PPBuilder()
    for pp in ppb.build_peptides(structure[1]):
        print(pp)

    print("Polypeptides using CA-CA")
    ppb = CaPPBuilder()
    for pp in ppb.build_peptides(structure[1]):
        print(pp)

    print("NeighborSearch test")
    quick_neighbor_search_test()
def run(infile, splitpdb):
    """Write FASTA (and PDB) files for the chains of the PDB file ``infile``.

    Output names are derived from ``infile``'s base name without extension
    (``prefix``) and are written to the current working directory.

    :param infile: path of the PDB file to read.
    :param splitpdb: ``0`` writes every polypeptide into one ``<prefix>.fasta``
        and saves the whole structure as ``<prefix>_clean.pdb``; ``1`` writes
        ``<prefix>_<chain>.<index>.fasta`` and a matching ``.pdb`` (through
        ``Dice_RPL.extract``) for each polypeptide fragment.
    :return: list of chain ids in iteration order, across all models.

    Chains that yield no polypeptide are extracted as a whole, from their
    first to their last residue number, into ``<prefix>_<chain>.0.pdb``.
    """
    parser = PDBParser()
    struct = parser.get_structure('mystruct', infile)
    ppb = PPBuilder()
    basename = os.path.basename(infile)
    prefix = os.path.splitext(basename)[0]

    if splitpdb == 0:
        # We do NOT split the PDB and fasta files!
        seqfile = open(prefix + '.fasta', 'w')
        pdbio = PDBIO_RPL.PDBIO()
        pdbio.set_structure(struct)
        cleanfile = prefix + '_clean.pdb'
        pdbio.save(cleanfile)

    ListChains = []
    for model in struct:
        for chain in model:
            ListChains.append(chain.id)
            ListPpdb = ppb.build_peptides(chain)
            if len(ListPpdb) > 0:
                for index, pp in enumerate(ListPpdb):
                    if splitpdb == 1:
                        # We split the PDB and fasta files!
                        seqfile = open(
                            prefix + '_' + chain.id + '.' + str(index) + '.fasta',
                            'w')
                    seq = pp.get_sequence()
                    seqfile.write('>%s %s\n' % (prefix + '_chain_' + chain.id
                                                + '_' + str(index), len(seq)))
                    seqfile.write('%s' % seq)
                    seqfile.write('\n')
                    if splitpdb == 1:
                        # We split the PDB and fasta files!
                        seqfile.close()
                        startres = pp[0].id[1]
                        endres = pp[-1].id[1]
                        ofile = prefix + '_' + chain.id + '.' + str(
                            index) + '.pdb'
                        Dice_RPL.extract(struct, chain.id, startres, endres,
                                         ofile)
            else:
                # Also split chains that do not consist of amino acids!
                # NOTE(review): this branch runs for any ``splitpdb`` value, as
                # the original token order suggests -- confirm.
                ChainList = chain.get_list()
                startres = ChainList[0].id[1]
                # End at the residue number of the LAST residue; the previous
                # ``ChainList[0].id[-1]`` was the insertion code of the first.
                endres = ChainList[-1].id[1]
                # No fragment index exists for such a chain; use 0 instead of
                # an unbound or stale ``index`` left over from another chain.
                ofile = prefix + '_' + chain.id + '.' + str(0) + '.pdb'
                Dice_RPL.extract(struct, chain.id, startres, endres, ofile)

    if splitpdb == 0:
        # We do NOT split the PDB and fasta files!
        seqfile.close()
    return ListChains
def run_test():
    """Print a summary of ``PDB/a_structure.pdb`` and run the neighbor search test.

    Reports the element counts of every model, chain and residue, flags point
    mutations and disordered atoms, prints the polypeptides of model 1 built
    from C-N and from CA-CA distances, and calls
    ``quick_neighbor_search_test``.

    Every ``print`` is a function call with one pre-formatted argument, so
    the text is the same under Python 2 and Python 3 (the former print
    statements do not compile under Python 3).
    """
    from Bio.PDB import PDBParser, PPBuilder, CaPPBuilder

    # first make a PDB parser object
    p = PDBParser(PERMISSIVE=1)

    # get the structure, call it "example"
    structure = p.get_structure("example", "PDB/a_structure.pdb")

    # now loop over content and print some info
    for model in structure.get_list():
        model_id = model.get_id()
        print("Model %i contains %i chains." % (model_id, len(model)))
        for chain in model.get_list():
            chain_id = chain.get_id()
            print("\tChain '%s' contains %i residues." % (chain_id, len(chain)))
            for residue in chain.get_list():
                residue_id = residue.get_id()
                hetfield, resseq, icode = residue_id
                print("\t\tResidue ('%s', %i, '%s') contains %i atoms." % (
                    hetfield, resseq, icode, len(residue)))
                # check if there is disorder due to a point mutation --- this is rare
                if residue.is_disordered() == 2:
                    print("\t\t\tThere is a point mutation present in the crystal at this position.")
                    s = "\t\t\tResidues at this position are "
                    for resname in residue.disordered_get_id_list():
                        s = s + resname + " "
                    print(s[:-1] + ".")
                # count the number of disordered atoms
                if residue.is_disordered() == 1:
                    disordered_count = 0
                    for atom in residue.get_list():
                        if atom.is_disordered():
                            disordered_count = disordered_count + 1
                    if disordered_count > 0:
                        print("\t\t\tThe residue contains %i disordered atoms." % disordered_count)

    print("Polypeptides using C-N")
    ppb = PPBuilder()
    for pp in ppb.build_peptides(structure[1]):
        print(pp)

    print("Polypeptides using CA-CA")
    ppb = CaPPBuilder()
    for pp in ppb.build_peptides(structure[1]):
        print(pp)

    print("NeighborSearch test")
    quick_neighbor_search_test()
def test_c_n(self):
    """Extract polypeptides using C-N."""
    expected_sequence = (
        "RCGSQGGGSTCPGLRCCSIWGWCGDSEPYCGRTCENKCWSGER"
        "SDHRCGAAVGNPPCGQDRCCSVHGWCGGGNDYCSGGNCQYRC"
    )
    peptides = PPBuilder().build_peptides(self.structure[1])
    self.assertEqual(len(peptides), 1)
    peptide = peptides[0]

    # Check the start and end positions
    first_number = peptide[0].get_id()[1]
    self.assertEqual(first_number, 2)
    last_number = peptide[-1].get_id()[1]
    self.assertEqual(last_number, 86)

    # Check the sequence
    sequence = peptide.get_sequence()
    self.assertTrue(isinstance(sequence, Seq))
    self.assertEqual(sequence.alphabet, generic_protein)
    self.assertEqual(expected_sequence, str(sequence))
def test_insertions(self):
    """Test file with residue insertion codes."""
    refseq = (
        "IIGGEFTTIENQPWFAAIYRRHRGGSVTYVCGGSLISPCWVISATHCFIDYPKKEDYIVYLGR"
        "SRLNSNTQGEMKFEVENLILHKDYSADTLAYHNDIALLKIRSKEGRCAQPSRTIQTIALPSMY"
        "NDPQFGTSCEITGFGKEQSTDYLYPEQLKMTVVKLISHRECQQPHYYGSEVTTKMLCAADPQW"
        "KTDSCQGDSGGPLVCSLQGRMTLTGIVSWGRGCALKDKPGVYTRVSHFLPWIRSHTKE"
    )
    parser = MMCIFParser(QUIET=1)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", PDBConstructionWarning)
        structure = parser.get_structure("example", "PDB/4ZHL.cif")

    builders = [PPBuilder(), CaPPBuilder()]
    for builder in builders:
        # Non-standard amino acids are allowed here (second argument False).
        peptides = builder.build_peptides(structure[0], False)
        self.assertEqual(len(peptides), 2)

        # Only the first segment is checked in detail.
        first_segment = peptides[0]
        self.assertEqual(first_segment[0].get_id()[1], 16)
        self.assertEqual(first_segment[-1].get_id()[1], 244)

        sequence = first_segment.get_sequence()
        self.assertIsInstance(sequence, Seq)
        self.assertEqual(sequence.alphabet, generic_protein)
        self.assertEqual(refseq, str(sequence))
def read_pdb_file(file_name, name=None):
    """
    Extract info from a PDB file

    file_name: path of pdb file
    name: name of the structure (default: file_name without its extension)

    return:: (protein_name, structure, residues, sequence, atoms)
        protein_name: base name of file_name with ".pdb" removed
        structure: structure object
        residues: residues of all polypeptides, in polypeptide order
        sequence: combined sequence (for all polypeptides)
        atoms: all atoms of the structure

    Polypeptides are built with PPBuilder; CaPPBuilder is used instead when
    PPBuilder finds none.  Raises ValueError when the parsed structure does
    not have length 1.
    """
    structure_name = splitext(file_name)[0] if name is None else name
    structure = PDBParser().get_structure(structure_name, file_name)
    if len(structure) != 1:
        raise ValueError("Unexpected number of structures in " + structure_name)

    atoms = Selection.unfold_entities(structure, 'A')

    polypeptides = PPBuilder().build_peptides(structure)
    if len(polypeptides) == 0:
        polypeptides = CaPPBuilder().build_peptides(structure)

    sequence = ''.join(str(peptide.get_sequence()) for peptide in polypeptides)
    residues = []
    for peptide in polypeptides:
        residues.extend(peptide)

    protein_name = os.path.basename(file_name).replace(".pdb", "")
    return protein_name, structure, residues, sequence, atoms
def get_sequence(self, chain_id):
    """
    Return the amino acid sequence of one chain of the stored structure.

    Input:
        self: Use Biopython.PDB structure which has been stored in an object variable
        chain_id : String (usually in ['A','B', 'C' ...]. The number of chains
                   depends on the specific protein and the resulting structure)
    Return:
        The amino acid sequence (single-letter alphabet!) of the given chain
        (chain_id) in the first model of the structure, as a string.  The
        sequences of all polypeptide fragments of the chain are concatenated
        in builder order; the result is '' when the chain yields none.
    """
    chain = self.structure[0][chain_id]
    # Join every fragment: returning from inside the loop handed back only the
    # first fragment (as a Seq object, not the documented string) and None
    # when there was no fragment at all.
    return ''.join(str(pp.get_sequence())
                   for pp in PPBuilder().build_peptides(chain))
def _pp(self, pdb_path, chain_id):
    """Return chain ``chain_id`` of the first model in ``pdb_path`` as one polypeptide.

    The structure id is the file name without its suffix.  All fragments the
    builder finds are appended in place to the first one, which is returned.
    Raises IndexError when the chain yields no polypeptide.
    """
    pdb_id = Path(pdb_path).stem
    structure = PDBParser().get_structure(pdb_id, pdb_path)
    fragments = PPBuilder().build_peptides(structure[0][chain_id])
    merged = fragments[0]
    for fragment in fragments[1:]:
        merged += fragment
    return merged
def test_polypeptide(self):
    """Tests on polypetide class and methods."""
    sequences = [
        "DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW",
        "TETLLVQNANPDCKTILKALGPGATLEE",
        "TACQG",
    ]
    # One row per polypeptide, radians:
    # (psi[0], phi[1], psi[1], phi[2], psi[2]); phi[0] is always None.
    torsions = [
        (-0.46297171497725553, -1.0873937604007962, 2.1337707832637109,
         -2.4052232743651878, 2.3807316946081554),
        (-0.6810077089092923, -1.2654003477656888, -0.58689987042756309,
         -1.7467679151684763, -1.5655066256698336),
        (-0.73222884210889716, -1.1044740234566259, -0.69681334592782884,
         -1.8497413300164958, 0.34762889834809058),
    ]
    ca_serials = [10, 18, 26, 37, 46, 50, 57, 66, 75, 82, 93, 104, 112, 124,
                  131, 139, 150, 161, 173, 182, 189, 197, 208, 213, 222, 231,
                  236, 242, 251, 260, 267, 276, 284]

    parser = PDBParser(PERMISSIVE=True)
    structure = parser.get_structure("scr", "PDB/1A8O.pdb")

    # Polypeptides from C-N distances.
    peptides = PPBuilder().build_peptides(structure)
    for position, sequence in enumerate(sequences):
        self.assertEqual(str(peptides[position].get_sequence()), sequence)
    for position, (psi0, phi1, psi1, phi2, psi2) in enumerate(torsions):
        angles = peptides[position].get_phi_psi_list()
        self.assertEqual(angles[0][0], None)
        self.assertAlmostEqual(angles[0][1], psi0, places=3)
        self.assertAlmostEqual(angles[1][0], phi1, places=3)
        self.assertAlmostEqual(angles[1][1], psi1, places=3)
        self.assertAlmostEqual(angles[2][0], phi2, places=3)
        self.assertAlmostEqual(angles[2][1], psi2, places=3)

    # Polypeptides from CA-CA distances.
    peptides = CaPPBuilder().build_peptides(structure)
    for position, sequence in enumerate(sequences):
        self.assertEqual(str(peptides[position].get_sequence()), sequence)
    self.assertEqual(
        [ca.serial_number for ca in peptides[0].get_ca_list()], ca_serials)

    taus = peptides[1].get_tau_list()
    self.assertAlmostEqual(taus[0], 0.3597907225123525, places=3)
    self.assertAlmostEqual(taus[1], 0.43239284636769254, places=3)
    self.assertAlmostEqual(taus[2], 0.99820157492712114, places=3)

    thetas = peptides[2].get_theta_list()
    self.assertAlmostEqual(thetas[0], 1.6610069445335354, places=3)
    self.assertAlmostEqual(thetas[1], 1.7491703334817772, places=3)
    self.assertAlmostEqual(thetas[2], 2.0702447422720143, places=3)
def test_ppbuilder_real_nonstd(self):
    """Test PPBuilder on real PDB file allowing non-standard amino acids."""
    peptides = PPBuilder().build_peptides(self.structure, False)
    self.assertEqual(len(peptides), 1)
    peptide = peptides[0]

    # Check the start and end positions
    first_number = peptide[0].get_id()[1]
    self.assertEqual(first_number, 151)
    last_number = peptide[-1].get_id()[1]
    self.assertEqual(last_number, 220)

    # Check the sequence
    sequence = peptide.get_sequence()
    self.assertIsInstance(sequence, Seq)
    # Here non-standard MSE are shown as M
    self.assertEqual(
        "MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQG",
        sequence)
def getSeqLocation(self, seq):
    """Locate ``seq`` in the polypeptides of the first model of the structure.

    :param seq: sequence string to search for.
    :return: ``(beg, end, chain)`` for the first polypeptide containing
        ``seq``: ``beg`` and ``end`` are the 0-based, inclusive offsets of the
        match within that polypeptide's sequence and ``chain`` is the id of
        the chain its first residue belongs to.  ``(None, None, None)`` is
        returned, after reporting through ``self.printerr``, when no
        polypeptide contains ``seq``.
    """
    ppb = PPBuilder()
    for pp in ppb.build_peptides(self.__struct[0]):
        ind = str(pp.get_sequence()).find(seq)
        if ind != -1:
            # Return as soon as a match is found.  The old ``beg == end == 0``
            # sentinel wrongly reported "not found" for a one-residue match
            # at offset 0.
            return ind, ind + len(seq) - 1, pp[0].get_parent().get_id()

    line = '\n' + seq + ' not found in ' + str(self.__struct.get_id()) + '!\n'
    self.printerr(line)
    return None, None, None
def get_sequence(pdb, chain):
    """Write the sequence of one chain to a FASTA file and save the chain as PDB.

    :param pdb: path of the PDB file; also used as the structure id.
    :param chain: id of the chain to export from the first model.

    With ``output = pdb[-8:-4] + "_" + chain + ".pdb"``, the sequences of all
    polypeptides of the chain are concatenated and written to
    ``output[:-4] + chain + ".fasta.txt"`` under the header
    ``">" + pdb[:-4] + chain``; the structure is then saved to ``output``
    filtered through ``SelectChains(chain)``.
    """
    # NOTE(review): PERMISSIVE=0 selects the parser's strict mode according to
    # Biopython's documentation; the earlier comment here suggested the
    # opposite -- confirm which behaviour is wanted.
    pdb_parser = PDBParser(PERMISSIVE=0)
    pdb_structure = pdb_parser.get_structure(pdb, pdb)
    pdb_chain = pdb_structure[0][chain]

    ppb = PPBuilder()
    sequence = "".join(str(pp.get_sequence())
                       for pp in ppb.build_peptides(pdb_chain))

    io = PDBIO()
    io.set_structure(pdb_structure)
    output = pdb[-8:-4] + "_" + chain + ".pdb"

    # The context manager closes the FASTA file even if a write fails.
    with open(output[:-4] + chain + ".fasta.txt", "w") as out:
        out.write(">" + pdb[:-4] + chain + "\n")
        out.write(sequence + "\n")

    io.save(output, SelectChains(chain))
def get_pp(pdb, chain, start, length, seq):
    """retrieve the residiues for a given pdb file and chain as polypeptides

    :param pdb: structure id; ``make_filename(pdb)`` gives the file to parse.
    :param chain: id of the chain to read from the first model.
    :param start: fallback offset, used only when ``seq`` is not found in any
        polypeptide (the last polypeptide of the chain is used in that case).
    :param length: number of residues of the region of interest.
    :param seq: sequence to look for; the first polypeptide containing it is
        selected and ``start`` becomes the offset of the match.
    :return: the residues ``[start - 1 : start + length + 2]`` of the selected
        polypeptide.
    :raises RuntimeError: when the chain has no polypeptide, or when the
        window does not fit (``start`` must be > 0 and
        ``start + length + 2`` at most the polypeptide length).
    """
    f = make_filename(pdb)
    p = PDBParser(PERMISSIVE=1)
    pdb_struct = p.get_structure(pdb, f)  # Load the pdb structure contained in file f.
    pdb_chain = pdb_struct[0][chain]  # Select the right chain of the structure.
    peptides = PPBuilder().build_peptides(pdb_chain)  # Load the chain as peptides.

    if not peptides:
        # Previously this surfaced as a NameError on the unbound loop variable.
        raise RuntimeError(
            "no polypeptide found in chain %s of %s" % (chain, pdb))

    # When nothing matches, fall back to the last peptide and the caller's
    # ``start`` -- this is what the loop variable used to be left at.
    pep = peptides[-1]
    for candidate in peptides:
        offset = str(candidate.get_sequence()).find(seq)
        if offset != -1:
            pep, start = candidate, offset
            break

    if start > 0 and (start + length + 2) <= len(pep):
        return pep[(start - 1):(start + length + 2)]

    # An explicit RuntimeError replaces the bare ``raise``, which had no
    # active exception to re-raise (Python 3 reports that as a RuntimeError
    # too, so callers see the same exception type).
    raise RuntimeError(
        "residue window at offset %s with length %s does not fit in chain %s of %s"
        % (start, length, chain, pdb))
def test_parser(self):
    """Extract polypeptides from 1A80."""
    def check_fragment(fragment, first_number, last_number, expected_sequence):
        # Termini first, then type, alphabet and content of the sequence.
        self.assertEqual(fragment[0].get_id()[1], first_number)
        self.assertEqual(fragment[-1].get_id()[1], last_number)
        sequence = fragment.get_sequence()
        self.assertTrue(isinstance(sequence, Seq))
        self.assertEqual(sequence.alphabet, generic_protein)
        self.assertEqual(expected_sequence, str(sequence))

    # Strict mode ignores MSE 151 at the start and breaks the chain at
    # MSE 185 and MSE 214,215.
    strict_fragments = (
        (152, 184, "DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW"),
        (186, 213, "TETLLVQNANPDCKTILKALGPGATLEE"),
        (216, 220, "TACQG"),
    )

    parser = MMCIFParser()
    structure = parser.get_structure("example", "PDB/1A8O.cif")
    self.assertEqual(len(structure), 1)

    for ppbuild in [PPBuilder(), CaPPBuilder()]:
        # Check that serial_num (model column) is stored properly
        self.assertEqual(structure[0].serial_num, 1)

        # First try allowing non-standard amino acids;
        # here non-standard MSE are shown as M
        polypeptides = ppbuild.build_peptides(structure[0], False)
        self.assertEqual(len(polypeptides), 1)
        check_fragment(
            polypeptides[0], 151, 220,
            "MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQ"
            "NANPDCKTILKALGPGATLEEMMTACQG")

        # Now try strict version with only standard amino acids
        polypeptides = ppbuild.build_peptides(structure[0], True)
        self.assertEqual(len(polypeptides), 3)
        for index, (first_number, last_number, expected) in enumerate(strict_fragments):
            check_fragment(polypeptides[index], first_number, last_number, expected)
def getRegionsResidues(self):
    """Fill ``self.__regions_res`` with the residues of each region.

    For every key of ``self.__regions_res`` the region sequence is the first
    ``'tcr_region_seq'`` value of ``self.__regions.get_group(key)``.  The
    first polypeptide of the structure's first model containing that sequence
    supplies the residues stored under the key.

    :return: 1 when every region was located; 0 as soon as one region
        sequence is not found (reported through ``self.printerr``).
    """
    ppb = PPBuilder()
    bltpep = ppb.build_peptides(self.__struct[0])

    for key in self.__regions_res:
        # Looked up once per region, before the peptide loop: the error
        # message below needs it even when there are no peptides to scan
        # (it used to be bound only inside the loop, giving a NameError).
        reg_seq = list(self.__regions.get_group(key)['tcr_region_seq'])[0]

        res = []
        for pp in bltpep:
            ind = str(pp.get_sequence()).find(reg_seq)
            if ind != -1:
                res = [pp[i] for i in range(ind, ind + len(reg_seq))]
                self.__regions_res[key] = res
                break

        if not res:
            line = '\n' + reg_seq + ' not found in ' + self.__name + '!\n'
            self.printerr('getRegionResidues(): ' + line)
            return 0
    return 1
def testModels(self):
    """Test file with multiple models."""
    expected_sequence = "MKPVTLYDVAEYAGVSYQTVSRVVNQASHVSAKTREKVEAAMAELNYIPNR"

    parser = MMCIFParser(QUIET=1)
    f_parser = FastMMCIFParser(QUIET=1)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", PDBConstructionWarning)
        structure = parser.get_structure("example", "PDB/1LCD.cif")
        f_structure = f_parser.get_structure("example", "PDB/1LCD.cif")

    self.assertEqual(len(structure), 3)
    self.assertEqual(len(f_structure), 3)

    for ppbuild in [PPBuilder(), CaPPBuilder()]:
        # Check that serial_num (model column) is stored properly
        for model_index in range(3):
            self.assertEqual(structure[model_index].serial_num, model_index + 1)

        # First allowing non-standard amino acids (False), then the strict
        # version with only standard amino acids (True); both give the same
        # single polypeptide.
        for aa_only in (False, True):
            polypeptides = ppbuild.build_peptides(structure[0], aa_only)
            self.assertEqual(len(polypeptides), 1)
            peptide = polypeptides[0]
            # Check the start and end positions
            self.assertEqual(peptide[0].get_id()[1], 1)
            self.assertEqual(peptide[-1].get_id()[1], 51)
            # Check the sequence
            sequence = peptide.get_sequence()
            self.assertIsInstance(sequence, Seq)
            self.assertEqual(sequence.alphabet, generic_protein)
            self.assertEqual(expected_sequence, str(sequence))

    # This structure contains several models with multiple lengths.
    # The tests were failing.
    structure = parser.get_structure("example", "PDB/2OFG.cif")
    self.assertEqual(len(structure), 3)
def get_sequence(pdb, chain):
    """Write the sequence of one chain of ``pdb`` to ``<pdb minus 4 chars>.fasta.atom``.

    :param pdb: path of the PDB file; also used as the structure id.  Its last
        four characters are dropped to build the header and file name.
    :param chain: id of the chain in the first model; ``"%"`` stands for the
        blank chain id ``" "``.

    Polypeptides are built with ``aa_only=False`` and their sequences are
    concatenated.  Warnings matching ``.*discontinuous at.*`` are set to
    always be shown.
    """
    # Compare by value: ``chain is "%"`` tested object identity, which only
    # works when both strings happen to be interned.
    if chain == "%":
        chain = " "
    warnings.filterwarnings('always', message='.*discontinuous at.*')

    # NOTE(review): PERMISSIVE=0 selects the parser's strict mode according to
    # Biopython's documentation; the earlier comment here suggested the
    # opposite -- confirm which behaviour is wanted.
    pdb_parser = PDBParser(PERMISSIVE=0, QUIET=True)
    pdb_structure = pdb_parser.get_structure(pdb, pdb)
    pdb_chain = pdb_structure[0][chain]

    ppb = PPBuilder()
    sequence = "".join(
        str(pp.get_sequence())
        for pp in ppb.build_peptides(pdb_chain, aa_only=False))

    # The PDBIO writer formerly created here was never used to save anything
    # and has been dropped.
    stem = pdb[0:-4]
    with open(stem + ".fasta.atom", "w") as out:
        out.write(">" + stem + "\n")
        out.write(sequence + "\n")
def get_sequence(pdb, chain, first, last, output):
    """Save part of the structure in ``pdb`` to ``output``.

    :param pdb: path of the PDB file; also used as the structure id.
    :param chain: chain id, looked up in the first model and passed to
        ``SelectDomain``.
    :param first: passed through to ``SelectDomain``.
    :param last: passed through to ``SelectDomain``.
    :param output: path the filtered structure is saved to.

    The chain's polypeptide sequences are still concatenated but are not
    written anywhere by this function.
    """
    structure = PDBParser(PERMISSIVE=0).get_structure(pdb, pdb)
    selected_chain = structure[0][chain]

    builder = PPBuilder()
    Sequence = ""
    for fragment in builder.build_peptides(selected_chain):
        Sequence = Sequence + fragment.get_sequence()

    writer = PDBIO()
    writer.set_structure(structure)
    writer.save(output, SelectDomain(chain, first, last))
def test_ppbuilder_real(self):
    """Test PPBuilder on real PDB file."""
    # (first residue number, last residue number, sequence) per polypeptide.
    expected = (
        (152, 184, "DIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNW"),
        (186, 213, "TETLLVQNANPDCKTILKALGPGATLEE"),
        (216, 220, "TACQG"),
    )
    peptides = PPBuilder().build_peptides(self.structure)
    self.assertEqual(len(peptides), 3)

    # Check termini
    for index, (first_number, last_number, _sequence) in enumerate(expected):
        self.assertEqual(peptides[index][0].get_id()[1], first_number)
        self.assertEqual(peptides[index][-1].get_id()[1], last_number)

    # Now check sequences
    observed = [peptides[index].get_sequence() for index in range(3)]
    self.assertIsInstance(observed[0], Seq)
    for sequence, (_first, _last, expected_sequence) in zip(observed, expected):
        self.assertEqual(sequence, expected_sequence)
def get_sequence(pdb, chain):
    """Renumber one chain so it starts at residue 1 and save it as a PDB file.

    :param pdb: path of the PDB file; also used as the structure id.
    :param chain: id of the chain in the first model.

    When the first residue number of the chain is not 1, every residue id of
    the chain becomes ``(' ', number - start + 1, ' ')``.  The structure is
    then saved to ``"renumbered_" + pdb`` filtered through
    ``SelectChains(chain)``.  Raises IndexError when the chain has no residue.

    The chain sequence that used to be assembled here was only consumed by
    commented-out code, so that dead computation has been removed.
    """
    # NOTE(review): PERMISSIVE=0 selects the parser's strict mode according to
    # Biopython's documentation; the earlier comment here suggested the
    # opposite -- confirm which behaviour is wanted.
    pdb_parser = PDBParser(PERMISSIVE=0)
    pdb_structure = pdb_parser.get_structure(pdb, pdb)
    pdb_chain = pdb_structure[0][chain]

    start = [residue.id[1] for residue in pdb_chain][0]
    # Compare by value: ``start is not 1`` tested object identity, which is an
    # implementation detail for ints.
    if start != 1:
        # NOTE(review): hetero flags and insertion codes are blanked here, and
        # shifting upwards (start < 1) may collide with ids that still exist
        # in the chain -- confirm the inputs never need that.
        for residue in pdb_chain:
            residue.id = (' ', residue.id[1] - start + 1, ' ')

    io = PDBIO()
    io.set_structure(pdb_structure)
    output = "renumbered_" + pdb
    io.save(output, SelectChains(chain))
def get_ignored_res(file: str):
    """Collect the phi/psi angles of every residue in the PDB file ``file``.

    :param file: path of the PDB file to parse.
    :return: ``(output, ignored, x, y)`` where

        * ``output`` maps ``"<chain id>:<resname><residue number>"`` to the
          ``(phi, psi)`` tuple of that residue (later models and chains
          overwrite earlier entries with the same key),
        * ``ignored`` lists the ``(key, (phi, psi))`` pairs for which at
          least one angle is ``None``,
        * ``x`` and ``y`` hold phi and psi, multiplied by ``180 / pi``, of the
          remaining residues.
    """
    x, y, ignored, output = [], [], [], {}
    for model in PDBParser().get_structure(id=None, file=file):
        for chain in model:
            for peptide in PPBuilder().build_peptides(chain):
                for aa, angles in zip(peptide, peptide.get_phi_psi_list()):
                    residue = chain.id + ":" + aa.resname + str(aa.id[1])
                    output[residue] = angles

    for key, value in output.items():
        phi, psi = value
        # Only get residues with both phi and psi angles.  Test against None
        # explicitly: a truthiness test also dropped an angle of exactly 0.0.
        if phi is not None and psi is not None:
            x.append(phi * 180 / pi)
            y.append(psi * 180 / pi)
        else:
            ignored.append((key, value))
    return output, ignored, x, y
def split_pdb_by_chain(pdb_id):
    """Write every chain of a PDB entry to its own file and return the sequences.

    The entry is read from ``ent_files/pdb<lowercase id>.ent``.  Each chain of
    each model is saved as
    ``pdb_chains/<ID>/<ID>-<model id + 1>_<chain id>.pdb`` unless that file
    already exists; the ``pdb_chains/<ID>`` directory is created when missing.

    :param pdb_id: PDB identifier (case-insensitive).
    :return: dict mapping each chain file name (without directory) to the
        concatenated sequence of that chain's polypeptides.
    """
    entry = pdb_id.upper()
    chain_dir = "pdb_chains/" + entry
    if not os.path.isdir(chain_dir):
        os.mkdir(chain_dir)

    structure = PDBParser().get_structure(
        pdb_id, "ent_files/pdb" + pdb_id.lower() + ".ent")

    return_dict = dict()
    for model in structure:
        for chain in model:
            outfilename = entry + "-" + str(
                model.get_id() + 1) + "_" + str(chain.get_id()) + ".pdb"
            target = chain_dir + "/" + outfilename
            if not os.path.isfile(target):
                writer = PDBIO()
                writer.set_structure(chain)
                writer.save(target)

            chain_sequence = Seq("", generic_protein)
            for fragment in PPBuilder().build_peptides(chain):
                chain_sequence += fragment.get_sequence()
            return_dict[outfilename] = chain_sequence
    return return_dict
def testModels(self):
    """Test file with multiple models"""
    structure = MMCIFParser().get_structure("example", "PDB/1LCD.cif")
    self.assertEqual(len(structure), 3)
    expected_sequence = "MKPVTLYDVAEYAGVSYQTVSRVVNQASHVSAKTREKVEAAMAELNYIPNR"
    for builder in (PPBuilder(), CaPPBuilder()):
        # The serial_num (model column) of each model must be stored properly.
        for index, serial in enumerate((1, 2, 3)):
            self.assertEqual(structure[index].serial_num, serial)
        # First pass allows non-standard amino acids (MSE would be shown
        # as M); the second pass is the strict, standard-only version.
        for aa_only in (False, True):
            peptides = builder.build_peptides(structure[0], aa_only)
            self.assertEqual(len(peptides), 1)
            peptide = peptides[0]
            # Start and end positions
            self.assertEqual(peptide[0].get_id()[1], 1)
            self.assertEqual(peptide[-1].get_id()[1], 51)
            # Sequence type, alphabet and content
            sequence = peptide.get_sequence()
            self.assertTrue(isinstance(sequence, Seq))
            self.assertEqual(sequence.alphabet, generic_protein)
            self.assertEqual(expected_sequence, str(sequence))
def create_rotamers(self, structure, pdb_path):
    """Create Residue and Rotamer records for one structure.

    The PDB file at ``pdb_path`` is parsed to obtain the sequence of the
    preferred chain, which is locally aligned against the sequence of the
    parent (wild-type) protein.  The ATOM records of
    ``structure.pdb_data.pdb`` are then grouped per residue; for every
    residue numbered below 2000 a ``Residue`` is saved and linked, through
    ``Rotamer``, to a ``PdbData`` holding that residue's ATOM lines.

    Args:
        structure: Structure record; this method reads its
            ``preferred_chain``, ``protein_conformation`` and
            ``pdb_data.pdb`` attributes.
        pdb_path: Path handed to ``PDBParser.get_structure``.

    Returns:
        None.
    """
    wt_lookup = {} #used to match WT seq_number to WT residue record
    pdbseq = {} #used to keep track of pdbseq residue positions vs index in seq
    ref_positions = {} #WT positions in alignment
    mapped_seq = {} # index in construct, tuple of AA and WT [position,AA]
    preferred_chain = structure.preferred_chain

    if len(preferred_chain.split(','))>1: #if A,B -- keep only the first chain listed
        preferred_chain = preferred_chain.split(',')[0]

    # three-letter to one-letter amino acid codes (20 standard residues only)
    AA = {'ALA':'A', 'ARG':'R', 'ASN':'N', 'ASP':'D',
          'CYS':'C', 'GLN':'Q', 'GLU':'E', 'GLY':'G',
          'HIS':'H', 'ILE':'I', 'LEU':'L', 'LYS':'K',
          'MET':'M', 'PHE':'F', 'PRO':'P', 'SER':'S',
          'THR':'T', 'TRP':'W', 'TYR':'Y', 'VAL':'V'}

    # only the first model of the parsed file is used
    s = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', pdb_path)[0]
    chain = s[preferred_chain] #select only one chain (avoid n-mer receptors)
    ppb=PPBuilder()
    seq = ''
    i = 1
    check_1000 = 0
    for pp in ppb.build_peptides(chain): #remove >1000 pos (fusion protein / gprotein)
        for res in pp:
            id = res.id
            if id[1]<600:
                check_1000 += 1
                #need check_1000 to catch structures where they lie in 1000s (4LDE, 4LDL, 4LDO, 4N4W, 4QKX)
            # residues numbered above 1000 are only detached once more than
            # 200 residues numbered below 600 have been counted
            if id[1]>1000 and check_1000>200:
                chain.detach_child(id)

    # rebuild the peptides after the detachment above; `i` is the 1-based
    # running index of each residue within the concatenated sequence
    for pp in ppb.build_peptides(chain):
        seq += str(pp.get_sequence()) #get seq from fasta (only chain A)
        for residue in pp:
            residue_id = residue.get_full_id()
            chain = residue_id[2] # NOTE: `chain` is rebound from the chain object to its id here
            if chain not in pdbseq:
                pdbseq[chain] = {}
            pos = residue_id[3][1]
            pdbseq[chain][pos] = [i,AA[residue.resname]]
            i += 1

    parent_seq = str(structure.protein_conformation.protein.parent.sequence)
    rs = Residue.objects.filter(protein_conformation__protein=structure.protein_conformation.protein.parent).prefetch_related('display_generic_number','generic_number','protein_segment')

    for r in rs: #required to match WT position to a record (for duplication of GN values)
        wt_lookup[r.sequence_number] = r

    #align WT with structure seq -- make gaps penalties big, so to avoid too much overfitting
    pw2 = pairwise2.align.localms(parent_seq, seq, 2, -4, -4, -.1)

    # only the first alignment (pw2[0]) is used: pw2[0][0] is the aligned
    # WT sequence and pw2[0][1] the aligned structure sequence
    gaps = 0
    unmapped_ref = {}
    for i, r in enumerate(pw2[0][0], 1): #loop over alignment to create lookups (track pos)
        #print(i,r,pw2[0][1][i-1]) #print alignment for sanity check
        if r == "-":
            gaps += 1
        if r != "-":
            ref_positions[i] = [i-gaps,r]
        elif r == "-":
            ref_positions[i] = [None,'-']

        # WT positions aligned against a gap in the structure sequence
        if pw2[0][1][i-1]=='-':
            unmapped_ref[i-gaps] = '-'

    gaps = 0
    for i, r in enumerate(pw2[0][1], 1): #make second lookup
        if r == "-":
            gaps += 1
        if r != "-":
            mapped_seq[i-gaps] = [r,ref_positions[i]]

    pdb = structure.pdb_data.pdb
    protein_conformation=structure.protein_conformation
    temp = '' # ATOM lines accumulated for the residue being read
    check = 0 # int 0 until the first ATOM line is processed, afterwards a residue-number string
    errors = 0 # NOTE(review): not read again within this function
    mismatch_seq = 0
    match_seq = 0
    not_matched = 0
    matched_by_pos = 0
    aa_mismatch = 0

    pdblines_temp = pdb.splitlines()
    pdblines = []
    for line in pdblines_temp: #Get rid of all odd records
        if line.startswith('ATOM'):
            pdblines.append(line)
    pdblines.append('') #add a line to not "run out"
    for i,line in enumerate(pdblines):
        if line.startswith('ATOM'):
            chain = line[21]
            if preferred_chain and chain!=preferred_chain: #If preferred is defined and is not the same as the current line, then skip
                pass
            else:
                nextline = pdblines[i+1]
                residue_number = line[22:26].strip() # NOTE(review): not read again within this function
                if (check==0 or nextline[22:26].strip()==check) and nextline.startswith('TER')==False and nextline.startswith('ATOM')==True: #If this is either the beginning or the same as previous line add to current rotamer
                    temp += line + "\n"
                    #print('same res',pdb.splitlines()[i+1])
                else: #if this is a new residue
                    #print(pdb.splitlines()[i+1][22:26].strip(),check)
                    temp += line + "\n"
                    # NOTE(review): if the very first ATOM line reaches this
                    # branch, `check` is still the int 0 and `residue_name`
                    # is unassigned -- confirm that cannot happen.
                    if int(check.strip())<2000:
                        residue = Residue()
                        residue.sequence_number = int(check.strip())
                        # residue_name was stored from the previous ATOM line
                        # at the end of the last iteration
                        residue.amino_acid = AA[residue_name.upper()]
                        residue.protein_conformation = protein_conformation
                        #print(residue.sequence_number,residue.amino_acid) #sanity check
                        try:
                            seq_num_pos = pdbseq[chain][residue.sequence_number][0]
                        except:
                            # residue not present in the parsed chain: drop
                            # the collected lines and move on
                            #print('failed residue',pdb_path,residue.sequence_number)
                            temp = "" #start new line for rotamer
                            check = pdblines[i+1][22:26].strip()
                            continue
                        if seq_num_pos in mapped_seq:
                            if mapped_seq[seq_num_pos][1][0]==None:
                                # aligned against a gap in the WT sequence
                                #print('no match found') #sanity check
                                #print(residue.sequence_number,residue.amino_acid) #sanity check
                                residue.display_generic_number = None
                                residue.generic_number = None
                                residue.protein_segment = None
                                not_matched +=1
                            else:
                                wt_r = wt_lookup[mapped_seq[seq_num_pos][1][0]]
                                if residue.sequence_number!=wt_r.sequence_number and residue.amino_acid!=wt_r.amino_acid and residue.sequence_number in wt_lookup: #if pos numbers not work -- see if the pos number might be in WT and unmapped
                                    if wt_lookup[residue.sequence_number].amino_acid==residue.amino_acid:
                                        if residue.sequence_number in unmapped_ref: #WT was not mapped, so could be it
                                            # print(residue.sequence_number,residue.amino_acid) #sanity check
                                            #print('wrongly matched, better match on pos+aa',residue.sequence_number,residue.amino_acid,wt_r.sequence_number,wt_r.amino_acid)
                                            wt_r = wt_lookup[residue.sequence_number]
                                            matched_by_pos +=1
                                            match_seq += 1
                                        else:
                                            mismatch_seq += 1
                                            #print('could have been matched, but already aligned to another position',residue.sequence_number,residue.amino_acid,wt_r.sequence_number,wt_r.amino_acid)
                                    else:
                                        #print('WT pos not same AA, mismatch',residue.sequence_number,residue.amino_acid,wt_r.sequence_number,wt_r.amino_acid)
                                        mismatch_seq += 1
                                elif residue.sequence_number!=wt_r.sequence_number:
                                    #print('WT pos not same pos, mismatch',residue.sequence_number,residue.amino_acid,wt_r.sequence_number,wt_r.amino_acid)
                                    mismatch_seq += 1
                                elif residue.amino_acid!=wt_r.amino_acid:
                                    #print('aa mismatch',residue.sequence_number,residue.amino_acid,wt_r.sequence_number,wt_r.amino_acid)
                                    aa_mismatch += 1
                                else:
                                    match_seq += 1
                                # copy the annotation of the chosen WT residue
                                if wt_r.generic_number is not None:
                                    residue.display_generic_number = wt_r.display_generic_number
                                    residue.generic_number = wt_r.generic_number
                                else:
                                    residue.display_generic_number = None
                                    residue.generic_number = None
                                    #print('no GN')
                                residue.protein_segment = wt_r.protein_segment
                        else:
                            #print('wierd error') #sanity check
                            residue.display_generic_number = None
                            residue.generic_number = None
                            residue.protein_segment = None
                        #print('inserted',residue.sequence_number) #sanity check
                        residue.save()
                        # get_or_create reuses an existing record with the same values
                        rotamer_data, created = PdbData.objects.get_or_create(pdb=temp)
                        rotamer, created = Rotamer.objects.get_or_create(residue=residue, structure=structure, pdbdata=rotamer_data)
                    temp = "" #start new line for rotamer
                    check = pdblines[i+1][22:26].strip()
                # remember the residue number of the next line
                check = pdblines[i+1][22:26].strip()
            chain = line[21]
            residue_name = line[17:20].title() #use title to get GLY to Gly so it matches
    #print(structure.pdb_code.index,'length',len(seq),len(mapped_seq),'mapped res',str(mismatch_seq+match_seq+aa_mismatch),'pos mismatch',mismatch_seq,'aa mismatch',aa_mismatch,'not mapped',not_matched,' mapping off, matched on pos,aa',matched_by_pos)
    return None
from Bio.PDB import PDBParser
from Bio.PDB import PPBuilder
from Bio.PDB import Polypeptide

# Demo script: parse one PDB entry, build its polypeptides and print the
# structure id, the first polypeptide, a slice of it and the name of its
# second residue.  print() is called with a single argument so the output
# is the same on Python 2 and Python 3.
item = '2bnr'
structure = PDBParser().get_structure(item, '../pdbs/'+item+'.pdb')
ppb=PPBuilder()
peps = ppb.build_peptides(structure)
print(structure.get_id())
print(peps[0])
#print(peps[0][1:-3])
print(peps[0][3:9])
p = peps[0][3:9]
print(peps[0][1].get_resname())
# n += 1
# list[n].append(atom)
# previous = atom
# return list

if __name__ == "__main__":
    # Locate the data directory relative to this script.  os.path.join
    # inserts the separator that dirname() strips, so the path is valid
    # even when the script is launched as "some/dir/script.py"; the
    # trailing '' keeps the trailing separator the original string had.
    current_path = os.path.dirname(sys.argv[0])
    pdb_path = os.path.join(current_path, '..', 'pdb', '')
    pdb_id = '2vb1'
    structure = get_structure(pdb_id, pdb_path)
    model = structure[0]
    ppb = PPBuilder()
    pp_list = ppb.build_peptides(model)
    # orient
    orient(pp_list)
    # first split stage
    fs = first_split(pp_list)
    for seg in fs:
        pp = Polypeptide.Polypeptide(seg)
        # single-argument print() works on both Python 2 and Python 3
        print(pp.get_sequence())