def chain2pos_scan_str(chain, pdb, mutation_set='a'):
    """
    Build the PositionScan line for FoldX for one chain of a structure.

    Takes a chain ID and a model.PDBFile object, returns a string suitable
    as the PositionScan line for FoldX: comma-separated entries of the form
    ``<one-letter aa><chain><residue number><mutation_set>``.

    Only the first model of the parsed structure is scanned. Residues whose
    name ``three_to_one`` cannot translate (non-native amino acids, water)
    are skipped. An empty string is returned when nothing matches.
    """
    parser = PDBParser(PERMISSIVE=1)
    struct = parser.get_structure(pdb.uuid, pdb.fullpath())[0]

    entries = []
    for c in Selection.unfold_entities(struct, 'C'):
        if c.id != chain:
            continue
        for r in c:
            try:
                aa = three_to_one(r.get_resname())
            except KeyError:
                # non-native amino acid or water
                continue
            entries.append('%s%s%i%s' % (aa, chain, r.id[1], mutation_set))

    # join() avoids the trailing comma the old code had to slice off.
    return ','.join(entries)
def create_seqrecord(sequence, name):
    """
    Build a protein SeqRecord from a list of residues.

    Each item of ``sequence`` is either ``None`` (rendered as a gap ``-``) or
    an object with ``get_resname()``; residue names that ``three_to_one``
    does not know are rendered as ``?``. ``name`` becomes the record id.
    """
    symbols = []
    for aa in sequence:
        if aa is None:
            symbols.append("-")
            continue
        try:
            symbols.append(three_to_one(aa.get_resname()))
        except KeyError:
            # Unknown residue name: keep the position, mark it as unknown.
            symbols.append("?")
    return SeqRecord(Seq("".join(symbols), generic_protein), id=name)
def getResidueStrings(structure):
    """
    Return one one-letter sequence string per chain of every model.

    Standard amino acids are translated with ``three_to_one``; the
    histidine variants HIE/HID map to ``H``, the cysteine variants CYX/CYM
    map to ``C`` and every other residue maps to ``X``.

    The residues are taken from the chain being processed (the previous
    code walked all residues of the model for every chain, so each chain
    received the whole model's sequence).
    """
    seqs = []
    for model in structure:
        for ch in model.get_chains():
            letters = []
            for residue in ch.get_residues():
                resname = residue.get_resname()
                if is_aa(resname, standard=True):
                    letters.append(three_to_one(resname))
                elif resname in {'HIE', 'HID'}:
                    letters.append('H')
                elif resname in {'CYX', 'CYM'}:
                    letters.append('C')
                else:
                    letters.append('X')
            seqs.append(''.join(letters))
    return seqs
def print_martini_dihedrals(dihedrals, atoms, params):
    """
    Print one dihedral line per parameter set of every dihedral.

    dihedrals: iterable of 4-item index sequences; the four ids are printed
        and are also used as keys into ``atoms``.
    atoms: the id:resname dict created by read_atoms
    params should be in the form:
        params = {
            # i.e. phase angle, force constant, multiplicity
            'GVPG': [[phi1, k1, n1], [ph2, k2, n2], [phi3, k3, n3] ... ],
            'VPGV': [[phi1, k1, n1], [ph2, k2, n2], [phi3, k3, n3] ... ],
            ....
        }
    The key is the four one-letter residue codes of the dihedral's atoms.
    Since MARTINI uses proper dih. function type 1, the function-type
    column is always written as 1.
    """
    row = "{0:5d}{1:6d}{2:6d}{3:6d}{4:6d}{5:12.6f}{6:12.6f}{7:6d}"
    for d in dihedrals:
        dih_name = ''.join([three_to_one(atoms[di]) for di in d])
        for (phii, ki, ni) in params[dih_name]:
            # Single-argument print(...) behaves the same under Python 2
            # (parenthesised expression) and Python 3 (function call).
            print(row.format(d[0], d[1], d[2], d[3], 1, phii, ki, ni))
print("parsing PDB") PDB_list = glob.glob("../../../../PDBMining/*/*.ent") p = PDBParser() secondaryStruct = [] Valid = [False for _ in proteins] PDBNames = [] for f in PDB_list: name = os.path.splitext(basename(f))[0] PDBNames.append(name) struct = p.get_structure(name,f) res_list = Selection.unfold_entities(struct, 'R') try: seq = [three_to_one(a.get_resname()).lower() for a in res_list] except (KeyError): seq = [] try: if seq == [a for a in proteins[nameInd[name]]]: Valid[nameInd[name]] = True except KeyError: pass struct_dssp = p.get_structure(name,f) try: dssp = DSSP(struct_dssp[0], f) except Exception: Valid[nameInd[name]] = False a_keys = list(dssp.keys()) sec = [dssp[a][2] for a in a_keys] try:
def __init__(self, model, in_file, dssp="dssp", acc_array="Sander", file_type='PDB'):
    """Create a DSSP object.

    Runs (or parses the output of) DSSP, annotates every matched residue of
    ``model`` through ``res.xtra`` and hands the collected per-residue
    tuples to ``AbstractResiduePropertyMap.__init__``.

    Parameters
    ----------
    model : Model
        The first model of the structure
    in_file : string
        Either a PDB file or a DSSP file.
    dssp : string
        The dssp executable (ie. the argument to os.system)
    acc_array : string
        Accessible surface area (ASA) from either Miller et al. (1987),
        Sander & Rost (1994), or Wilke: Tien et al. 2013, as string
        Sander/Wilke/Miller. Defaults to Sander.
    file_type: string
        File type switch, either PDB or DSSP with PDB as default.

    Raises
    ------
    KeyError
        If a DSSP residue key cannot be matched to any residue of the chain.
    PDBException
        If the residue type reported by DSSP disagrees with the structure.
    OSError
        Re-raised when the first DSSP call fails and ``dssp`` is neither
        'dssp' nor 'mkdssp'.
    """
    self.residue_max_acc = residue_max_acc[acc_array]
    # create DSSP dictionary
    file_type = file_type.upper()
    assert (file_type in ['PDB', 'DSSP'])
    # If the input file is a PDB file run DSSP and parse output:
    if file_type == 'PDB':
        # Newer versions of DSSP program call the binary 'mkdssp', so
        # calling 'dssp' will not work in some operating systems
        # (Debian distribution of DSSP includes a symlink for 'dssp' argument)
        try:
            dssp_dict, dssp_keys = dssp_dict_from_pdb_file(in_file, dssp)
        except OSError:  # TODO: Use FileNotFoundError once drop Python 2
            # Swap to the other known executable name; anything else is
            # a user-supplied path, so the error is propagated.
            if dssp == 'dssp':
                dssp = 'mkdssp'
            elif dssp == 'mkdssp':
                dssp = 'dssp'
            else:
                raise
        # NOTE(review): at this nesting the call is also repeated after a
        # successful first attempt - confirm that is intended.
        dssp_dict, dssp_keys = dssp_dict_from_pdb_file(in_file, dssp)
    # If the input file is a DSSP file just parse it directly:
    elif file_type == 'DSSP':
        dssp_dict, dssp_keys = make_dssp_dict(in_file)

    dssp_map = {}
    dssp_list = []

    def resid2code(res_id):
        """Serialize a residue's resseq and icode for easy comparison."""
        return '%s%s' % (res_id[1], res_id[2])

    # Now create a dictionary that maps Residue objects to
    # secondary structure and accessibility, and a list of
    # (residue, (secondary structure, accessibility)) tuples
    for key in dssp_keys:
        chain_id, res_id = key
        chain = model[chain_id]
        try:
            res = chain[res_id]
        except KeyError:
            # In DSSP, HET field is not considered in residue identifier.
            # Thus HETATM records may cause unnecessary exceptions.
            # (See 3jui chain A res 593.)
            # Try the lookup again with all HETATM other than water
            res_seq_icode = resid2code(res_id)
            for r in chain:
                if r.id[0] not in (' ', 'W'):
                    # Compare resseq + icode
                    if resid2code(r.id) == res_seq_icode:
                        # Found a matching residue
                        res = r
                        break
            else:
                # for/else: reached only when no HETATM residue matched.
                raise KeyError(res_id)

        # For disordered residues of point mutations, Biopython uses the
        # last one as default, But DSSP takes the first one (alternative
        # location is blank, A or 1). See 1h9h chain E resi 22.
        # Here we select the res in which all atoms have altloc blank, A or
        # 1. If no such residues are found, simply use the first one appears
        # (as DSSP does).
        if res.is_disordered() == 2:
            for rk in res.disordered_get_id_list():
                # All atoms in the disordered residue should have the same
                # altloc, so it suffices to check the altloc of the first
                # atom.
                altloc = res.child_dict[rk].get_list()[0].get_altloc()
                if altloc in tuple('A1 '):
                    res.disordered_select(rk)
                    break
            else:
                # Simply select the first one
                res.disordered_select(res.disordered_get_id_list()[0])

        # Sometimes point mutations are put into HETATM and ATOM with altloc
        # 'A' and 'B'.
        # See 3piu chain A residue 273:
        #   <Residue LLP het=H_LLP resseq=273 icode= >
        #   <Residue LYS het=  resseq=273 icode= >
        # DSSP uses the HETATM LLP as it has altloc 'A'
        # We check the altloc code here.
        elif res.is_disordered() == 1:
            # Check altloc of all atoms in the DisorderedResidue. If it
            # contains blank, A or 1, then use it.  Otherwise, look for HET
            # residues of the same seq+icode.  If not such HET residues are
            # found, just accept the current one.
            altlocs = set(a.get_altloc() for a in res.get_unpacked_list())
            if altlocs.isdisjoint('A1 '):
                # Try again with all HETATM other than water
                res_seq_icode = resid2code(res_id)
                for r in chain:
                    if r.id[0] not in (' ', 'W'):
                        if resid2code(r.id) == res_seq_icode and \
                           r.get_list()[0].get_altloc() in tuple('A1 '):
                            res = r
                            break

        # Unpack the 14 DSSP values stored for this residue key.
        (aa, ss, acc, phi, psi, dssp_index,
            NH_O_1_relidx, NH_O_1_energy,
            O_NH_1_relidx, O_NH_1_energy,
            NH_O_2_relidx, NH_O_2_energy,
            O_NH_2_relidx, O_NH_2_energy) = dssp_dict[key]

        # Attach the raw DSSP values to the residue itself.
        res.xtra["SS_DSSP"] = ss
        res.xtra["EXP_DSSP_ASA"] = acc
        res.xtra["PHI_DSSP"] = phi
        res.xtra["PSI_DSSP"] = psi
        res.xtra["DSSP_INDEX"] = dssp_index
        res.xtra["NH_O_1_RELIDX_DSSP"] = NH_O_1_relidx
        res.xtra["NH_O_1_ENERGY_DSSP"] = NH_O_1_energy
        res.xtra["O_NH_1_RELIDX_DSSP"] = O_NH_1_relidx
        res.xtra["O_NH_1_ENERGY_DSSP"] = O_NH_1_energy
        res.xtra["NH_O_2_RELIDX_DSSP"] = NH_O_2_relidx
        res.xtra["NH_O_2_ENERGY_DSSP"] = NH_O_2_energy
        res.xtra["O_NH_2_RELIDX_DSSP"] = O_NH_2_relidx
        res.xtra["O_NH_2_ENERGY_DSSP"] = O_NH_2_energy

        # Relative accessibility
        resname = res.get_resname()
        try:
            rel_acc = acc / self.residue_max_acc[resname]
        except KeyError:
            # Invalid value for resname
            rel_acc = 'NA'
        else:
            # Cap the ratio at 1.0.
            if rel_acc > 1.0:
                rel_acc = 1.0
        res.xtra["EXP_DSSP_RASA"] = rel_acc
        # Verify if AA in DSSP == AA in Structure
        # Something went wrong if this is not true!
        # NB: DSSP uses X often
        try:
            resname = three_to_one(resname)
        except KeyError:
            resname = 'X'
        if resname == "C":
            # DSSP renames C in C-bridges to a,b,c,d,...
            # - we rename it back to 'C'
            if _dssp_cys.match(aa):
                aa = 'C'
        # Take care of HETATM again
        # (a mismatch is tolerated only for a HETATM residue that DSSP
        # reports as 'X')
        if (resname != aa) and (res.id[0] == ' ' or aa != 'X'):
            raise PDBException("Structure/DSSP mismatch at %s" % res)

        # Same values as stored in res.xtra, except that the relative
        # accessibility replaces the absolute one.
        dssp_vals = (dssp_index, aa, ss, rel_acc, phi, psi,
                     NH_O_1_relidx, NH_O_1_energy,
                     O_NH_1_relidx, O_NH_1_energy,
                     NH_O_2_relidx, NH_O_2_energy,
                     O_NH_2_relidx, O_NH_2_energy)
        dssp_map[key] = dssp_vals
        dssp_list.append(dssp_vals)

    AbstractResiduePropertyMap.__init__(self, dssp_map, dssp_keys, dssp_list)
def get_one_letter(list_of_three):
    """Return a list with the ``three_to_one`` translation of every item.

    ``list_of_three`` is any iterable of three-letter residue codes; the
    result keeps the input order.
    """
    return [three_to_one(code) for code in list_of_three]
print("You should not be here...") rowNum += 1 print("Number of proteins: {}".format(len(threeline_data))) print(bridges.keys()) print("Number of total bridges: {}".format(len(bridges["all"]))) print("Number of membrane bridges: {}".format(len(bridges["mems"]))) print("Number of local bridges: {}".format(len(bridges["local"]))) r = 0 w = 0 index = 0 for key, bridge in bridges["local"].items(): if key in threeline_data: seq = threeline_data[key][0] for b in bridge: first_aa = three_to_one(b[1]) first_ix = b[0] second_aa = three_to_one(b[3]) second_ix = b[2] if first_ix > len(seq): index += 1 continue if first_aa == seq[first_ix - 1]: r += 1 else: print(key) print(seq) print(b) w += 1 # print(seq[first_ix-1], first_aa)
# parsing PDB files PDB_list = glob.glob("../../../../PDBMining/*/*.ent") p = PDBParser() Valid = [False for _ in proteins] PDBNames = [] for f in PDB_list: name = os.path.splitext(basename(f))[0] PDBNames.append(name) struct = p.get_structure(name,f) res_list = Selection.unfold_entities(struct, 'R') try: seq = [three_to_one(a.get_resname()).lower() for a in res_list] except (KeyError): seq = [] try: if seq == proteins[nameInd[name]]: Valid[nameInd[name]] = True except KeyError: pass PDBInd = dict((c, i) for i, c in enumerate(PDBNames)) occurences = [-1 for _ in proteins] ind = -1 for rec in record: ind +=1
def computeOneFileFromPDB(self, fileName, chainType):
    '''
    Gets the seq to struct mapping for a given pdb file.

    For every chain of the first model that is mostly standard amino acids
    and longer than SMALL_CHAINS_LIMIT, writes a fasta file and a
    ``.seqStruMap`` file and records the mapping in ``self.seqToStruct``,
    ``self.structToSeq``, ``self.seqToStructFnames`` and ``self.seqsDict``.

    @param fileName: str. fname to pdb file
    @param chainType: str. "l" for ligand and "r" for receptor
    '''
    self.seqsDict[chainType] = {}
    baseName = os.path.split(fileName)[-1]
    if fileName.endswith(("_r_u.pdb", "_l_u.pdb")):
        # The chain type is already part of the file name.
        prefixAndChainType = baseName.split("_u.pdb")[0]
    else:
        prefixAndChainType = baseName.split(".pdb")[0] + "_" + chainType

    struct = self.parser.get_structure(prefixAndChainType, fileName)
    for chain in struct[0]:
        chainId = chain.get_id()
        if chainId == " ":
            chainId = "*"
        nResStandard = sum(1 for res in chain if is_aa(res, standard=True))
        resList = [
            res
            for res in sorted(chain.child_list, key=lambda x: x.get_id()[1:])
            if is_aa(res, standard=False)
        ]  # New version feature
        nResAll = len(resList)
        if nResStandard < int(0.5 * nResAll):
            continue  # skip if most residues are not standard
        if nResAll <= SMALL_CHAINS_LIMIT:
            continue  # Too small chains will not be considered

        sequence = []
        resIds = []
        for i, res in enumerate(resList):
            try:
                letter = three_to_one(res.resname)
            except KeyError:  # New version feature
                print("Exception", res)
                letter = "X"
                if i == (nResAll - 1):
                    # This case is for TCGR....TLRX where X is GDP or
                    # other molecule
                    break
            resId = res.get_full_id()[3]
            sequence.append(letter)
            resIds.append("%d;%s;%s" % (i, letter, resId))
            self.seqToStruct[(chainType, chainId, i)] = resId
            self.structToSeq[(chainType, chainId, resId)] = i
        sequence = "".join(sequence)

        header = ">" + prefixAndChainType + "_" + chainId + "\n"
        outNameFasta = os.path.join(
            self.fastaOutDir,
            prefixAndChainType + "_" + chainId + "_u.fasta")
        # "with" closes the handle even if the write fails.
        with open(outNameFasta, "w") as f:
            f.write(header + sequence)

        outName = os.path.join(
            self.seqToStructDir,
            prefixAndChainType + "_" + chainId + "_u.seqStruMap")
        self.seqToStructFnames[(chainType, chainId)] = (outName,
                                                        prefixAndChainType)
        with open(outName, "w") as f:
            f.write(header + "\n".join(resIds))
        self.seqsDict[chainType][chainId] = (sequence, outNameFasta)
def _add_flanking_seq_fragments(ddg_data_dict: Dict, dataset: str, pdb_filename: str):
    """Fill flank and residue-type columns for the variants of one PDB file.

    The DataFrame ``ddg_data_dict[dataset]`` is modified in place. The
    columns ``left_flank``, ``wt_restype``, ``mt_restype`` and
    ``right_flank`` are created (as NaN) when missing. For every row whose
    ``pdbid`` equals the id taken from the first four characters of the
    file name, the wild-type letter of ``variant`` is compared with the
    residue found at that residue number in the row's chain; on a match the
    four columns are filled, otherwise the row is reported with "WRONG".

    Only residues named like one of the 20 ``index_to_three`` names are
    part of the chain sequences, and only the first model is read.

    Raises:
        KeyError: if a row's ``chainid`` is not a chain of the structure.
        ValueError: if the variant's residue number is not in that chain.
    """
    df = ddg_data_dict[dataset]
    for column in ("left_flank", "wt_restype", "mt_restype", "right_flank"):
        if column not in df.columns:
            df[column] = np.nan

    pdbid = pdb_filename.split(r"/")[-1][0:4].upper()

    from Bio.PDB.PDBParser import PDBParser
    p = PDBParser()
    model_first = p.get_structure(pdbid, pdb_filename)[0]

    # Built once; the previous code rebuilt this list for every residue.
    standard_names = {index_to_three(i) for i in range(20)}

    chain_id_to_pdb_seq = {}
    chain_id_to_pdb_residue_numbers = {}
    for chain in model_first:
        pdb_seq = []
        pdb_residue_numbers = []
        for residue in chain.get_residues():
            resname = residue.resname.strip()
            if resname in standard_names:
                pdb_residue_numbers.append(residue.id[1])
                pdb_seq.append(three_to_one(resname))
        chain_id_to_pdb_seq[chain.id] = "".join(pdb_seq)
        chain_id_to_pdb_residue_numbers[chain.id] = pdb_residue_numbers

    for idx, row in df.iterrows():
        if row["pdbid"] != pdbid:
            continue
        variant = row["variant"]
        # variant is <wt letter><residue number><mutant letter>
        residue_number = int(variant[1:-1])
        chain_id = row["chainid"]
        pdb_sequence = chain_id_to_pdb_seq[chain_id]
        resid = chain_id_to_pdb_residue_numbers[chain_id].index(
            residue_number)
        if variant[0] == pdb_sequence[resid]:
            df.loc[idx, "left_flank"] = _trim_left_flank(
                pdb_sequence[:resid])
            df.loc[idx, "right_flank"] = _trim_right_flank(
                pdb_sequence[resid + 1:])
            df.loc[idx, "wt_restype"] = variant[0]
            df.loc[idx, "mt_restype"] = variant[-1]
        else:
            print("WRONG", row[["pdbid", "variant"]])
from Bio.PDB import PDBParser, PDBIO
from Bio.PDB.Polypeptide import is_aa, three_to_one
import sys

# Usage: <script> <structure file>. For every chain of the structure one
# "<code>_<chain>.pdb" and one "<code>_<chain>.fasta" file is written, where
# <code> is the input path without its last four characters.
path = sys.argv[1]
code = path[:-4]
io = PDBIO()
pdb = PDBParser().get_structure(code, path)
for chain in pdb.get_chains():
    io.set_structure(chain)
    io.save(pdb.get_id() + "_" + chain.get_id() + ".pdb")

    # Standard amino acids become their one-letter code, the rest "X".
    seq = [
        three_to_one(residue.get_resname())
        if is_aa(residue.get_resname(), standard=True) else "X"
        for residue in chain
    ]

    # The file is opened only once the sequence exists and is closed even
    # if the write fails.
    with open(code + "_" + chain.get_id() + '.fasta', 'w') as out:
        ## This line is used to display the sequence from each chain
        print(">Chain_" + chain.get_id() + "\n" + str("".join(seq)), file=out)
# Consistency check between the sequence stored in the template file and the
# sequence read from the PDB file given as second command-line argument.

# Sixth entry of tplContent; the sequence is the text after the last '= '.
DSSPsequence = tplContent[5].split('= ')[-1]
# Target name = template file name up to the first '.'.
name = os.path.basename(tplfile).split('.')[0]
if len(name)!=5:
    print 'the protein target name is incorrect. It must be composed of PDB ID and chain letter'
    exit(-1)
pdbfile = sys.argv[2]
parser = PDBParser()
structure = parser.get_structure(name, pdbfile)
# The fifth character of the name selects the chain in the first model.
chain=name[4]
residues = structure[0][chain].get_residues()
residueList = [ r for r in residues if is_aa(r) ]
#numResidues = len(residueList)
pdbseq = ''.join( [ three_to_one(r.get_resname()) for r in residueList ] )
#print pdbseq
### check if DSSPsequence is equivalent to pdbseq
# '-' characters are dropped from the template sequence before comparing.
validDSSPseq = DSSPsequence.replace('-', '')
if validDSSPseq != pdbseq:
    print 'Inconsistency between DSSPsequence in ', tplfile, ' and pdbseq in ', pdbfile
    print 'pdbseq: ', pdbseq
    print 'DSPseq: ', validDSSPseq
    # Differing positions, compared only over the shorter of the two.
    diffs = [i for i in xrange(min(len(pdbseq), len(validDSSPseq) ) ) if pdbseq[i] != validDSSPseq[i] ]
    print 'inconsistent positions: ', diffs
    exit(-1)
pos = frag[9] score = "%.2f" % (math.exp((float(frag[12].rstrip())) / 1000)) if float(score) < 0.1 or math.isnan(float(score)): score = "1.0" try: pp = get_pp( pdb, chain, start, length, seq ) # Polypeptide pp now contains atomic information for fragment. except: print >> stderr, "E: failed to process", pdb, chain, start # Cannot open file. fails += 1 else: str1 = "" for res in pp[1:length + 1]: str1 += three_to_one(res.get_resname()) if str(seq[0:length]) == str(str1): # F 0 P 0 L 9 S 1.511 = 1fcd A 116 R 0 print "F ", s, " P ", pos, " L ", str( length), " S ", score, " = ", pdb, " ", chain, " ", str( start) # try: print_pp(pp, offset=1, lng=length) s += 1 if length > 6: print "F ", s, " P ", pos, " L ", str( 6 ), " S ", score, " = ", pdb, " ", chain, " ", str( start) print_pp(pp, offset=1, lng=6) s += 1
def convert3to1(s):
    """Translate a three-letter residue code with ``three_to_one``.

    Returns "X" when ``three_to_one`` raises ``KeyError`` for ``s``.
    """
    try:
        one_letter = three_to_one(s)
    except KeyError:
        one_letter = "X"
    return one_letter
def getSequenceFromChain(self, modelID, chainID):
    """Return the sequence of one chain of one model as a ``Seq``.

    The kind of chain is guessed from the length of the first residue's
    name: 1 character is read as RNA, 2 as DNA and 3 as protein. Residues
    outside the standard alphabet of that kind become "X"; leading and
    trailing "X" are trimmed. A 3-character chain without any standard
    amino acid is reported as HETAM and its residue names are appended.

    Returns ``None`` when no chain matches ``modelID``/``chainID``.
    """
    self.checkRead()
    seq = list()
    for model in self.structure:
        if model.id != modelID:
            continue
        for chain in model:
            if str(chain.id) != chainID:
                continue
            nameLength = len(chain.get_unpacked_list()[0].resname)
            if nameLength == 1:
                print("Your sequence is a nucleotide sequence ("
                      "RNA)\n")
                for residue in chain:
                    # Only the standard RNA bases are kept as such.
                    if residue.get_resname() in ['A', 'C', 'G', 'U']:
                        seq.append(residue.get_resname())
                    else:
                        seq.append("X")
            elif nameLength == 2:
                print("Your sequence is a nucleotide sequence ("
                      "DNA)\n")
                for residue in chain:
                    # Only the standard DNA bases are kept as such; the
                    # base is the second character of the name.
                    if residue.get_resname()[1] in ['A', 'C', 'G', 'T']:
                        seq.append(residue.get_resname()[1])
                    else:
                        seq.append("X")
            elif nameLength == 3:
                counter = 0
                for residue in chain:
                    # Some proteins have "UNK" or "XXX", or other symbols
                    # for missing or unknown residues.
                    if is_aa(residue.get_resname(), standard=True):
                        seq.append(three_to_one(residue.get_resname()))
                        counter += 1
                    else:
                        seq.append("X")
                if counter != 0:  # aminoacids
                    print("Your sequence is an aminoacid sequence")
                else:  # HETAM
                    print("Your sequence is a HETAM sequence")
                    for residue in chain:
                        seq.append(residue.get_resname())
            # "seq and" guards against IndexError when every item is "X"
            # (or nothing was collected at all).
            while seq and seq[-1] == "X":
                del seq[-1]
            while seq and seq[0] == "X":
                del seq[0]
            return Seq(str(''.join(seq)))
def getModelsChains(self):
    """
    given an atomic structure returns two dictionaries:
        (1) for all models and respective chains (chainID and length
            of residues)
        (2) for each chain list of residues

    Both are OrderedDicts keyed by model id, each holding an OrderedDict
    keyed by chain id. (1) stores the length of the chain sequence after
    leading/trailing "X" have been trimmed; (2) stores the untrimmed list
    of (residue number, residue name) tuples.

    The kind of chain is guessed from the length of the first residue's
    stripped name (1: RNA, 2: DNA, 3: protein). A chain with any other name
    length is recorded with length 0 and an empty residue list.
    """
    self.checkRead()
    listOfChains = OrderedDict()
    listOfResidues = OrderedDict()
    for model in self.structure:
        chainDicLength = OrderedDict()
        chainDicFirstResidue = OrderedDict()
        for chain in model:
            # Fresh per chain, so an unrecognised chain can neither hit an
            # unbound name nor reuse the previous chain's data.
            seq = list()
            seq_number = list()
            nameLength = len(chain.get_unpacked_list()[0].resname.strip())
            if nameLength == 1:  # RNA
                for residue in chain:
                    if residue.get_resname() in ['A', 'C', 'G', 'U']:
                        seq.append(residue.get_resname())
                    else:
                        seq.append("X")
                    seq_number.append(
                        (residue.get_id()[1], residue.get_resname()))
            elif nameLength == 2:  # DNA
                for residue in chain:
                    if residue.get_resname()[1] in ['A', 'C', 'G', 'T']:
                        seq.append(residue.get_resname()[1])
                    else:
                        seq.append("X")
                    seq_number.append(
                        (residue.get_id()[1], residue.get_resname()))
            elif nameLength == 3:  # Protein
                counter = 0
                for residue in chain:
                    if is_aa(residue.get_resname(), standard=True):
                        # aminoacids
                        seq.append(three_to_one(residue.get_resname()))
                        counter += 1
                    else:
                        seq.append("X")
                    seq_number.append(
                        (residue.get_id()[1], residue.get_resname()))
                if counter == 0:  # HETAM
                    for residue in chain:
                        seq.append(residue.get_resname())
            # "seq and" guards against IndexError when every item is "X".
            while seq and seq[-1] == "X":
                del seq[-1]
            while seq and seq[0] == "X":
                del seq[0]
            chainDicLength[chain.id] = len(seq)
            chainDicFirstResidue[chain.id] = seq_number
        listOfChains[model.id] = chainDicLength
        listOfResidues[model.id] = chainDicFirstResidue
    return listOfChains, listOfResidues
def get_res_type(res_list, residue):
    """Look up the residue's entry in ``res_type_map``.

    The key is the one-letter code of ``residue.get_resname()``.
    ``res_list`` is accepted but not used.
    """
    one_letter = three_to_one(residue.get_resname())
    return res_type_map[one_letter]
def to_label(a):
    """Return the one-letter label for the residue name ``a``.

    The names HID, HIP and HIE are treated as HIS before translation.
    """
    from Bio.PDB.Polypeptide import three_to_one
    if a in ('HID', 'HIP', 'HIE'):
        a = 'HIS'
    one_letter = three_to_one(a)
    return "%s" % (one_letter)
def resn_one(self):
    """Return ``self.resn`` translated by ``three_to_one``."""
    three_letter = self.resn
    return three_to_one(three_letter)
def pdb_to_casp(rr_name, pdb_file=False, mmCIF_file=False, cutoff=16, confidence=0.99, std=1, method="From_structure"):
    """ Convert a pdb/mcif to CASP rr-format

    Reads the first model of the structure, keeps the residues accepted by
    ``is_aa`` (numbered from 1 across all chains) and writes to ``rr_name``
    a header followed by one line per residue pair ``i < j`` whose CB-CB
    distance is below ``cutoff``. Residues without a CB atom use the vector
    returned by ``_virtual_cb_vector``.

    rr_name: path of the output file.
    pdb_file / mmCIF_file: input structure; ``pdb_file`` wins if both are
        given. The structure id is the file name minus its last 4 chars.
    cutoff: distance threshold for reporting a pair.
    confidence, std: constant values written on every contact line.
    method: text of the METHOD header line.

    Exits through ``sys.exit()`` when neither input file is given.
    """
    if pdb_file:
        from Bio.PDB.PDBParser import PDBParser
        bio_parser = PDBParser(PERMISSIVE=1)
        structure_file = pdb_file
        structure_id = pdb_file[:-4]
    elif mmCIF_file:
        from Bio.PDB.MMCIFParser import MMCIFParser
        bio_parser = MMCIFParser()
        structure_file = mmCIF_file
        structure_id = mmCIF_file[:-4]
    else:
        print("No file given: one pdb or one mmCIF file has to be definied")
        sys.exit()

    line = "{i} {j} 0 {m:.5f} {c:.2f} {sd:.4f}\n"

    # Load structure
    structure = bio_parser.get_structure(structure_id, structure_file)

    # Only use real residues, not HET or water, in chain order.
    aa_residues = [
        residue
        for chain in structure[0]
        for residue in chain
        if is_aa(residue)
    ]
    residues = "".join(three_to_one(r.get_resname()) for r in aa_residues)

    header = '\n'.join(
        ("PFRMAT RR", "TARGET {}".format(structure_id),
         "AUTHOR pyconsFold", "METHOD {}".format(method), "MODEL 1",
         residues + '\n'))

    # One CB vector per residue, computed once instead of once per pair.
    # If the residue lacks CB (Glycine etc), create a virtual one.
    cb_vectors = [
        r['CB'].get_vector() if r.has_id('CB') else _virtual_cb_vector(r)
        for r in aa_residues
    ]

    content = [header]
    n_res = len(cb_vectors)
    for i, c1B in enumerate(cb_vectors, start=1):
        # Only pairs with j > i are reported.
        for j in range(i + 1, n_res + 1):
            dist = (cb_vectors[j - 1] - c1B).norm()
            if dist < cutoff:
                content.append(
                    line.format(i=i, j=j, m=dist, c=confidence, sd=std))
    content.append("END\n")

    with open(rr_name, 'w') as contacts_handle:
        contacts_handle.write(''.join(content))
def to_label(a):
    """Return the one-letter label of ``a.rName.str``.

    When ``a.rId.serial`` is a multiple of 5 the serial is appended on a
    second line.
    """
    from Bio.PDB.Polypeptide import three_to_one
    if a.rId.serial % 5 != 0:
        return "%s" % (three_to_one(a.rName.str))
    return "%s\n%d" % (three_to_one(a.rName.str), a.rId.serial)
def launch(self):
    """Launches the pipeline to build a box around a selection of residues

    Reads ``self.input_pdb_path`` and ``self.resid_pdb_path``, computes the
    centre and size of the box around the selected residues (enlarged by
    ``self.offset`` when set) and writes a copy of the input PDB to
    ``self.output_pdb_path`` with a ``REMARK BOX`` line prepended.

    NOTE(review): ``clusterPDB``, ``clusterPDB_path``, ``clusterPDB_ligand``,
    ``clusterPDB_ligands_aligned``, ``selection_res_list``,
    ``residPDB_res_list`` and ``selection_atoms_num`` are used below but
    are not assigned anywhere in this method as visible here (some only
    appear in the commented-out code) - confirm where they come from.
    """
    #out_log, err_log = fu.get_logs(path=self.path, mutation=self.mutation, step=self.step)

    ##
    ## Loading and parsing reference PDB structure
    parser = Bio.PDB.PDBParser()

    # Parse input structure
    print("Loading input PDB structure %s..." % self.input_pdb_path)
    structure_name = os.path.basename(self.input_pdb_path.split('.')[0])
    structPDB = parser.get_structure(structure_name,self.input_pdb_path)[0]
    structPDB_seq = [three_to_one(res.get_resname()) for res in structPDB.get_residues() if is_aa(res.get_resname(), standard=True)]
    print(structPDB_seq)

    # Parse residue structure
    print("Loading residue PDB selection %s..." % self.resid_pdb_path)
    resid_name = os.path.basename(self.resid_pdb_path.split('.')[0])
    residPDB = parser.get_structure(resid_name,self.resid_pdb_path)[0]
    residPDB_seq = [three_to_one(res.get_resname()) for res in residPDB.get_residues() if is_aa(res.get_resname(), standard=True)]
    print(residPDB_seq)

    ##
    ## Mapping residue structure into input structure

    # Listing residues to be selected from the residue structure
    # residPDB_res_list = []
    # p = re.compile('H_|W_')
    # for residPDB_res in residPDB.get_residues():
    #     m_het = p.match(residPDB_res.get_id()[0])
    #     if not m_het:
    #         residPDB_res_list.append(residPDB_res.get_id())
    #         binding_site_CA_list.append(residPDB_res['CA'])

    # Aligning
    # alignments = pairwise2.align.localxx("".join(structPDB_seq), "".join(residPDB_seq))
    # print alignments[0]
    # mappings = Bio.PDB.StructureAlignment(alignments[0], structPDB, residPDB).get_maps()
    # print mappings

    # # Mapping selected residues to input structure
    # selection_res_list = []
    # selection_atoms_num = 0
    # for struct_chain in structPDB:
    #     for struct_res in struct_chain:
    #         if struct_res.get_id() in residPDB_res_list:
    #             selection_res_list.append(struct_res)
    #             selection_atoms_num += len(struct_res.get_list())

    # Get AA sequence
    clusterPDB_seq = self.__get_pdb_sequence(clusterPDB)

    # Pairwise align
    aln, residue_map = self.__align_sequences(structPDB_seq,clusterPDB_seq)
    print(residue_map)
    print(" Matching residues to input PDB structure. Alignment is:\n %s" % aln[1])

    # Calculate (gapless) sequence identity
    seq_identity, gap_seq_identity = self.__calculate_alignment_identity(aln[0], aln[1])
    print(" Sequence identity (%%): %s" % seq_identity)
    print(" Gap less identity (%%): %s" % gap_seq_identity)

    ##
    ## Selecting aligned CA atoms from first model, first chain
    struct_atoms = []
    cluster_atoms = []
    for struct_res in residue_map:
        try:
            cluster_atoms.append(clusterPDB[residue_map[struct_res]]['CA'])
            struct_atoms.append(structPDB[struct_res]['CA'])
        except KeyError:
            print("Cannot find CA atom for residue %s (input PDB %s)" % (structPDB[struct_res],struct_res))

    if len(cluster_atoms)==0:
        # Indices are {0}/{1}: the former {1}/{2} with two arguments raised
        # IndexError instead of this exception.
        raise Exception('Cannot find CA atoms (1st model, 1st chain) in cluster member {0} when aligning against {1}. Ignoring this member.'.format(clusterPDB_path,structure_name))
    else:
        print(" Superimposing %s aligned protein residues" % len(cluster_atoms))

    # Align against input structure
    si = Bio.PDB.Superimposer()
    si.set_atoms(struct_atoms, cluster_atoms)
    si.apply(clusterPDB.get_atoms())
    print(" RMSD: %s" %si.rms)

    # Save transformed structure (and ligand)
    clusterPDB_ligand_aligned = clusterPDB[clusterPDB_ligand.get_id()]
    print(" Saving transformed ligand coordinates")
    clusterPDB_ligands_aligned.append(clusterPDB_ligand_aligned)

    if len(selection_res_list) == 0:
        raise Exception('Cannot match any of the residues listed in %s into %s' % (self.resid_pdb_path,self.input_pdb_path) )
    elif len(selection_res_list) != len(residPDB_res_list):
        warnings.warn('Cannot match all the residues listed in %s into %s. Found %s out of %s' % (self.resid_pdb_path,self.input_pdb_path,len(selection_res_list),len(residPDB_res_list)))
    else:
        print("Selection residues successfully matched")

    ##
    ## Compute binding site box size

    # Coordinates of every atom of the selection, collected once.
    selection_coords = [atom.coord for res in selection_res_list for atom in res.get_atoms()]

    # compute box center (numpy.sum over a list with axis=0; calling it on
    # a generator is deprecated)
    selection_box_center = numpy.sum(selection_coords, axis=0) / selection_atoms_num
    # x, y, z are indices 0, 1, 2 (index 1 used to be reported twice).
    print("Binding site center (Amstrongs): %8.3f%8.3f%8.3f" % (selection_box_center[0],selection_box_center[1],selection_box_center[2]))

    # compute box size
    selection_coords_max = numpy.amax(selection_coords,axis=0)
    selection_box_size = selection_coords_max - selection_box_center
    if self.offset:
        selection_box_size = [c + self.offset for c in selection_box_size]
    print("Binding site size (Amstrongs): %8.3f%8.3f%8.3f" % (selection_box_size[0],selection_box_size[1],selection_box_size[2]))

    # The size is a half-extent per axis, hence the factor 2**3.
    vol = numpy.prod(selection_box_size) * 2**3
    print("Volume (cubic Amstrongs): %.0f" % vol)

    # add box details as PDB remarks
    #remarks = "REMARK 900\nREMARK 900 RELATED ENTRIES\nREMARK 900 RELATED ID:%s CHAIN:%s\n" % (self.pdb_code,self.pdb_chain)
    remarks = "REMARK BOX CENTER:%8.3f%8.3f%8.3f" % (selection_box_center[0],selection_box_center[1],selection_box_center[2])
    remarks += " SIZE:%8.3f%8.3f%8.3f" % (selection_box_size[0],selection_box_size[1],selection_box_size[2])

    # add (optional) box coordinates as 8 ATOM records
    #selection_box_coords_txt = self.get_box_coordinates(selection_box_center,selection_box_size)
    selection_box_coords_txt = ""

    # write output pdb
    shutil.copy2(self.input_pdb_path, self.output_pdb_path)
    with open(self.output_pdb_path, 'r+') as f:
        content = f.read()
        if "END" in content:
            content = content.replace("END", selection_box_coords_txt + "END")
        else:
            content += selection_box_coords_txt
        # Rewrite from the start so the remark line precedes the content.
        f.seek(0, 0)
        f.write(remarks.rstrip('\r\n') + '\n' + content)

    print("Output PDB file (with box setting annotations): %s" % self.output_pdb_path)
if mem_len < min_mem_len: # Only use membranes longer than 17 continue global_place = mem_data[0] + pdb_seq_offset[full_chain_id] mem_start = global_place + 1 mem_end = mem_start + mem_len ### bridge = [resi1, res1, resi2, res2, chain, dist] # print(mem) # print(mem_start, mem_end) for bridge in bridges: save_bridge = [bridge[0], bridge[1], bridge[2], bridge[3], bridge[4], bridge[5]] # print(mem[bridge[0]-mem_start], mem[bridge[2]-mem_start]) # print(save_bridge) if (bridge[0] >= mem_start and bridge[0] <= mem_end) or (bridge[2] >= mem_start and bridge[2] <= mem_end): # print("Mem bridge") s = save_bridge[0] first_aa = three_to_one(save_bridge[1]) e = save_bridge[2] second_aa = three_to_one(save_bridge[3]) seq_first = int(save_bridge[0])-mem_start seq_second = int(save_bridge[2])-mem_start # print("**********************") # print(bridge) # print(mem_start, mem_end) if seq_first > -1 and seq_first < mem_len: if mem[seq_first] != first_aa: print("Membrane bridge out of sync {}".format(full_chain_id)) keep_running = False break # print("ERROR! Membrane first") # print(mem[seq_first], first_aa) # print(full_chain_id, bridge, mem, mem_start)
'Gly': 3.400, 'His': 13.690, 'Ile': 21.400, 'Leu': 21.400, 'Lys': 15.710, 'Met': 16.250, 'Phe': 19.800, 'Pro': 17.430, 'Ser': 9.470, 'Thr': 15.770, 'Trp': 21.670, 'Tyr': 18.030, 'Val': 21.570 } bulkiness_one = { three_to_one(k.upper()): v for k, v in bulkiness_three.items() } _human_readable_pepstats = { 'A_percent-biop': '% Ala', 'C_percent-biop': '% Cys', 'D_percent-biop': '% Asp', 'E_percent-biop': '% Glu', 'F_percent-biop': '% Phe', 'G_percent-biop': '% Gly', 'H_percent-biop': '% His', 'I_percent-biop': '% Ile', 'K_percent-biop': '% Lys', 'L_percent-biop': '% Leu', 'M_percent-biop': '% Met',
return shannon_entropy_list #START ACTUAL PROGRAM ############################################ #find match ############################################ #read in .pdb sequence parser = PDBParser() structure = parser.get_structure('', '1OTH.pdb') header = parser.get_header() trailer = parser.get_trailer() pdbSequence = '' for residue in structure[0]['A'].get_residues(): if (residue.get_id()[0]==' '): residueName = three_to_one(residue.get_resname()) pdbSequence += residueName #check each fasta sequence for match to pdb sequence bestMatch = '' bestScore = 0 handle = open("uniprot-ornithine+transcarbamylase-2.fasta", "rU") for record in SeqIO.parse(handle, "fasta") : foundMatch = False if foundMatch == False: tempFile = open("temp.fasta", "w") deleteContent(tempFile) tempFile.write(">sp|000000|FAKE HEADER OS=Fakus Faky GN=FAK PE=0 SV=0\n") tempFile.write(pdbSequence + "\n") tempFile.write(">sp|%s|FAKE HEADER OS=Fakus Faky GN=FAK PE=0 SV=0\n"%(record.name)) for x in record.seq: tempFile.write(x)
def main():
    """Clean every PDB file matched by the command-line argument.

    For each ``*.pdb`` file: run ``cleanATOM``, renumber the residues of
    every chain consecutively from 1, keep only the chain with the most
    residues, overwrite the ``.clean.pdb`` file with the result and write
    the kept chain's one-letter sequence (non-standard residues as "X") to
    a fasta file whose path is the PDB path with ``/PDBs/`` replaced by
    ``/wt_fastas/``.

    Files whose path contains "clean" and structures that do not hold
    exactly one model are skipped.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('pdb_directory', action="store", type=str)
    inputs = parser.parse_args()

    # The glob pattern is a plain concatenation, so pdb_directory is used
    # as a path prefix.
    for pdb_file in glob.glob(inputs.pdb_directory + '*.pdb'):
        clean_pdb_file = pdb_file.replace('.pdb', '.clean.pdb')
        print('#######################')
        print('#######################{}'.format(pdb_file))
        if 'clean' in pdb_file:
            print('Will overwrite an existing clean pdb so am skipping')
            continue
        fasta_outfile_loc = pdb_file.replace('/PDBs/', '/wt_fastas/').replace(
            '.pdb', '.fasta')

        # Load and clean up pdb file
        cleanATOM(pdb_file)
        pdb_io = PDB.PDBIO()
        pdb_parser = PDB.PDBParser()
        structure = pdb_parser.get_structure(" ", clean_pdb_file)
        if len(structure) != 1:
            print(
                'THERE APPEARS TO BE MORE THAN ONE MODEL IN THIS STRUCTURE BEHAVIOR OF PRORAM IS UNKNOWN ({}). EXITING'
                .format(clean_pdb_file))
            continue

        # Renumber residues from 1 within each chain; the stored value is
        # the residue count plus one and is only used for ranking.
        chain_counts = {}
        for model in structure:
            for chain in model:
                new_number = 1
                for residue in chain.get_residues():
                    res_id = list(residue.id)
                    if res_id[1] != new_number:
                        res_id[1] = new_number
                        residue.id = tuple(res_id)
                    new_number += 1
                chain_counts[chain.id] = new_number

        # Everything but the largest chain is removed from the model.
        chains = sorted(chain_counts.items(), key=lambda x: x[1])
        chains_to_delete = [chain_id for chain_id, _ in chains[:-1]]
        for chain_id in chains_to_delete:
            structure[0].detach_child(chain_id)
        pdb_io.set_structure(structure)
        pdb_io.save(clean_pdb_file)

        for model in structure:
            for chain in model:
                print('kept ID {} and deleted {}'.format(
                    chain.id, chains_to_delete))
                seq_list = [
                    three_to_one(residue.get_resname())
                    if is_aa(residue.get_resname(), standard=True) else 'X'
                    for residue in chain
                ]
                wt_seq = ''.join(seq_list)
                with open(fasta_outfile_loc, 'w') as outfile:
                    outfile.write('>{}\n{}\n'.format('WT', wt_seq))
def __init__(self, model, in_file, dssp="dssp", acc_array="Sander",
             file_type='PDB'):
    """Create a DSSP object.

    Parameters
    ----------
    model : Model
        The first model of the structure
    in_file : string
        Either a PDB file or a DSSP file.
    dssp : string
        The dssp executable (ie. the argument to os.system)
    acc_array : string
        Accessible surface area (ASA) from either Miller et al. (1987),
        Sander & Rost (1994), or Wilke: Tien et al. 2013, as string
        Sander/Wilke/Miller. Defaults to Sander.
    file_type: string
        File type switch, either PDB or DSSP with PDB as default.
        Compared case-insensitively.

    Raises
    ------
    AssertionError
        If file_type is neither PDB nor DSSP.
    KeyError
        If a residue listed in the DSSP output cannot be found in the
        corresponding chain of ``model``, even among its HETATM residues.
    PDBException
        If the amino acid reported by DSSP does not match the residue
        found in the structure.

    """
    self.residue_max_acc = residue_max_acc[acc_array]

    # create DSSP dictionary
    file_type = file_type.upper()
    assert(file_type in ['PDB', 'DSSP'])
    # If the input file is a PDB file run DSSP and parse output:
    if file_type == 'PDB':
        # Newer versions of DSSP program call the binary 'mkdssp', so
        # calling 'dssp' will not work in some operating systems
        # (Debian distribution of DSSP includes a symlink for 'dssp' argument)
        try:
            dssp_dict, dssp_keys = dssp_dict_from_pdb_file(in_file, dssp)
        except OSError:  # TODO: Use FileNotFoundError once drop Python 2
            if dssp == 'dssp':
                dssp = 'mkdssp'
            elif dssp == 'mkdssp':
                dssp = 'dssp'
            else:
                raise
            # Retry with the alternative executable name.  The retry lives
            # inside the handler so that DSSP is executed exactly once when
            # the first attempt succeeds.
            dssp_dict, dssp_keys = dssp_dict_from_pdb_file(in_file, dssp)
    # If the input file is a DSSP file just parse it directly:
    elif file_type == 'DSSP':
        dssp_dict, dssp_keys = make_dssp_dict(in_file)

    dssp_map = {}
    dssp_list = []

    # Alternative location codes of the conformer DSSP works with:
    # 'A', '1' or blank.
    first_altlocs = ('A', '1', ' ')

    def resid2code(res_id):
        """Serialize a residue's resseq and icode for easy comparison."""
        return '%s%s' % (res_id[1], res_id[2])

    # Now create a dictionary that maps Residue objects to
    # secondary structure and accessibility, and a list of
    # (residue, (secondary structure, accessibility)) tuples
    for key in dssp_keys:
        chain_id, res_id = key
        chain = model[chain_id]
        try:
            res = chain[res_id]
        except KeyError:
            # In DSSP, HET field is not considered in residue identifier.
            # Thus HETATM records may cause unnecessary exceptions.
            # (See 3jui chain A res 593.)
            # Try the lookup again with all HETATM other than water
            res_seq_icode = resid2code(res_id)
            for r in chain:
                if r.id[0] not in (' ', 'W'):
                    # Compare resseq + icode
                    if resid2code(r.id) == res_seq_icode:
                        # Found a matching residue
                        res = r
                        break
            else:
                # No HETATM residue matched either.
                raise KeyError(res_id)

        # For disordered residues of point mutations, Biopython uses the
        # last one as default, but DSSP takes the first one (alternative
        # location is blank, A or 1). See 1h9h chain E resi 22.
        # Here we select the res in which all atoms have altloc blank, A or
        # 1. If no such residue is found, simply use the first one that
        # appears (as DSSP does).
        if res.is_disordered() == 2:
            for rk in res.disordered_get_id_list():
                # All atoms in the disordered residue should have the same
                # altloc, so it suffices to check the altloc of the first
                # atom.
                altloc = res.child_dict[rk].get_list()[0].get_altloc()
                if altloc in first_altlocs:
                    res.disordered_select(rk)
                    break
            else:
                # Simply select the first one
                res.disordered_select(res.disordered_get_id_list()[0])

        # Sometimes point mutations are put into HETATM and ATOM with altloc
        # 'A' and 'B'.
        # See 3piu chain A residue 273:
        #   <Residue LLP het=H_LLP resseq=273 icode= >
        #   <Residue LYS het=  resseq=273 icode= >
        # DSSP uses the HETATM LLP as it has altloc 'A'
        # We check the altloc code here.
        elif res.is_disordered() == 1:
            # Check altloc of all atoms in the DisorderedResidue. If it
            # contains blank, A or 1, then use it.  Otherwise, look for HET
            # residues of the same seq+icode.  If no such HET residues are
            # found, just accept the current one.
            altlocs = {a.get_altloc() for a in res.get_unpacked_list()}
            if altlocs.isdisjoint(first_altlocs):
                # Try again with all HETATM other than water
                res_seq_icode = resid2code(res_id)
                for r in chain:
                    if r.id[0] not in (' ', 'W'):
                        if resid2code(r.id) == res_seq_icode and \
                                r.get_list()[0].get_altloc() in first_altlocs:
                            res = r
                            break

        (aa, ss, acc, phi, psi, dssp_index,
            NH_O_1_relidx, NH_O_1_energy,
            O_NH_1_relidx, O_NH_1_energy,
            NH_O_2_relidx, NH_O_2_energy,
            O_NH_2_relidx, O_NH_2_energy) = dssp_dict[key]

        # Attach the DSSP values to the residue itself.
        res.xtra["SS_DSSP"] = ss
        res.xtra["EXP_DSSP_ASA"] = acc
        res.xtra["PHI_DSSP"] = phi
        res.xtra["PSI_DSSP"] = psi
        res.xtra["DSSP_INDEX"] = dssp_index
        res.xtra["NH_O_1_RELIDX_DSSP"] = NH_O_1_relidx
        res.xtra["NH_O_1_ENERGY_DSSP"] = NH_O_1_energy
        res.xtra["O_NH_1_RELIDX_DSSP"] = O_NH_1_relidx
        res.xtra["O_NH_1_ENERGY_DSSP"] = O_NH_1_energy
        res.xtra["NH_O_2_RELIDX_DSSP"] = NH_O_2_relidx
        res.xtra["NH_O_2_ENERGY_DSSP"] = NH_O_2_energy
        res.xtra["O_NH_2_RELIDX_DSSP"] = O_NH_2_relidx
        res.xtra["O_NH_2_ENERGY_DSSP"] = O_NH_2_energy

        # Relative accessibility
        resname = res.get_resname()
        try:
            rel_acc = acc / self.residue_max_acc[resname]
        except KeyError:
            # Invalid value for resname
            rel_acc = 'NA'
        else:
            # Cap at 1.0: the measured value may exceed the tabulated
            # maximum for the residue type.
            if rel_acc > 1.0:
                rel_acc = 1.0
        res.xtra["EXP_DSSP_RASA"] = rel_acc

        # Verify if AA in DSSP == AA in Structure
        # Something went wrong if this is not true!
        # NB: DSSP uses X often
        try:
            resname = three_to_one(resname)
        except KeyError:
            resname = 'X'
        if resname == "C":
            # DSSP renames C in C-bridges to a,b,c,d,...
            # - we rename it back to 'C'
            if _dssp_cys.match(aa):
                aa = 'C'
        # Take care of HETATM again
        if (resname != aa) and (res.id[0] == ' ' or aa != 'X'):
            raise PDBException("Structure/DSSP mismatch at %s" % res)

        dssp_vals = (dssp_index, aa, ss, rel_acc, phi, psi,
                     NH_O_1_relidx, NH_O_1_energy,
                     O_NH_1_relidx, O_NH_1_energy,
                     NH_O_2_relidx, NH_O_2_energy,
                     O_NH_2_relidx, O_NH_2_energy)
        dssp_map[key] = dssp_vals
        dssp_list.append(dssp_vals)

    AbstractResiduePropertyMap.__init__(self, dssp_map, dssp_keys, dssp_list)
for chain in model: for residue in chain: try: resName = residue.get_resname() """ n = residue['N'] o = residue['O'] NOdist = n-o if not NOdists.has_key(resName): NOdists[resName] = [NOdist ] else: NOdists[resName].append(NOdist) """ ca = residue['CA'] AA = three_to_one(resName) cg = residue[SelectCG(AA)] AGdist = ca - cg if not CaCgdists.has_key(resName): CaCgdists[resName] = [AGdist] else: CaCgdists[resName].append(AGdist) except: print 'WARNING: missing CA or CG atoms ' """ finalNOdists = dict() for res, dists in NOdists.iteritems(): finalNOdists[res] = np.mean(dists) AA = three_to_one(res) finalNOdists[AA] = finalNOdists[res]