def get_contact_map(self, chain_id):
    '''
    Build the C-alpha contact map of one chain of the stored structure.

    Input:
        self: Use Biopython.PDB structure which has been stored in an
            object variable
        chain_id : String (usually in ['A','B', 'C' ...]. The number of
            chains depends on the specific protein and the resulting
            structure)
    Return:
        Return a complete contact map (see description in exercise sheet)
        for a given chain in a Biopython.PDB structure as numpy array.
        Entry [i, j] is the value of ``residue_i['CA'] - residue_j['CA']``
        (the c-alpha distance) for the residues of the first model that
        pass ``is_aa(..., standard=True)``. Only integer values of the
        distance are stored.
    '''
    length = len(self.get_sequence(chain_id))
    contact_map = np.zeros((length, length), dtype=np.int64)

    # Filter the chain once instead of re-testing every residue inside the
    # inner loop; enumerate then supplies the matrix indices directly.
    residues = [
        residue for residue in self.structure[0][chain_id]
        if is_aa(residue, standard=True)
    ]
    for row, residue_row in enumerate(residues):
        for col, residue_col in enumerate(residues):
            # Storing into the int64 array converts the value to an integer.
            contact_map[row, col] = residue_row['CA'] - residue_col['CA']

    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from NumPy; the builtin gives the same dtype on every NumPy version.
    return contact_map.astype(int)
def get_side_chain_vector(residue):
    """
    Compute the mean unit vector pointing from the c-alpha atom to the
    residue's side-chain atoms.

    The side chain is taken to be every atom after the first four entries of
    ``residue.get_unpacked_list()``. When there are none (e.g. glycine), the
    C and N atoms are used instead and the resulting vectors are negated.

    Returns (C-alpha coordinate vector, side chain unit vector) for the
    residue, or None when the residue is not an amino acid, has no CA atom,
    or has neither side-chain atoms nor both N and C.
    """
    if not (is_aa(residue) and residue.has_id('CA')):
        return None

    ca_coord = residue['CA'].get_coord()
    offsets = np.array(
        [atom.get_coord() for atom in residue.get_unpacked_list()[4:]])

    use_backbone = len(offsets) < 1
    if use_backbone:
        if not (residue.has_id('N') and residue.has_id('C')):
            return None
        offsets = np.array(
            [residue['C'].get_coord(), residue['N'].get_coord()])

    offsets = offsets - ca_coord
    if use_backbone:
        # Point away from the backbone atoms rather than towards them.
        offsets = -offsets

    # Row-wise Euclidean length, used to scale every offset to unit length.
    lengths = np.sum(abs(offsets)**2, axis=-1)**(1. / 2)
    unit_vectors = offsets / lengths[:, np.newaxis]
    return (Vector(ca_coord), Vector(unit_vectors.mean(axis=0)))
def modeller_get_chain_seqs(target_protein, target_chain, version):
    """
    Read one chain from a versioned PDB file in the modeller directory and
    return its one-letter residue list together with the chain object.

    The file is looked up at
    ``<PATHS.modeller>/<target_protein><target_chain>/v<version>_pdb<target_protein>.ent``.

    Returns:
        ``(chain_lst, chain)`` on success, where ``chain_lst`` holds one
        character per residue that passes ``is_aa`` and has a blank hetero
        flag: '-' for UNK/ASX, 'U' for SEC, otherwise the result of
        ``Polypeptide.three_to_one``.
        ``(None, None)`` when the file is missing, cannot be parsed, or does
        not contain ``target_chain``.
    """
    target_path = path.join(PATHS.modeller, target_protein + target_chain)
    target_pdb_fname = 'v%s_pdb' % version + target_protein + '.ent'
    pdb_file_path = path.join(target_path, target_pdb_fname)
    if not path.isfile(pdb_file_path):
        LOGGER.warning('File %s not found' % pdb_file_path)
        return None, None

    parser = PDBParser(PERMISSIVE=1, QUIET=True)
    structure_id = path.basename(target_pdb_fname).split('.')[0]
    try:
        structure = parser.get_structure(structure_id, pdb_file_path)
    except Exception:
        # Best-effort parse: report and signal failure to the caller, but do
        # not swallow KeyboardInterrupt/SystemExit as a bare except would.
        print(
            "ERROR: failed parser.get_structure(structure_id, pdb_fname) for "
            + target_pdb_fname)
        # Every failure path returns a pair so that callers unpacking the
        # result do not crash with a TypeError.
        return None, None

    model = structure[0]
    try:
        chain = model[target_chain]
    except KeyError:
        return None, None

    chain_lst = []
    for res in chain.get_residues():
        # A blank hetero flag (first field of the residue id) excludes
        # HETATM records.
        if is_aa(res) and res.get_id()[0] == ' ':
            if res.resname in ('UNK', 'ASX'):
                chain_lst.append('-')
            elif res.resname == 'SEC':
                chain_lst.append('U')
            else:
                chain_lst.append(Polypeptide.three_to_one(res.resname))
    return chain_lst, chain
def get_side_chain_vector(residue):
    """
    Average the unit vectors that run from the c-alpha atom to each atom of
    the side chain (all atoms after the first four of the residue's unpacked
    atom list). If the residue has no such atoms, as for glycine, the C and N
    atoms are used and the vectors are reversed.

    Returns (C-alpha coordinate vector, side chain unit vector) for the
    residue; None if it is not an amino acid, lacks CA, or has no side-chain
    atoms and is missing N or C.
    """
    if not is_aa(residue) or not residue.has_id('CA'):
        return None

    anchor = residue['CA'].get_coord()
    side_atoms = residue.get_unpacked_list()[4:]
    vectors = np.array([atom.get_coord() for atom in side_atoms])

    flip = False
    if len(vectors) < 1:
        if not residue.has_id('N') or not residue.has_id('C'):
            return None
        backbone = [residue['C'].get_coord(), residue['N'].get_coord()]
        vectors = np.array(backbone)
        flip = True

    vectors = vectors - anchor
    if flip:
        vectors = -vectors

    # Normalise each row to unit length before averaging.
    norms = np.sum(abs(vectors) ** 2, axis=-1) ** (1. / 2)
    units = vectors / norms[:, np.newaxis]
    return Vector(anchor), Vector(units.mean(axis=0))
def filter_chain(chain):
    """
    Delete from ``chain``, in place, every residue for which
    ``is_aa(residue, standard=True)`` is false, and return the same chain.
    """
    # Collect the ids first: removing children while iterating the chain
    # would disturb the iteration.
    rejected_ids = [
        residue.id for residue in chain
        if not is_aa(residue, standard=True)
    ]
    for residue_id in rejected_ids:
        del chain[residue_id]
    return chain
def extract_feature(self):
    """Attach a half-sphere-exposure feature vector to every residue.

    For each complex in ``self._database.complexes`` the ligand and receptor
    of its unbound formation are processed. Per protein, a feature matrix is
    computed and saved to ``<self._get_dir_name()><protein.name>.npy`` unless
    that file already exists; the matrix is then loaded from disk and row
    ``i`` is added to ``protein.residues[i]`` under
    ``Features.HALF_SPHERE_EXPOSURE``.

    The matrix has one row per residue and ``2 * len(standard_aa_names)``
    columns: the normalised "up" counts per amino-acid type followed by the
    normalised "down" counts.

    NOTE(review): this method was recovered from text that had lost its line
    breaks; the nesting below is reconstructed from statement order -- confirm
    against the original file.
    """
    counter = 0
    overall_time = datetime.now()
    number_of_amino_acids = len(standard_aa_names)
    # NOTE(review): the message has no "{}" placeholder, so the database name
    # passed to format() never appears in the output.
    print_info_nn(" >>> Adding Half Surface Exposure ... ".format(self._database.name))
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [protein_complex.unbound_formation.ligand, protein_complex.unbound_formation.receptor]
        for protein in proteins:
            hse_file = self._get_dir_name() + protein.name
            # Only compute when no cached .npy file exists for this protein.
            if not os.path.exists(hse_file + ".npy"):
                # Progress output: up to 15 names go through print_info_nn,
                # then one through print_info and the counter restarts
                # (presumably to break the output line -- TODO confirm).
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                number_of_residues = len(protein.biopython_residues)
                # un/dn: number of neighbours counted "up"/"down" per residue.
                un = np.zeros(number_of_residues)
                dn = np.zeros(number_of_residues)
                # uc/dc: per amino-acid-type counts, one column per residue.
                uc = np.zeros((number_of_amino_acids, number_of_residues))
                dc = np.zeros((number_of_amino_acids, number_of_residues))
                for index, residue in enumerate(protein.biopython_residues):
                    # presumably (CA position, side-chain direction) or None
                    # -- verify against get_side_chain_vector.
                    u = self.get_side_chain_vector(residue)
                    if u is None:
                        # No usable vector: mark the whole residue as missing.
                        un[index] = np.nan
                        dn[index] = np.nan
                        uc[:, index] = np.nan
                        dc[:, index] = np.nan
                    else:
                        # The residue's own type is counted once in both halves.
                        residue_index = self._residue_index_table[residue.get_resname()]
                        uc[residue_index, index] += 1
                        dc[residue_index, index] += 1
                        neighbours_indices = protein.residues[index].get_feature(Features.RESIDUE_NEIGHBOURHOOD)
                        # print neighbours_indices
                        for neighbour_index in neighbours_indices:
                            # -1 ends the neighbour list (looks like padding
                            # -- TODO confirm).
                            if neighbour_index == -1:
                                break
                            neighbour_residue = protein.biopython_residues[int(neighbour_index)]
                            if is_aa(neighbour_residue) and neighbour_residue.has_id('CA'):
                                neighbour_vector = neighbour_residue['CA'].get_vector()
                                residue_index = self._residue_index_table[neighbour_residue.get_resname()]
                                # Angle below 90 degrees between u[1] and the
                                # vector from u[0] to the neighbour's CA puts
                                # the neighbour in the "up" half; else "down".
                                if u[1].angle((neighbour_vector - u[0])) < np.pi / 2.0:
                                    un[index] += 1
                                    uc[residue_index, index] += 1
                                else:
                                    dn[index] += 1
                                    dc[residue_index, index] += 1
                # Normalise each residue's column by 1 + its neighbour count,
                # then transpose so rows correspond to residues.
                uc = (uc / (1.0 + un)).T
                dc = (dc / (1.0 + dn)).T
                hse_array = np.hstack((uc, dc))
                np.save(hse_file, hse_array)
            # Always read the features back from the cache file.
            hse = np.load(hse_file + ".npy")
            for i in range(len(protein.residues)):
                protein.residues[i].add_feature(Features.HALF_SPHERE_EXPOSURE, hse[i, :])
    print_info("took {0} seconds.".format((datetime.now() - overall_time).seconds))
def get_PDB(pdb_ids, valid_chains=None, chain_len=True, pdb_dir='.'):
    """Build a per-residue DataFrame from the PDB files of the given entries.

    Args:
        pdb_ids: sized iterable of PDB identifiers; each entry is read from
            ``<pdb_dir>/pdb<pdb_id>.ent``.
        valid_chains: optional container of ``(pdb_id, chain_id)`` pairs.
            When given, chains whose pair is not in it are skipped.
        chain_len: when true, add a ``CHAIN_LEN`` column holding the number
            of collected residues per (PDB_ID, MODEL_ID, CHAIN_ID).
        pdb_dir: directory containing the ``.ent`` files.

    Returns:
        DataFrame with columns PDB_ID, MODEL_ID, CHAIN_ID, (CHAIN_LEN,)
        RES_ID, RES_NAME, LIP_SCORE, LIP. Only residues of model 0 that pass
        ``is_aa`` are included; LIP_SCORE and LIP are initialised to 0.

    Raises:
        SystemExit: with status 1 when no residue was collected at all.
    """
    # Debug
    logging.debug('Directory for PDB files')
    logging.debug(pdb_dir)
    logging.debug('Chain length')
    logging.debug(chain_len)
    logging.debug('Valid chains')
    logging.debug(valid_chains)

    # One parser serves every file; there is no need to rebuild it per entry.
    parser = PDBParser(QUIET=True)

    # New list for residues; it will be turned into a DataFrame later
    ds_residues = []
    # Loop through every protein
    for pdb_id in pdb_ids:
        # Residues collected for the current protein
        residues = []
        # Get structure of the protein
        structure = parser.get_structure(
            pdb_id, pdb_dir + '/pdb{}.ent'.format(pdb_id))
        # We select only the 0-th model
        model = structure[0]
        # Loop through every chain of the model
        for chain in model:
            # Check chain is in valid chains
            if (valid_chains is not None) and ((pdb_id, chain.id) not in valid_chains):
                continue
            for residue in chain:
                # Do not take into account non-aminoacidic residues
                # (e.g. water molecules)
                if not is_aa(residue):
                    continue
                # Add an entry to the residues list
                residues.append((pdb_id, model.id, chain.id, residue.id[1],
                                 residue.get_resname(), 0, 0))
        if not residues:
            logging.warning('A protein {} has no valid residues'.format(pdb_id))
        ds_residues += residues

    if not ds_residues:
        logging.error('No valid aminoacidics found\nAborting...')
        # ``exit()`` is an interactive helper installed by the ``site`` module
        # and reports success (status 0); raise SystemExit with a failure
        # status instead.
        raise SystemExit(1)

    # Turn list into dataframe
    ds_residues = pd.DataFrame(ds_residues)
    # Debug
    logging.debug('PDB dataset')
    logging.debug(ds_residues)
    # Define dataset column names
    ds_residues.columns = ['PDB_ID', 'MODEL_ID', 'CHAIN_ID', 'RES_ID',
                           'RES_NAME', 'LIP_SCORE', 'LIP']

    # Check if chain lengths should be added
    if chain_len:
        chain_key = ['PDB_ID', 'MODEL_ID', 'CHAIN_ID']
        # Group and extract chain length
        ds_chain_len = ds_residues.groupby(chain_key).size().reset_index(name='CHAIN_LEN')
        # Add chain len to main dataframe
        ds_residues = ds_residues.merge(ds_chain_len, how='left', on=chain_key)
        # Reindex columns of the main dataframe: chain length after chain id
        ds_residues = ds_residues.reindex(
            ['PDB_ID', 'MODEL_ID', 'CHAIN_ID', 'CHAIN_LEN', 'RES_ID',
             'RES_NAME', 'LIP_SCORE', 'LIP'], axis=1)

    # Show some info about the dataset
    logging.debug("Numbers of proteins: {}".format(len(pdb_ids)))
    logging.debug("Numbers of residues: {}".format(len(ds_residues.PDB_ID)))
    # Return created dataset
    return ds_residues
def get_ensemble_dimension(ensemble):
    """Return the dimensions ``(N, M)`` of an ensemble.

    ``N`` is the number of residues in ``ensemble[0]`` for which ``is_aa``
    is true (hetero groups are filtered out); ``M`` is ``len(ensemble)``.
    """
    structure = ensemble[0]
    M = len(ensemble)
    # Count amino-acid residues across every chain of the first member.
    N = sum(
        1
        for chain in structure
        for residue in chain
        if is_aa(residue)
    )
    return N, M
def collect_coordinates(structure):
    '''
    Walk over all residues of a structure and gather, for each amino-acid
    residue, the result of process_residue (coordinates for the alpha-carbon
    and the sidechain center-of-mass).

    Returns a list of dictionaries, one per amino-acid residue, in the order
    the residues are yielded by structure.get_residues().
    '''
    return [
        process_residue(res)
        for res in structure.get_residues()
        if is_aa(res)
    ]
def test_rmsd_translated(self):
    """The RMSD between chain A and a copy of it shifted by (1, 0, 0) must be
    zero to 5 decimal places."""
    structure = self.get_test_structure()
    original_chain = structure[0]['A']

    # Shift every atom of a copy of the chain by 1 angstrom along x.
    shifted_chain = original_chain.copy()
    for atom in shifted_chain.get_atoms():
        atom.coord += (1, 0, 0)

    reference = ChainResidues(
        [res for res in original_chain if is_aa(res)], structure.id, 'A')
    moved = ChainResidues(
        [res for res in shifted_chain if is_aa(res)],
        f'moved_{structure.id}', 'A')

    # Assemble the RMSD computation from its dependencies.
    c_alpha_coords = GetCAlphaCoords()
    centroid = GetCentroid((c_alpha_coords, ))
    centered_coords = GetCenteredCAlphaCoords((c_alpha_coords, centroid))
    rotation_matrix = GetRotationMatrix((centered_coords, ))
    compute_rmsd = GetRMSD((centered_coords, rotation_matrix))

    self.assertAlmostEqual(0, compute_rmsd(reference, moved), places=5)
def get_aa_seq(chain):
    '''
    Build the amino acid sequence of a PDB chain object.

    Returns a tuple (aa_seq, residue_numbers): aa_seq is a Bio.SeqRecord
    (id 'pdb_seq', empty description) holding the one-letter codes of all
    residues that pass is_aa; residue_numbers lists, for the same residues,
    the sequence number followed by the stripped insertion code.
    '''
    amino_acids = [res for res in chain if is_aa(res)]

    letters = ''.join(
        SCOPData.protein_letters_3to1[res.resname] for res in amino_acids)

    residue_numbers = []
    for res in amino_acids:
        res_id = res.get_id()
        residue_numbers.append(str(res_id[1]) + res_id[2].strip())

    aa_seq = SeqRecord(Seq(letters), id='pdb_seq', description='')
    return aa_seq, residue_numbers
def getReferenceResidue(structure):
    '''Returns the last residue of chain A model 0 of the given structure.

    This function is a helper function that should not normally be called
    directly.

    Raises AssertionError if that residue is not an amino acid (is_aa is
    false for it).
    '''
    # If the following line doesn't work we're in trouble.
    # Likely initialize_res() wasn't called.
    resRef = structure[0]['A'].child_list[-1]

    # If the residue is not an amino acid we're in trouble.
    # Likely somebody is trying to append residues to an existing
    # structure that has non-amino-acid molecules in the chain.
    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not is_aa(resRef):
        raise AssertionError(
            'reference residue (last residue of chain A, model 0) '
            'is not an amino acid')

    return resRef
def get_sequence(self, chain_id):
    '''
    Input:
        self: Use Biopython.PDB structure which has been stored in an
            object variable
        chain_id : String (usually in ['A','B', 'C' ...]. The number of
            chains depends on the specific protein and the resulting
            structure)
    Return:
        The amino acid sequence (single-letter alphabet!) of the given
        chain (chain_id) in the first model of the structure, as a string.
        Only residues passing is_aa(..., standard=True) contribute.
    '''
    chain = self.structure[0][chain_id]
    return ''.join(
        three_to_one(residue.get_resname())
        for residue in chain
        if is_aa(residue, standard=True)
    )
def get_primary_tertiary(file_path, pdb_id):
    """Extract the encoded sequence and backbone coordinates from a PDB file.

    Only the first model is read. A residue is kept when ``is_aa`` accepts it
    and its ``resname`` is a key of ``aa_codes``.

    Args:
        file_path: path of the structure file handed to ``PDBParser``.
        pdb_id: identifier given to the parsed structure.

    Returns:
        ``(primary, tertiary, length)``: ``primary`` is an array of the
        ``aa_codes`` values of the kept residues, ``tertiary`` an array with
        one row per kept residue holding the N, CA and C coordinates stacked
        side by side, and ``length`` the number of kept residues.

    Raises:
        MyException: if the parser raises ``ValueError``, or if a kept
            residue lacks an N, CA or C atom. The original exception is
            attached as the cause.
    """
    # https://bioinformatics.stackexchange.com/questions/14101/extract-residue-sequence-from-pdb-file-in-biopython-but-open-to-recommendation
    p = PDBParser(QUIET=True)
    try:
        structure = p.get_structure(file=file_path, id=pdb_id)
    except ValueError as ve:
        write_out(ve, file_path)
        raise MyException(ve) from ve

    primary = []
    tertiary = []
    first_model = structure[0]
    for chain in first_model:
        chain_id = str(chain.get_id())
        for residue in chain:
            if not (is_aa(residue) and residue.resname in aa_codes):
                continue
            primary.append(aa_codes[residue.resname])
            try:
                n = residue['N'].get_coord()
                ca = residue['CA'].get_coord()
                c = residue['C'].get_coord()
            except KeyError as err:
                # An incomplete backbone aborts the whole file, so primary
                # and tertiary never get out of step.
                write_out('> KeyError in ', '>chain:' + chain_id,
                          residue.resname, residue.get_id())
                raise MyException('KeyError for :' + residue.resname) from err
            tertiary.append(np.hstack([n, ca, c]))

    length = len(primary)
    return np.asarray(primary), np.asarray(tertiary), length
def run(self, struct: Model) -> List[Chain]:
    """Return the chains of ``struct`` that contain at least 50 residues for
    which ``is_aa`` is true, in the order ``struct.get_chains()`` yields them.
    """
    selected = []
    for chain in struct.get_chains():
        amino_acid_count = sum(is_aa(residue) for residue in chain)
        if amino_acid_count >= 50:
            selected.append(chain)
    return selected
def get_short_peptide_ligands(struct: Entity,
                              peptide_length_limit: int) -> Iterator[Chain]:
    """Lazily yield the chains of ``struct`` whose number of residues passing
    ``is_aa`` does not exceed ``peptide_length_limit``.
    """

    def _is_short(chain):
        amino_acid_count = sum(is_aa(residue) for residue in chain)
        return amino_acid_count <= peptide_length_limit

    return filter(_is_short, struct.get_chains())
def extract_feature(self):
    """Compute, cache and attach half-sphere-exposure features.

    Iterates over the ligand and receptor of the unbound formation of every
    complex in ``self._database.complexes``. For a protein without a cached
    ``<self._get_dir_name()><protein.name>.npy`` file the feature matrix is
    computed and saved; in every case the matrix is then loaded from that
    file and row ``i`` is added to ``protein.residues[i]`` as
    ``Features.HALF_SPHERE_EXPOSURE``.

    Each row has ``2 * len(standard_aa_names)`` values: normalised counts per
    amino-acid type for the "up" half, followed by those for the "down" half.

    NOTE(review): the nesting of this method is reconstructed from text that
    had lost its line breaks -- confirm against the original file.
    """
    counter = 0
    overall_time = datetime.now()
    number_of_amino_acids = len(standard_aa_names)
    # NOTE(review): the format string contains no placeholder, so the
    # database name is not actually printed.
    print_info_nn(" >>> Adding Half Surface Exposure ... ".format(
        self._database.name))
    if not os.path.exists(self._get_dir_name()):
        os.makedirs(self._get_dir_name())
    for complex_name in self._database.complexes.keys():
        protein_complex = self._database.complexes[complex_name]
        proteins = [
            protein_complex.unbound_formation.ligand,
            protein_complex.unbound_formation.receptor
        ]
        for protein in proteins:
            hse_file = self._get_dir_name() + protein.name
            # Skip the computation when a cached result is already on disk.
            if not os.path.exists(hse_file + ".npy"):
                # Progress output; after 15 names one name is sent through
                # print_info and the counter is reset (presumably to start a
                # new output line -- TODO confirm).
                counter += 1
                if counter <= 15:
                    print_info_nn("{0}, ".format(protein.name))
                else:
                    counter = 0
                    print_info("{0}".format(protein.name))
                number_of_residues = len(protein.biopython_residues)
                # Neighbour totals per residue for the up / down half.
                un = np.zeros(number_of_residues)
                dn = np.zeros(number_of_residues)
                # Counts per amino-acid type (rows) and residue (columns).
                uc = np.zeros((number_of_amino_acids, number_of_residues))
                dc = np.zeros((number_of_amino_acids, number_of_residues))
                for index, residue in enumerate(
                        protein.biopython_residues):
                    # presumably a (CA position, side-chain direction) pair,
                    # or None -- verify against get_side_chain_vector.
                    u = self.get_side_chain_vector(residue)
                    if u is None:
                        # Residue cannot be oriented: all its features are NaN.
                        un[index] = np.nan
                        dn[index] = np.nan
                        uc[:, index] = np.nan
                        dc[:, index] = np.nan
                    else:
                        # Count the residue's own type once in each half.
                        residue_index = self._residue_index_table[
                            residue.get_resname()]
                        uc[residue_index, index] += 1
                        dc[residue_index, index] += 1
                        neighbours_indices = protein.residues[
                            index].get_feature(
                                Features.RESIDUE_NEIGHBOURHOOD)
                        # print neighbours_indices
                        for neighbour_index in neighbours_indices:
                            # A value of -1 stops the scan (looks like
                            # padding of the neighbour list -- TODO confirm).
                            if neighbour_index == -1:
                                break
                            neighbour_residue = protein.biopython_residues[
                                int(neighbour_index)]
                            if is_aa(neighbour_residue
                                     ) and neighbour_residue.has_id('CA'):
                                neighbour_vector = neighbour_residue[
                                    'CA'].get_vector()
                                residue_index = self._residue_index_table[
                                    neighbour_residue.get_resname()]
                                # "Up" when the neighbour's CA lies within 90
                                # degrees of u[1] as seen from u[0].
                                if u[1].angle((neighbour_vector -
                                               u[0])) < np.pi / 2.0:
                                    un[index] += 1
                                    uc[residue_index, index] += 1
                                else:
                                    dn[index] += 1
                                    dc[residue_index, index] += 1
                # Divide every residue's column by 1 + its neighbour total and
                # transpose so that each row describes one residue.
                uc = (uc / (1.0 + un)).T
                dc = (dc / (1.0 + dn)).T
                hse_array = np.hstack((uc, dc))
                np.save(hse_file, hse_array)
            # The features are always taken from the file on disk.
            hse = np.load(hse_file + ".npy")
            for i in range(len(protein.residues)):
                protein.residues[i].add_feature(
                    Features.HALF_SPHERE_EXPOSURE, hse[i, :])
    print_info("took {0} seconds.".format(
        (datetime.now() - overall_time).seconds))
def _is_valid(r):
    """Return ``is_aa(r)``: whether ``r`` counts as an amino-acid residue."""
    return is_aa(r)
def notlobset(ID, ht): fig = plt.figure() ht_ID = ht.filter(ht.swiss == ID) ht_ID = ht_ID.transmute(aa_orig = ht_ID.aa_change[0], aa_var = ht_ID.aa_change[-1]) ht_ID = ht_ID.annotate(i = ht_ID.aa_orig + hl.str(hl.int32(ht_ID.aa_num))) gt = ht_ID.to_pandas() gt['i'] = gt['i'].astype('str') gt = gt.set_index('i', drop = True) # make a request result = swiss_request(ID, mode = 'json', provider = 'pdb', template = '') structures = [] files = [] chains = [] if result: n_results = len(result["result"]["structures"]) print(n_results) if n_results > 0: for n in range(0,n_results-1): template = result["result"]["structures"][n]["template"] #requests.get() #pdbpath = swiss_request(ID, mode = 'pdb', provider = 'pdb', template = template) zip_pdbpath = getpdb(ID, template) pdbpath = zip_pdbpath[:-2] pdb = open(pdbpath, "wb") with gzip.open(zip_pdbpath, "rb") as f: bindata = f.read() pdb.write(bindata) pdb.close() structure = parser.get_structure(template, pdbpath) os.remove(pdbpath) match_chain = whichchain(ID, template) for model in structure: for chain_id in match_chain: ax = fig.add_subplot(111, projection='3d') ax.set_aspect('equal') """ this is the level at which we will run dbscan""" chain = model[chain_id] print(chain_id) pdbstruct = [] for residue in chain: if not is_aa(residue): continue atom = residue["CA"] # uppercase protein letter maps resi = Bio.Data.SCOPData.protein_letters_3to1[residue.get_resname()] pdbrow = [resi + str(residue.get_id()[1]), residue.get_id()[1]] + atom.get_coord().tolist() pdbstruct.append(pdbrow) df = pd.DataFrame(data=pdbstruct, columns = ['i','aa','x','y','z']) df['i'] = df['i'].astype('str') df = df.set_index('i', drop = True) mldf = df.join(gt) mldf = mldf.drop(['aa_num','Gene','swiss','locus.contig','locus.position','aa_orig','aa_var','alleles'], axis = 1) mldf = mldf.sort_values('aa') mldf['AC'] = mldf['AC'].fillna(0) mldf = 
mldf.groupby(mldf.index).agg({'aa':np.mean,'x':np.mean,'y':np.mean,'z':np.mean,'AC':np.sum}).sort_values('aa').reset_index('i') mldf = mldf.drop(['i'],axis=1) mldf2 = mldf.drop(['AC'],axis=1) mlmat = mldf2.as_matrix() print(mlmat) result = sklearn.cluster.DBSCAN().fit_predict(mlmat,sample_weight = mldf['AC']) print(result) """
def is_aa(self):
    """Return the result of calling ``is_aa`` on this object."""
    # NOTE(review): assumes this is a method, so the inner name resolves to
    # the module-level ``is_aa`` rather than to this method itself; at module
    # scope this definition would recurse without end -- confirm.
    return is_aa(self)
if args.verbose:
    print("Verbose mode enabled. Parsing PDB {}.".format(pdb_id))

# Torsion angles collected over consecutive residue pairs.
phis = []
psis = []

if args.chain:
    # plot residues from a single chain
    if args.verbose:
        print("Using residues from chain {}.".format(args.chain))
    for model in structure:
        # using list() to avoid weird residue indices
        lchain = list(model[args.chain])
        # Walk over (residue, next residue) pairs. Pairing with zip stops at
        # the last residue instead of indexing one past the end of the list,
        # which raised IndexError when the chain ended on an amino acid.
        for res, n in zip(lchain, lchain[1:]):
            # stop calculating torsions when AAs run out
            if not is_aa(n):
                break
            # only calculate torsion if next residue is also an AA:
            # make groups of relevant atoms to calculate angle
            phi_atoms = (res['C'], n['N'], n['CA'], n['C'])
            psi_atoms = (res['N'], res['CA'], res['C'], n['N'])
            # make group of coordinates of relevant atoms
            phi_coords = (a.get_coord() for a in phi_atoms)
            psi_coords = (a.get_coord() for a in psi_atoms)
            # calculate torsions from groups of coordinates and store
            phis.append(torsion(*phi_coords))
            psis.append(torsion(*psi_coords))
else:
    # plot residues from all chains
    if args.verbose:
        print("Using all residues.")
ax1.plot(real_df['aaNum'],real_df['MTR']) ax2.plot(mtr_df['Protein_position'], mtr_df['MTR']) plt.savefig('Orig_MTR_result.png') plt.close() real_df.to_csv('Orig_MTR_result.csv', index=False) # create the pdb structure data structure = parser.get_structure('h2A', pdbpath) model = structure[0] chain = model['A'] pdbstruct = [] for residue in chain: if not is_aa(residue): continue atom = residue["CA"] # using the alpha carbon # uppercase protein letter maps resi = Bio.Data.SCOPData.protein_letters_3to1[residue.get_resname()] pdbrow = [residue.get_id()[1]] + atom.get_coord().tolist() pdbstruct.append(pdbrow) pdb_df = pd.DataFrame(data=pdbstruct, columns = ['aaNum','x','y','z']) all_df = pdb_df.join(agg_df.set_index('aaNum'), on='aaNum') all_df = all_df.fillna(0) # calculate MTR print('calculating MTR score...') def calc_MTR_sphere_window(row): """