def get_contact_map(self, chain_id):
        '''
            Build the C-alpha contact map of one chain.

            Input:
                self: Use Biopython.PDB structure which has been stored in an object variable
                chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
            Return:
                A square numpy integer array with one row/column per standard
                amino acid of the chain (model 0), in chain order. Entry
                [i][j] is the C-alpha distance between residues i and j,
                truncated to an integer when stored.
            Note:
                Every standard residue is looked up by its 'CA' atom; a residue
                without one makes that lookup fail (Bio.PDB raises KeyError).
        '''
        # Filter once so the matrix size and both loop axes are guaranteed to
        # agree (non-standard residues and hetero groups are ignored).
        residues = [
            residue for residue in self.structure[0][chain_id]
            if is_aa(residue, standard=True)
        ]
        length = len(residues)
        contact_map = np.zeros((length, length), dtype=np.int64)
        for row, residue_row in enumerate(residues):
            for col, residue_col in enumerate(residues):
                # Subtracting two Bio.PDB atoms yields their distance; the
                # integer dtype of the array drops the fractional part.
                contact_map[row][col] = residue_row['CA'] - residue_col['CA']

        # `np.int` was removed from NumPy (1.24); the builtin `int` is the
        # documented drop-in replacement for that alias.
        return contact_map.astype(int)
 def get_side_chain_vector(residue):
     """
     Compute a side-chain direction for one residue.

     The direction is the mean of the unit vectors pointing from the C-alpha
     atom to every atom after the first four in the residue's unpacked atom
     list. When there are no such atoms (e.g. glycine), the negated unit
     vectors towards the C and N atoms are averaged instead.

     Returns (C-alpha coordinate Vector, mean direction Vector), or None when
     the residue is not an amino acid, has no CA atom, or has neither side
     chain atoms nor both N and C atoms.
     """
     if not (is_aa(residue) and residue.has_id('CA')):
         return None

     ca_coord = residue['CA'].get_coord()
     side_atoms = residue.get_unpacked_list()[4:]
     offsets = np.array([atom.get_coord() for atom in side_atoms])

     # No side-chain atoms: fall back to the backbone C and N atoms.
     backbone_fallback = len(offsets) < 1
     if backbone_fallback:
         if not (residue.has_id('N') and residue.has_id('C')):
             return None
         offsets = np.array(
             [residue['C'].get_coord(), residue['N'].get_coord()])

     offsets = offsets - ca_coord
     if backbone_fallback:
         # Point away from the backbone atoms rather than towards them.
         offsets = -offsets

     # Normalise each row to unit length before averaging.
     lengths = np.sum(abs(offsets)**2, axis=-1)**(1. / 2)
     unit_vectors = offsets / lengths[:, np.newaxis]
     return (Vector(ca_coord), Vector(unit_vectors.mean(axis=0)))
Exemplo n.º 3
0
def modeller_get_chain_seqs(target_protein, target_chain, version):
    """
    Load one chain of a target PDB file and spell out its residues.

    The file is looked up at
    ``<PATHS.modeller>/<target_protein><target_chain>/v<version>_pdb<target_protein>.ent``.

    Returns a pair ``(chain_lst, chain)``:
        chain_lst: list of one-letter codes for the amino-acid residues whose
            hetero flag is blank; 'UNK' and 'ASX' become '-', 'SEC' becomes 'U'.
        chain: the chain object taken from model 0 of the parsed structure.
    Every failure (file missing, parse error, chain not in the model) yields
    ``(None, None)`` so callers can always unpack two values.
    """
    target_path = path.join(PATHS.modeller, target_protein + target_chain)
    target_pdb_fname = 'v%s_pdb' % version + target_protein + '.ent'

    pdb_file_path = path.join(target_path, target_pdb_fname)
    if not path.isfile(pdb_file_path):
        LOGGER.warning('File %s not found' % pdb_file_path)
        return None, None

    parser = PDBParser(PERMISSIVE=1, QUIET=True)
    structure_id = path.basename(target_pdb_fname).split('.')[0]
    try:
        structure = parser.get_structure(structure_id, pdb_file_path)
    except Exception:
        # Best effort: any parser failure is reported and turned into the
        # "nothing found" result. `Exception` (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        print(
            "ERROR: failed parser.get_structure(structure_id, pdb_fname) for "
            + target_pdb_fname)
        return None, None

    model = structure[0]
    try:
        chain = model[target_chain]
    except KeyError:
        return None, None

    chain_lst = []
    for res in chain.get_residues():
        # Skip hetero residues and anything that is not an amino acid.
        if not (is_aa(res) and res.get_id()[0] == ' '):
            continue
        if res.resname in ('UNK', 'ASX'):
            chain_lst.append('-')
        elif res.resname == 'SEC':
            chain_lst.append('U')
        else:
            chain_lst.append(Polypeptide.three_to_one(res.resname))

    return chain_lst, chain
 def get_side_chain_vector(residue):
     """
     Return the C-alpha position and an averaged side-chain direction.

     Unit vectors from the C-alpha atom to each atom past the first four of
     the residue's unpacked atom list are averaged. If no such atoms exist
     (e.g. glycine), the C and N atoms are used and the vectors are negated.

     Returns a tuple (Vector of the C-alpha coordinates, Vector of the mean
     unit direction). Returns None if the residue is not an amino acid, lacks
     a CA atom, or offers neither side-chain atoms nor both N and C.
     """
     usable = is_aa(residue) and residue.has_id('CA')
     if not usable:
         return None

     origin = residue['CA'].get_coord()
     targets = np.array(
         [atom.get_coord() for atom in residue.get_unpacked_list()[4:]])

     flip = False
     if len(targets) < 1:
         # Nothing beyond the backbone: use C and N, pointing the other way.
         if not (residue.has_id('N') and residue.has_id('C')):
             return None
         targets = np.array(
             [residue['C'].get_coord(), residue['N'].get_coord()])
         flip = True

     directions = targets - origin
     if flip:
         directions = -directions

     # Row-wise Euclidean length, then scale every row to unit length.
     norms = np.sum(abs(directions) ** 2, axis=-1) ** (1. / 2)
     unit_directions = directions / norms[:, np.newaxis]
     return (Vector(origin), Vector(unit_directions.mean(axis=0)))
def filter_chain(chain):
    """
    Strip every residue that is not a standard amino acid from ``chain``.

    The chain is modified in place and also returned. Ids are collected first
    and deleted afterwards so the chain is never mutated while it is iterated.
    """
    unwanted_ids = [
        residue.id for residue in chain
        if not is_aa(residue, standard=True)
    ]
    for residue_id in unwanted_ids:
        del chain[residue_id]
    return chain
 def extract_feature(self):
     """
     Attach a half-sphere-exposure feature row to every residue.

     For the ligand and receptor of each complex's unbound formation, a
     per-protein array is read from ``<dir><protein.name>.npy``; when that
     file does not exist it is computed and saved first. The array has one
     row per residue and two column groups (up half, down half), each holding
     per-amino-acid counts divided by ``1 + number of neighbours in that
     half``. Residues without a side-chain vector get NaN rows. Row ``i`` is
     stored on ``protein.residues[i]`` as ``Features.HALF_SPHERE_EXPOSURE``.

     Progress (protein names and total elapsed seconds) is printed.
     """
     started_at = datetime.now()
     shown_on_line = 0
     aa_kinds = len(standard_aa_names)
     print_info_nn(" >>> Adding Half Surface Exposure ... ".format(self._database.name))
     if not os.path.exists(self._get_dir_name()):
         os.makedirs(self._get_dir_name())

     for complex_name in self._database.complexes.keys():
         protein_complex = self._database.complexes[complex_name]
         partners = (
             protein_complex.unbound_formation.ligand,
             protein_complex.unbound_formation.receptor,
         )
         for protein in partners:
             cache_stem = self._get_dir_name() + protein.name
             cache_path = cache_stem + ".npy"

             if not os.path.exists(cache_path):
                 # Progress output: names share a line, broken after 15.
                 shown_on_line += 1
                 if shown_on_line > 15:
                     shown_on_line = 0
                     print_info("{0}".format(protein.name))
                 else:
                     print_info_nn("{0}, ".format(protein.name))

                 total = len(protein.biopython_residues)
                 up_count = np.zeros(total)
                 down_count = np.zeros(total)
                 up_comp = np.zeros((aa_kinds, total))
                 down_comp = np.zeros((aa_kinds, total))

                 for index, residue in enumerate(protein.biopython_residues):
                     frame = self.get_side_chain_vector(residue)
                     if frame is None:
                         # No usable side-chain direction: mark as missing.
                         up_count[index] = np.nan
                         down_count[index] = np.nan
                         up_comp[:, index] = np.nan
                         down_comp[:, index] = np.nan
                         continue

                     # The residue itself is counted in both halves.
                     own_row = self._residue_index_table[residue.get_resname()]
                     up_comp[own_row, index] += 1
                     down_comp[own_row, index] += 1

                     neighbourhood = protein.residues[index].get_feature(
                         Features.RESIDUE_NEIGHBOURHOOD)
                     for neighbour_index in neighbourhood:
                         # -1 ends the list of neighbours.
                         if neighbour_index == -1:
                             break
                         neighbour = protein.biopython_residues[int(neighbour_index)]
                         if not (is_aa(neighbour) and neighbour.has_id('CA')):
                             continue
                         neighbour_ca = neighbour['CA'].get_vector()
                         neighbour_row = self._residue_index_table[neighbour.get_resname()]
                         # Angle below 90 degrees to the side-chain direction
                         # puts the neighbour in the "up" half.
                         if frame[1].angle((neighbour_ca - frame[0])) < np.pi / 2.0:
                             counts, composition = up_count, up_comp
                         else:
                             counts, composition = down_count, down_comp
                         counts[index] += 1
                         composition[neighbour_row, index] += 1

                 up_profile = (up_comp / (1.0 + up_count)).T
                 down_profile = (down_comp / (1.0 + down_count)).T
                 # np.save appends the ".npy" suffix to the stem.
                 np.save(cache_stem, np.hstack((up_profile, down_profile)))

             hse = np.load(cache_path)
             for position, target in enumerate(protein.residues):
                 target.add_feature(Features.HALF_SPHERE_EXPOSURE, hse[position, :])

     print_info("took {0} seconds.".format((datetime.now() - started_at).seconds))
Exemplo n.º 7
0
def get_PDB(pdb_ids, valid_chains=None, chain_len=True, pdb_dir='.'):
    """
    Build a residue-level DataFrame from a set of PDB files.

    Parameters:
        pdb_ids: iterable of PDB identifiers; each is read from
            ``<pdb_dir>/pdb<pdb_id>.ent``.
        valid_chains: optional container of ``(pdb_id, chain_id)`` pairs; when
            given, chains not listed in it are skipped.
        chain_len: when true, a ``CHAIN_LEN`` column (number of collected
            residues per PDB/model/chain) is inserted after ``CHAIN_ID``.
        pdb_dir: directory containing the ``.ent`` files.

    Returns:
        DataFrame with one row per amino-acid residue of model 0 and columns
        PDB_ID, MODEL_ID, CHAIN_ID, [CHAIN_LEN,] RES_ID, RES_NAME, LIP_SCORE,
        LIP. LIP_SCORE and LIP are initialised to 0.

    Raises:
        SystemExit: with status 1 when no residue at all was collected.
    """
    # Local import so this function does not depend on a module-level import.
    import sys

    # Debug
    logging.debug('Directory for PDB files')
    logging.debug(pdb_dir)
    logging.debug('Chain length')
    logging.debug(chain_len)
    logging.debug('Valid chains')
    logging.debug(valid_chains)

    columns = ['PDB_ID', 'MODEL_ID', 'CHAIN_ID', 'RES_ID', 'RES_NAME', 'LIP_SCORE', 'LIP']
    # Rows for all proteins; turned into a DataFrame later
    ds_residues = []
    for pdb_id in pdb_ids:
        # Rows of the current protein only
        residues = []
        structure = PDBParser(QUIET=True).get_structure(pdb_id, pdb_dir + '/pdb{}.ent'.format(pdb_id))
        # We select only the 0-th model
        model = structure[0]
        for chain in model:
            # Skip chains that were not explicitly allowed
            if (valid_chains is not None) and ((pdb_id, chain.id) not in valid_chains):
                continue
            for residue in chain:
                # Do not take into account non-aminoacidic residues (e.g. water molecules)
                if not is_aa(residue):
                    continue
                residues.append((pdb_id, model.id, chain.id, residue.id[1], residue.get_resname(), 0, 0))
        if not residues:
            logging.warning('A protein {} has no valid residues'.format(pdb_id))
        ds_residues.extend(residues)

    if not ds_residues:
        logging.error('No valid aminoacidics found\nAborting...')
        # sys.exit is the supported way to terminate a program (the bare
        # `exit` helper is only injected by the `site` module), and a non-zero
        # status tells the caller that this is a failure.
        sys.exit(1)

    # Turn list into dataframe, naming the columns right away
    ds_residues = pd.DataFrame(ds_residues, columns=columns)
    # Debug
    logging.debug('PDB dataset')
    logging.debug(ds_residues)

    if chain_len:
        group_keys = ['PDB_ID', 'MODEL_ID', 'CHAIN_ID']
        # Number of collected residues per chain
        ds_chain_len = ds_residues.groupby(group_keys).size().reset_index(name='CHAIN_LEN')
        ds_residues = ds_residues.merge(ds_chain_len, how='left', on=group_keys)
        # Reorder columns: chain length right after chain id
        ds_residues = ds_residues.reindex(['PDB_ID', 'MODEL_ID', 'CHAIN_ID', 'CHAIN_LEN', 'RES_ID', 'RES_NAME', 'LIP_SCORE', 'LIP'], axis=1)

    # Show some info about the dataset
    logging.debug("Numbers of proteins: {}".format(len(pdb_ids)))
    logging.debug("Numbers of residues: {}".format(len(ds_residues.PDB_ID)))
    return ds_residues
def get_ensemble_dimension(ensemble):
    """
    Return ``(N, M)`` for an ensemble.

    N is the number of amino-acid residues (hetero groups are filtered out by
    ``is_aa``) found in the first element of ``ensemble``; M is the number of
    elements in ``ensemble``.
    """
    first_member = ensemble[0]
    member_count = len(ensemble)
    residue_count = sum(
        1
        for chain in first_member
        for residue in chain
        if is_aa(residue)
    )
    return residue_count, member_count
Exemplo n.º 9
0
def collect_coordinates(structure):
    '''
    Run ``process_residue`` on every amino-acid residue of ``structure``.

    Returns a list with one ``process_residue`` result per amino-acid residue,
    in the order ``structure.get_residues()`` yields them; residues rejected
    by ``is_aa`` are skipped.
    '''
    return [
        process_residue(residue)
        for residue in structure.get_residues()
        if is_aa(residue)
    ]
Exemplo n.º 10
0
    def test_rmsd_translated(self):
        """RMSD between chain A and a copy shifted by (1, 0, 0) is ~0."""
        structure = self.get_test_structure()
        shifted_chain = structure[0]['A'].copy()

        # Translate every atom of the copy by 1 along x.
        for atom in shifted_chain.get_atoms():
            atom.coord += (1, 0, 0)

        reference_residues = [res for res in structure[0]['A'] if is_aa(res)]
        reference = ChainResidues(reference_residues, structure.id, 'A')
        shifted_residues = [res for res in shifted_chain if is_aa(res)]
        shifted = ChainResidues(shifted_residues, f'moved_{structure.id}', 'A')

        # Wire up the analyzers, each receiving the ones it depends on.
        ca_coords = GetCAlphaCoords()
        centroid = GetCentroid((ca_coords, ))
        centered_coords = GetCenteredCAlphaCoords((ca_coords, centroid))
        rotation = GetRotationMatrix((centered_coords, ))
        compute_rmsd = GetRMSD((centered_coords, rotation))

        self.assertAlmostEqual(0, compute_rmsd(reference, shifted), places=5)
Exemplo n.º 11
0
def get_aa_seq(chain):
    '''
    Extract the amino acid sequence of a PDB chain object.

    Returns a tuple ``(aa_seq, residue_numbers)``:
        aa_seq: Bio.SeqRecord (id 'pdb_seq', empty description) holding the
            one-letter sequence of the chain's amino-acid residues.
        residue_numbers: list of strings, one per residue in the sequence,
            made of the residue number followed by its insertion code (if any).
    '''
    amino_acids = [residue for residue in chain if is_aa(residue)]
    letters = [
        SCOPData.protein_letters_3to1[residue.resname]
        for residue in amino_acids
    ]
    residue_numbers = [
        str(residue.get_id()[1]) + residue.get_id()[2].strip()
        for residue in amino_acids
    ]
    sequence = Seq(''.join(letters))
    aa_seq = SeqRecord(sequence, id='pdb_seq', description='')
    return aa_seq, residue_numbers
Exemplo n.º 12
0
def getReferenceResidue(structure):
    '''Return the last residue of chain A in model 0 of ``structure``.

    Helper that is not normally meant to be called directly. The lookup
    fails if model 0 / chain 'A' is missing or empty (likely
    initialize_res() was not called). An AssertionError is raised when the
    last residue is not an amino acid, e.g. when residues are being appended
    to a chain that contains non-amino-acid molecules.'''
    chain_a = structure[0]['A']
    last_residue = chain_a.child_list[-1]

    # Appending only makes sense after an amino acid.
    assert is_aa(last_residue)

    return last_residue
    def get_sequence(self, chain_id):
        '''
            Input:
                self: uses the Biopython.PDB structure stored in self.structure
                chain_id  : String identifying the chain (usually 'A', 'B', 'C', ...)
            Return:
                The amino acid sequence of the given chain in the first model,
                as a string in the single-letter alphabet. Only standard
                amino acids contribute to the sequence.
        '''
        chain = self.structure[0][chain_id]
        return ''.join(
            three_to_one(residue.get_resname())
            for residue in chain
            if is_aa(residue, standard=True)
        )
Exemplo n.º 14
0
def get_primary_tertiary(file_path, pdb_id):
    """
    Read a PDB file and return its sequence codes and backbone coordinates.

    Only model 0 is used. A residue contributes when ``is_aa`` accepts it and
    its residue name is a key of ``aa_codes``.

    Returns a tuple ``(primary, tertiary, length)``:
        primary: array of the ``aa_codes`` values of the kept residues.
        tertiary: array with one row per kept residue, the N, CA and C
            coordinates concatenated in that order.
        length: number of kept residues.

    Raises:
        MyException: when the parser raises ValueError, or when a kept residue
            lacks an N, CA or C atom. The original error is attached as the
            cause; details are also passed to ``write_out``.
    """
    # https://bioinformatics.stackexchange.com/questions/14101/extract-residue-sequence-from-pdb-file-in-biopython-but-open-to-recommendation
    p = PDBParser(QUIET=True)

    try:
        structure = p.get_structure(file=file_path, id=pdb_id)
    except ValueError as ve:
        write_out(ve, file_path)
        raise MyException(ve) from ve

    primary = []
    tertiary = []

    first_model = structure[0]
    for chain in first_model:
        chain_id = str(chain.get_id())
        for residue in chain:
            if not (is_aa(residue) and residue.resname in aa_codes):
                continue

            try:
                n = residue['N'].get_coord()
                ca = residue['CA'].get_coord()
                c = residue['C'].get_coord()
            except KeyError as err:
                write_out('> KeyError in ', '>chain:' + chain_id,
                          residue.resname, residue.get_id())
                raise MyException('KeyError for :' + residue.resname) from err

            # Append to both lists together so they always stay aligned.
            primary.append(aa_codes[residue.resname])
            tertiary.append(np.hstack([n, ca, c]))

    length = len(primary)
    return np.asarray(primary), np.asarray(tertiary), length
 def run(self, struct: Model) -> List[Chain]:
     """Return the chains of ``struct`` that hold at least 50 amino-acid residues."""

     def amino_acid_count(chain):
         # is_aa yields a truth value per residue; summing counts the hits.
         return sum(is_aa(residue) for residue in chain)

     return [
         chain for chain in struct.get_chains()
         if amino_acid_count(chain) >= 50
     ]
def get_short_peptide_ligands(struct: Entity,
                              peptide_length_limit: int) -> Iterator[Chain]:
    """
    Lazily yield the chains of ``struct`` that are short peptides.

    A chain qualifies when its number of amino-acid residues is at most
    ``peptide_length_limit``. The result is a ``filter`` iterator.
    """

    def is_short_peptide(chain):
        amino_acid_count = sum(is_aa(residue) for residue in chain)
        return amino_acid_count <= peptide_length_limit

    return filter(is_short_peptide, struct.get_chains())
 def extract_feature(self):
     """
     Compute (or load) half-sphere-exposure rows and store them on residues.

     The ligand and receptor of every complex's unbound formation are
     processed. The per-protein array lives in ``<dir><protein.name>.npy`` and
     is only computed when that file is missing. Each row belongs to one
     residue and concatenates the "up" and "down" per-amino-acid counts, each
     divided by ``1 + number of neighbours in that half``; residues without a
     side-chain vector produce NaN rows. Row ``i`` is added to
     ``protein.residues[i]`` under ``Features.HALF_SPHERE_EXPOSURE``.

     Protein names and the total elapsed seconds are printed as progress.
     """
     started_at = datetime.now()
     names_on_line = 0
     aa_kinds = len(standard_aa_names)
     print_info_nn(" >>> Adding Half Surface Exposure ... ".format(
         self._database.name))
     if not os.path.exists(self._get_dir_name()):
         os.makedirs(self._get_dir_name())

     for complex_name in self._database.complexes.keys():
         protein_complex = self._database.complexes[complex_name]
         partners = (
             protein_complex.unbound_formation.ligand,
             protein_complex.unbound_formation.receptor,
         )
         for protein in partners:
             cache_stem = self._get_dir_name() + protein.name
             cache_path = cache_stem + ".npy"

             if not os.path.exists(cache_path):
                 # Up to 15 names share a line; the next one ends the line.
                 names_on_line += 1
                 if names_on_line > 15:
                     names_on_line = 0
                     print_info("{0}".format(protein.name))
                 else:
                     print_info_nn("{0}, ".format(protein.name))

                 total = len(protein.biopython_residues)
                 up_count = np.zeros(total)
                 down_count = np.zeros(total)
                 up_comp = np.zeros((aa_kinds, total))
                 down_comp = np.zeros((aa_kinds, total))

                 for index, residue in enumerate(
                         protein.biopython_residues):
                     frame = self.get_side_chain_vector(residue)
                     if frame is None:
                         # No side-chain direction available: flag as NaN.
                         up_count[index] = np.nan
                         down_count[index] = np.nan
                         up_comp[:, index] = np.nan
                         down_comp[:, index] = np.nan
                         continue

                     # Count the residue itself in both halves.
                     own_row = self._residue_index_table[
                         residue.get_resname()]
                     up_comp[own_row, index] += 1
                     down_comp[own_row, index] += 1

                     neighbourhood = protein.residues[index].get_feature(
                         Features.RESIDUE_NEIGHBOURHOOD)
                     for neighbour_index in neighbourhood:
                         # A -1 entry terminates the neighbour list.
                         if neighbour_index == -1:
                             break
                         neighbour = protein.biopython_residues[
                             int(neighbour_index)]
                         if not (is_aa(neighbour)
                                 and neighbour.has_id('CA')):
                             continue
                         neighbour_ca = neighbour['CA'].get_vector()
                         neighbour_row = self._residue_index_table[
                             neighbour.get_resname()]
                         # Less than 90 degrees from the side-chain
                         # direction means the "up" half.
                         in_up_half = frame[1].angle(
                             (neighbour_ca - frame[0])) < np.pi / 2.0
                         if in_up_half:
                             up_count[index] += 1
                             up_comp[neighbour_row, index] += 1
                         else:
                             down_count[index] += 1
                             down_comp[neighbour_row, index] += 1

                 up_profile = (up_comp / (1.0 + up_count)).T
                 down_profile = (down_comp / (1.0 + down_count)).T
                 # np.save adds the ".npy" suffix to the stem on its own.
                 np.save(cache_stem, np.hstack((up_profile, down_profile)))

             hse = np.load(cache_path)
             for position, target in enumerate(protein.residues):
                 target.add_feature(
                     Features.HALF_SPHERE_EXPOSURE, hse[position, :])

     print_info("took {0} seconds.".format(
         (datetime.now() - started_at).seconds))
Exemplo n.º 18
0
def _is_valid(r):
    """Return the result of ``is_aa(r)``, called with its default options."""
    return is_aa(r)
Exemplo n.º 19
0
def notlobset(ID, ht):
    fig = plt.figure()

    ht_ID = ht.filter(ht.swiss == ID)
    ht_ID = ht_ID.transmute(aa_orig = ht_ID.aa_change[0], aa_var = ht_ID.aa_change[-1])
    ht_ID = ht_ID.annotate(i = ht_ID.aa_orig + hl.str(hl.int32(ht_ID.aa_num)))
    gt = ht_ID.to_pandas()
    gt['i'] = gt['i'].astype('str')
    gt = gt.set_index('i', drop = True)
    # make a request
    result = swiss_request(ID, mode = 'json', provider = 'pdb', template = '')
    structures = []
    files = []
    chains = []
    if result:
        n_results = len(result["result"]["structures"])
        print(n_results)
        if n_results > 0:
            for n in range(0,n_results-1):
                template = result["result"]["structures"][n]["template"]
                #requests.get()
                #pdbpath = swiss_request(ID, mode = 'pdb', provider = 'pdb', template = template)
                zip_pdbpath = getpdb(ID, template)
                pdbpath = zip_pdbpath[:-2]
                pdb = open(pdbpath, "wb")
                with gzip.open(zip_pdbpath, "rb") as f:
                    bindata = f.read()
                pdb.write(bindata)
                pdb.close()
                structure = parser.get_structure(template, pdbpath)
                os.remove(pdbpath)
                match_chain = whichchain(ID, template)
                for model in structure:
                    for chain_id in match_chain:
                        ax = fig.add_subplot(111, projection='3d')
                        ax.set_aspect('equal')
                        """ this is the level at which we will run dbscan"""
                        chain = model[chain_id]
                        print(chain_id)
                        pdbstruct = []
                        for residue in chain:
                            if not is_aa(residue):
                                continue
                            atom = residue["CA"]
                            # uppercase protein letter maps
                            resi = Bio.Data.SCOPData.protein_letters_3to1[residue.get_resname()]
                            pdbrow = [resi + str(residue.get_id()[1]), residue.get_id()[1]] + atom.get_coord().tolist()
                            pdbstruct.append(pdbrow)
                        df = pd.DataFrame(data=pdbstruct, columns = ['i','aa','x','y','z'])
                        df['i'] = df['i'].astype('str')
                        df = df.set_index('i', drop = True)
                        mldf = df.join(gt) 
                        mldf = mldf.drop(['aa_num','Gene','swiss','locus.contig','locus.position','aa_orig','aa_var','alleles'], axis = 1)
                        mldf = mldf.sort_values('aa')
                        mldf['AC'] = mldf['AC'].fillna(0)
                        mldf = mldf.groupby(mldf.index).agg({'aa':np.mean,'x':np.mean,'y':np.mean,'z':np.mean,'AC':np.sum}).sort_values('aa').reset_index('i')
                        mldf = mldf.drop(['i'],axis=1)
                        mldf2 = mldf.drop(['AC'],axis=1)
                        mlmat = mldf2.as_matrix()
                        print(mlmat)
                        result = sklearn.cluster.DBSCAN().fit_predict(mlmat,sample_weight = mldf['AC'])
                        print(result)
                        
                        """
Exemplo n.º 20
0
 def is_aa(self):
     """Return the result of the module-level ``is_aa`` applied to this object."""
     # Inside a method body the bare name `is_aa` resolves to the module-level
     # function, not to this method, so this call is not recursive.
     return is_aa(self)
Exemplo n.º 21
0
if args.verbose:
    print("Verbose mode enabled. Parsing PDB {}.".format(pdb_id))
# Torsion angles collected over all processed residue pairs.
phis = []
psis = []

if args.chain:  # plot residues from a single chain
    if args.verbose:
        print("Using residues from chain {}.".format(args.chain))
    for model in structure:
        # using list() to avoid weird residue indices
        lchain = list(model[args.chain])
        # Walk over (residue, next residue) pairs. Pairing with zip stops at
        # the second-to-last residue, so a chain that ends in an amino acid no
        # longer indexes past the end of the list.
        for res, n in zip(lchain, lchain[1:]):
            # stop calculating torsions when AAs run out
            if not is_aa(n):
                break
            # make groups of relevant atoms to calculate angle
            phi_atoms = (res['C'], n['N'], n['CA'], n['C'])
            psi_atoms = (res['N'], res['CA'], res['C'], n['N'])
            # make group of coordinates of relevant atoms
            phi_coords = (a.get_coord() for a in phi_atoms)
            psi_coords = (a.get_coord() for a in psi_atoms)
            # calculate torsions from groups of coordinates and store
            phis.append(torsion(*phi_coords))
            psis.append(torsion(*psi_coords))
else:  # plot residues from all chains
    if args.verbose:
        print("Using all residues.")
Exemplo n.º 22
0
# Plot the two MTR series on their axes, then write the figure to disk.
ax1.plot(real_df['aaNum'],real_df['MTR'])
ax2.plot(mtr_df['Protein_position'], mtr_df['MTR'])
plt.savefig('Orig_MTR_result.png')
plt.close()

# Persist the plotted table without the index column.
real_df.to_csv('Orig_MTR_result.csv', index=False)


# create the pdb structure data

structure = parser.get_structure('h2A', pdbpath)
# Only model 0, chain 'A' is used.
model = structure[0]
chain = model['A']
# One row per amino-acid residue: [residue number, x, y, z] of its CA atom.
pdbstruct = []
for residue in chain:
    if not is_aa(residue):
        continue
    atom = residue["CA"] # using the alpha carbon
    # uppercase protein letter maps
    # NOTE(review): `resi` is not used in the lines visible here; the lookup
    # still raises KeyError for residue names missing from the table.
    resi = Bio.Data.SCOPData.protein_letters_3to1[residue.get_resname()]
    pdbrow = [residue.get_id()[1]] + atom.get_coord().tolist()
    pdbstruct.append(pdbrow)
pdb_df = pd.DataFrame(data=pdbstruct, columns = ['aaNum','x','y','z'])
# Attach the columns of agg_df by residue number; residues without a match
# get NaN, which is then replaced with 0.
all_df = pdb_df.join(agg_df.set_index('aaNum'), on='aaNum')
all_df = all_df.fillna(0)

# calculate MTR
print('calculating MTR score...')

def calc_MTR_sphere_window(row):
    """