示例#1
0
def end_to_end_distance(structure):
    """Append the first-to-last C-alpha distance of a PDB file to ``end_to_end_df``.

    ``structure`` is used both as the path handed to the PDB parser and as
    the source of the metadata columns, which are sliced out of the name
    positionally: characters 0-6 are the entry id, character 8 the ensemble,
    the fifth character from the end the chain id, and the conformer is the
    text after the first "-" in the second "_"-separated field.

    Only residues whose hetero flag is blank (``residue.id[0] == " "``) are
    considered. The distance is measured between the CA atoms of the first
    and the last such residue in iteration order.

    Raises:
        IndexError: if the name does not have the expected layout or the
            structure holds no residue with a blank hetero flag.
        KeyError: if a terminal residue has no ``CA`` atom.
    """
    entry_id = structure[0:7]
    chain_id = structure[-5]
    ensemble = structure[8]
    conformer = structure.split("_")[1].split("-")[1]

    pdb = PDBParser().get_structure(structure, structure)

    # Keep the residue objects themselves. Looking the terminal residues up
    # again by number alone could hit a hetero residue or another chain that
    # shares the number, and never found a "last" residue when the structure
    # held a single residue.
    standard_residues = [
        residue
        for chain in pdb.get_chains()
        for residue in chain.get_residues()
        if residue.id[0] == " "
    ]
    first_ca = standard_residues[0]['CA'].get_coord()
    last_ca = standard_residues[-1]['CA'].get_coord()
    dist = numpy.linalg.norm(first_ca - last_ca)

    # One row per processed file, appended at the end of the shared frame.
    end_to_end_df.loc[len(end_to_end_df)] = [
        entry_id, ensemble, conformer, chain_id, structure, dist
    ]
示例#2
0
def chain_splitter(structure, chains):
    """Write every chain of a PDB file to its own file and delete the input.

    A chain with a non-blank id is saved next to the input as
    ``<name without last 4 chars>_<chain id>.pdb`` and its id is appended to
    ``chains``. For a chain whose id is a single blank, ``chain_correction``
    is called on the input path and "A" is recorded instead.

    Returns the same ``chains`` list that was passed in (mutated in place).
    """
    writer = PDBIO()
    parsed = PDBParser().get_structure(structure, structure)
    stem = structure[:-4]
    for member in parsed.get_chains():
        label = member.get_id()
        if label == " ":
            # Blank chain id: delegate to the external correction routine.
            chain_correction(structure)
            chains.append("A")
            continue
        writer.set_structure(member)
        writer.save(stem + "_" + label + ".pdb")
        chains.append(label)
    # The original multi-chain file is removed once all chains are handled.
    os.remove(structure)
    return chains
示例#3
0
def PDBParser(input, input_format=None):
    """Parse PDB text and return the contacts of its first chain.

    Args:
        input: PDB file content as a string.
        input_format: accepted for signature compatibility; not used here.

    Returns:
        A list starting with the marker ``"PDB"`` followed by the contacts
        sorted by their third field (index 2) in descending order.

    Raises:
        InvalidFormat: if parsing or contact extraction fails, or if no
            contacts are found.
    """
    try:
        parser = BioPDBParser().get_structure('pdb', io.StringIO(input))
        chain = list(parser.get_chains())[0]
        remove_atoms(chain)
        contacts = get_chain_contacts(chain)
    except Exception:
        # Only real errors are translated; KeyboardInterrupt and SystemExit
        # must propagate instead of being reported as a format problem.
        raise InvalidFormat('Unable to parse contacts')

    if not contacts:
        raise InvalidFormat('Unable to parse contacts')

    output = ["PDB"]
    output += sorted(contacts, key=itemgetter(2), reverse=True)
    return output
示例#4
0
def download_and_get_chains():
    """Download every entry from ``read_rostdb_entries`` and save the wanted chains.

    For each (entry, chain id) pair the entry is fetched into the current
    directory, parsed from ``pdb<entry lowercased>.ent``, and the matching
    chain is written to ``<structure id>_<chain id>.pdb``. Pairs whose
    processing raises are collected and printed at the end; a chain id that
    simply does not occur in the file is not reported.
    """
    # PDBList is used below, so it is imported together with the other
    # Bio.PDB names instead of relying on a module-level import.
    from Bio.PDB import PDBIO, PDBList, PDBParser
    failed = []
    pdbs_dict = read_rostdb_entries()
    io = PDBIO()
    pdbl = PDBList()
    for pdb_e, chains in pdbs_dict.items():
        for chain_e in chains:
            try:
                pdbl.retrieve_pdb_file(pdb_e, pdir='./')
                pdb = PDBParser().get_structure(pdb_e, 'pdb'+pdb_e.lower()+'.ent')
                for chain in pdb.get_chains():
                    if chain.get_id() == chain_e:
                        io.set_structure(chain)
                        io.save(pdb.get_id() + '_' + chain.get_id() + '.pdb')
            except Exception:
                # Best-effort batch: record the failure and keep going, but
                # let KeyboardInterrupt/SystemExit stop the run.
                failed.append((pdb_e, chain_e))
    print("failures:", failed)
示例#5
0
class IterationTests(unittest.TestCase):
    """Checks the chain, residue and atom iterators of a parsed structure."""

    def setUp(self):
        # A permissive parser is used for the fixture file.
        parser = PDBParser(PERMISSIVE=True)
        self.struc = parser.get_structure('X', "PDB/a_structure.pdb")

    def test_get_chains(self):
        """Yields chains from different models separately."""
        chain_ids = []
        for chain in self.struc.get_chains():
            chain_ids.append(chain.id)
        self.assertEqual(chain_ids, ['A', 'A', 'B', ' '])

    def test_get_residues(self):
        """Yields all residues from all models."""
        residue_ids = []
        for resi in self.struc.get_residues():
            residue_ids.append(resi.id)
        self.assertEqual(len(residue_ids), 167)

    def test_get_atoms(self):
        """Yields all atoms from the structure, excluding duplicates and ALTLOCs which are not parsed."""
        labels = []
        for atom in self.struc.get_atoms():
            labels.append("%12s" % str((atom.id, atom.altloc)))
        self.assertEqual(len(labels), 756)
示例#6
0
class IterationTests(unittest.TestCase):
    """Checks the chain, residue and atom iterators of a parsed structure."""

    def setUp(self):
        # A permissive parser is used for the fixture file.
        parser = PDBParser(PERMISSIVE=True)
        self.struc = parser.get_structure('X', "PDB/a_structure.pdb")

    def test_get_chains(self):
        """Yields chains from different models separately."""
        chain_ids = list(map(lambda chain: chain.id, self.struc.get_chains()))
        self.assertEqual(chain_ids, ['A', 'A', 'B', ' '])

    def test_get_residues(self):
        """Yields all residues from all models."""
        residue_ids = list(map(lambda resi: resi.id, self.struc.get_residues()))
        self.assertEqual(len(residue_ids), 167)

    def test_get_atoms(self):
        """Yields all atoms from the structure, excluding duplicates and ALTLOCs which are not parsed."""
        labels = list(
            map(lambda atom: "%12s" % str((atom.id, atom.altloc)),
                self.struc.get_atoms()))
        self.assertEqual(len(labels), 756)
示例#7
0
def download_and_get_chains():
    """Download every entry from ``read_rostdb_entries`` and save the wanted chains.

    For each (entry, chain id) pair the entry is fetched into the current
    directory, parsed from ``pdb<entry lowercased>.ent``, and the matching
    chain is written to ``<structure id>_<chain id>.pdb``. Pairs whose
    processing raises are collected and printed at the end; a chain id that
    simply does not occur in the file is not reported.
    """
    # PDBList is used below, so it is imported together with the other
    # Bio.PDB names instead of relying on a module-level import.
    from Bio.PDB import PDBIO, PDBList, PDBParser
    failed = []
    pdbs_dict = read_rostdb_entries()
    io = PDBIO()
    pdbl = PDBList()
    for pdb_e, chains in pdbs_dict.items():
        for chain_e in chains:
            try:
                pdbl.retrieve_pdb_file(pdb_e, pdir='./')
                pdb = PDBParser().get_structure(pdb_e,
                                                'pdb' + pdb_e.lower() + '.ent')
                for chain in pdb.get_chains():
                    if chain.get_id() == chain_e:
                        io.set_structure(chain)
                        io.save(pdb.get_id() + '_' + chain.get_id() + '.pdb')
            except Exception:
                # Best-effort batch: record the failure and keep going, but
                # let KeyboardInterrupt/SystemExit stop the run.
                failed.append((pdb_e, chain_e))
    print("failures:", failed)
示例#8
0
def get_output(domain, virtualcb=False):
    """
    Load the PDB file of a domain and return per-residue coordinates for the
    residue range registered for that domain in ``domains``, together with
    the matching slice of the secondary-structure/torsion table returned by
    ``secondary_torsions``.

    One representative atom is taken per residue: C-beta for every residue
    except Glycine, for which C-alpha is used, or a virtual C-beta computed
    by ``virtual_cbeta`` when ``virtualcb`` is set.

    Input:
        domain   : Full domain name (eg. 16pkA01). Characters 0-3 are used as
                   the PDB file stem and character 4 as the chain id.
        virtualcb: (Boolean) Create virtual C beta atom on Glycine. By default
                   is False, which means that atomic coordinates for Glycine are C-alpha

    Output:
        coords_list : 2D object array of coordinates
            legend: [residue number, residue name, one-letter code, X, Y, Z]

        sectorsions : slice ``[dssp_start:dssp_end]`` of the first value
            returned by ``secondary_torsions(domain)``
            (presumably rows of [Secondary structure, Phi, Psi] -- confirm
            against that helper)

        ``(None, None)`` is returned when the structure cannot be parsed, a
        required atom is missing inside the domain, the domain bounds are not
        found, residues are missing between the bounds, or the sequences
        cannot be aligned.

        NOTE(review): when the sequence from ``secondary_torsions`` is shorter
        than the coordinate list, the function returns two integers
        ``(len(seq), len(coords_list))`` instead of ``(None, None)`` -- confirm
        that callers expect this.
    """

    # First and last residue number of the domain, as registered in `domains`.
    domain_start, domain_end = domains[domain][0], domains[domain][1]
    domain_id = domain[:4]
    chain_id = domain[4]

    # Get PDB structure
    try:
        structure = PDBParser().get_structure(
            '', f'../../data/pdbfiles/{domain_id}.pdb')
    except (IndexError, ValueError):
        return None, None

    # There is a problem with 0 character, because sometimes
    # it means no chain (chain == ' '), but another times
    # it is a valid chain ID
    if chain_id == '0':
        # get all chain_IDs
        chain_IDs = np.array(
            [ch.get_full_id()[2] for ch in structure.get_chains()])
        if '0' in chain_IDs:
            pass
        else:
            # No chain is literally named '0', so treat it as the blank chain.
            chain_id = ' '

    # Only the first model (index 0) is used.
    chain = structure[0][chain_id]

    coords_list = []

    known_aminoacids = np.array(list(protein_letters_3to1.keys()))

    for i, residue in enumerate(chain.get_residues()):
        residue_name = residue.get_resname()

        # Stop at the first residue whose name is not a key of
        # protein_letters_3to1.
        if residue_name not in known_aminoacids:
            break

        residue_oneletter = protein_letters_3to1[residue_name]
        # Residue sequence number, read from the full id of the first atom.
        residue_number = residue.child_list[0].get_full_id()[3][1]

        if residue_oneletter == 'G':  # if Glycin -> C-alpha/Virtual C-beta. Otherwise C-beta
            try:
                if virtualcb:
                    atom = virtual_cbeta(residue)
                else:
                    atom = residue['CA']
            except KeyError:
                # A missing atom is tolerated only before the domain starts,
                # because those rows are sliced away further down.
                if residue_number < domain_start:
                    if virtualcb:
                        atom = [0, 0, 0]
                    else:
                        atom = residue.child_list[
                            0]  # just append any atom, it doesnt matter
                else:
                    print('Missing C-alpha atom')
                    return None, None
        else:
            try:
                atom = residue['CB']
            except KeyError:
                # Same tolerance as above for residues before the domain.
                if residue_number < domain_start:
                    atom = residue.child_list[
                        0]  # just append any atom, it doesnt matter
                else:
                    print('Missing C-beta atom')
                    return None, None

        # With virtualcb, Glycine yields a plain coordinate triple rather
        # than an atom object.
        if residue_oneletter == 'G' and virtualcb:
            x, y, z = atom
        else:
            x, y, z = atom.get_coord()

        coords_list.append(
            [residue_number, residue_name, residue_oneletter, x, y, z])

        if residue_number == domain_end:  # because we need to include also that residue
            break

    # Object dtype keeps the mixed int/str/float columns in one 2D array.
    coords_list = np.array(coords_list, dtype='O')

    # in case the domain_start is not included in the coords indices
    try:
        start = np.where(coords_list[:, 0] == domain_start)[0][0]
        end = np.where(coords_list[:, 0] == domain_end)[0][0]
    except IndexError:
        print('domain_start or domain_end index not found in pdb file')
        return None, None

    # The row span must equal the residue-number span; otherwise residues
    # are missing (or duplicated) between the two bounds.
    if (end - start) == (domain_end - domain_start):
        coords_list = coords_list[start:(end + 1)]

        # Secondary structure and Torsion Angles
        sec_torsions, seq = secondary_torsions(domain)  # , start, end)
        if sec_torsions is None:
            return None, None

        if len(seq) < len(coords_list):
            print('DSSP output smaller than PDB')
            # NOTE(review): returns two lengths rather than (None, None).
            return len(seq), len(coords_list)

        # Locate the PDB one-letter sequence inside the sequence returned by
        # secondary_torsions.
        dssp_start, dssp_end = align(''.join(coords_list[:, 2]), seq)

        if dssp_start is None:
            print('DSSP Sequence != PDB sequence')
            print(
                f'PDB Sequence:\n{"".join(coords_list[:, 2])}\nDSSP sequence:\n{seq}'
            )
            return None, None
        else:
            return coords_list, sec_torsions[dssp_start:dssp_end]

    else:
        print(
            f'Domain {domain} has missing data. PDB indices:{start, end}, CATH indices: {domain_start, domain_end}'
        )
        return None, None
    # NOTE(review): every branch of the if/else directly above returns, so
    # the statements below can never run where they currently sit. They look
    # like the body of a separate download / chain-extraction script whose
    # header is not visible here -- confirm and move them out.
    # `list_result_n`, `download` and `result` are not defined in view.
    p = Pool(20)

    # Feed the download jobs to the pool in batches of 20, then the remainder.
    parts = len(list_result_n) // 20
    for i in tqdm(range(parts)):
        p.map(download, list_result_n[i * 20:i * 20 + 20])
    p.map(download, list_result_n[parts * 20:])

    # Group chain ids by structure; each entry is a "<structure>;<chain>"
    # string.
    result_chains = {}
    for i in result:
        for i1 in result[i]:
            struc = i1.split(";")[0]
            chain = i1.split(";")[1].strip()
            if struc in result_chains:
                result_chains[struc].append(chain)
            else:
                result_chains[struc] = [chain]

    print(result_chains)

    # For every file in pdb_m/, write each chain listed for that structure to
    # pdb_chain/<structure id>_<chain id>.pdb. The lookup key is the second
    # "_"-separated field of the file name.
    for i in tqdm(os.listdir("pdb_m")):
        io = PDBIO()
        pdb = PDBParser().get_structure(i, "pdb_m/%s" % i)

        for chain in pdb.get_chains():
            if chain.get_id() in result_chains[i.split("_")[1]]:
                io.set_structure(chain)
                io.save("pdb_chain/" + pdb.get_id() + "_" + chain.get_id() +
                        ".pdb")
示例#10
0
def parse_pdb_length(name):
    """Return the number of amino-acid residues in the first chain of a PDB file.

    The file is read from the ``2-get_pdb_chain`` directory of the current
    ``organism`` (a name taken from the enclosing scope). Residues are
    counted only when ``PDB.is_aa`` accepts them, so residues absent from
    the file are not part of the count.
    """
    parser = PDBParser()
    pdb_path = "../../../0-identify_structure/2-get_pdb_chain/{0}/{1}.pdb".format(organism, name)
    parsed = parser.get_structure(name, pdb_path)
    # The input files are expected to hold a single chain; take the first.
    first_chain = list(parsed.get_chains())[0]
    return sum(1 for residue in first_chain.get_residues() if PDB.is_aa(residue))
            row for row in rows
            if abs(float(row["x"])) <= 5 and abs(float(row["y"])) <= 5
        ]
        try:
            min_relaxed_energy = float(rows[0]["Relaxed energy (kcal/mol)"])
            min_unrelaxed_energy = float(
                rows[0]["Unrelaxed energy (kcal/mol)"])
        except IndexError:
            min_unrelaxed_energy = np.nan
            min_relaxed_energy = np.nan
    # Count the number of chains, residues and atoms in the antigen.
    # NOTE(review): `experiment`, `results`, `positions`, `selected` and the
    # two energy values are set by code outside this view.
    ag_file = os.path.join(standards.ExperimentsDirectory, experiment,
                           "structures", "antigen_relaxed.pdb")
    structure = PDBParser().get_structure("antigen", ag_file)
    n_chains, n_atoms, n_residues = 0, 0, 0
    # get_chains() walks every chain of the parsed structure; totals are
    # accumulated across all of them.
    for chain in structure.get_chains():
        n_chains += 1
        n_residues += len(list(chain.get_residues()))
        n_atoms += len(list(chain.get_atoms()))
    # Merge the statistics into the most recent entry of `results`.
    results[-1].update({
        "Experiment": experiment,
        "Chains": n_chains,
        "Residues": n_residues,
        "Atoms": n_atoms,
        "Positions": positions,
        "Selected Designs": selected,
        "Min Unrelaxed Energy": min_unrelaxed_energy,
        "Min Relaxed Energy": min_relaxed_energy
    })

# Output results as a CSV file.
示例#12
0
class Blueprint:
    """Residue-level blueprint grouped into secondary-structure segments.

    Attributes set by the constructor (depending on the arguments given):
        structure: a parsed structure, or ``None``.
        bp_data: flat list of blueprint records. Each record is a mutable
            list ``[index, residue, ss, flag]``; the first character of
            ``ss`` is the segment type (``H``, ``E`` or ``L``) and ``flag``
            is ``'.'`` for frozen or ``'R'`` for remodel.
        segments: ordered list of ``Segment`` objects.
        segment_dict: the same segments keyed by id, e.g. ``H3`` is the
            third helix.
        sspairs: pairing tuples from an ``SSPAIR`` header line; only set
            when the blueprint file contains such a line.
    """

    # "<index> <residue> <ss> <flag...>" record of a blueprint body line.
    _RESIDUE_LINE = re.compile(r'^\s*(\d+)\s+(\w+)\s+(\w+)\s+(.+)')
    # Pairing tokens on an SSPAIR header line.
    _SSPAIR_TOKEN = re.compile(r"(\d+)-(\d+).(\w).([-]?\d+)")
    # Segment identifiers such as "H3" inside a topology string.
    _SEGMENT_ID = re.compile(r"[HEL]\d+")
    # Header records that are recognised but not interpreted.
    _IGNORED_PREFIXES = ('FOLDINFO', 'HSSTRIPLET', 'HSSTRIAD', 'HHPAIR', '#')

    def __init__(self,
                 blueprint_file=None,
                 pdbfile=None,
                 structure=None,
                 segments=None,
                 data=None):
        """Build a blueprint from a file, from raw records, or from segments.

        Args:
            blueprint_file: path of a blueprint file to read; ignored as a
                data source when ``data`` is also given.
            pdbfile: path of a PDB file to parse into ``self.structure``.
            structure: an already parsed structure, used when ``pdbfile`` is
                not given.
            segments: ready-made segments to adopt as they are.
            data: blueprint records ``[index, residue, ss, flag]``.
        """
        if pdbfile:
            self.structure = PDBParser().get_structure(pdbfile, pdbfile)
        else:
            self.structure = structure

        if segments:
            self.segments = segments
            self.bp_data = []
            self.segment_dict = {}
            for seg in segments:
                self.bp_data += seg.bp_data
                self.segment_dict[seg.id] = seg

        if blueprint_file and not data:
            data = self._read_blueprint_file(blueprint_file)
        if blueprint_file or data:
            self._build_segments(data)

    def _read_blueprint_file(self, blueprint_file):
        """Return the residue records of ``blueprint_file``; store SSPAIR tokens."""
        data = []
        with open(blueprint_file) as handle:
            for line in handle:
                if not line.strip():
                    # Blank lines carry no record.
                    continue
                if line.startswith(self._IGNORED_PREFIXES):
                    continue
                if line.startswith('SSPAIR'):
                    self.sspairs = self._SSPAIR_TOKEN.findall(line)
                    continue
                fields = self._RESIDUE_LINE.split(line)
                data.append([int(fields[1]), fields[2], fields[3], fields[4]])
        return data

    def _build_segments(self, data):
        """Group consecutive records by secondary-structure type into segments.

        When a structure is available, its residues are handed out to the
        segments in order, one per record.
        """
        self.segments = []
        self.bp_data = []
        self.segment_dict = {}
        segment_count = {'L': 1, 'H': 1, 'E': 1}
        residues = list(
            self.structure.get_residues()) if self.structure else None
        res_index = 0
        for sstype, group in groupby(data, key=lambda rec: rec[2][0]):
            resdata = list(group)
            self.bp_data += resdata
            seg_id = sstype + str(segment_count[sstype])
            segment_count[sstype] += 1
            if residues is not None:
                stop = res_index + len(resdata)
                # Index one by one so that a structure shorter than the
                # blueprint still raises IndexError.
                segment_residues = [residues[k] for k in range(res_index, stop)]
                res_index = stop
                seg = Segment(seg_id, sstype, resdata, segment_residues)
            else:
                seg = Segment(seg_id, sstype, resdata)
            self.segments.append(seg)
            self.segment_dict[seg_id] = seg

    def topology(self):
        """Return the segment ids joined with '-', e.g. ``H1-L1-E1``."""
        return '-'.join(s.id for s in self.segments)

    def topology_lengths(self):
        """Return two length summaries of the topology.

        The first has the form ``H12-L3-`` (type, length, trailing dash per
        segment); the second has the form ``H[12-12]L[3-3]``.
        """
        topol2 = ''
        topol3 = ''
        for ss in self._SEGMENT_ID.findall(self.topology()):
            n = len(self.segment_dict[ss].bp_data)
            topol2 += '%s%s-' % (ss[0], n)
            topol3 += '%s[%s-%s]' % (ss[0], n, n)
        return topol2, topol3

    def ss_tag(self):
        """Return ``"<helix count>H<strand count>E"`` for the segments."""
        helices = sum(1 for s in self.segments if s.sstype == 'H')
        strands = sum(1 for s in self.segments if s.sstype == 'E')
        return "%dH%dE" % (helices, strands)

    def freeze_all(self):
        """Flag every record as frozen ('.')."""
        for res in self.bp_data:
            res[3] = '.'

    def remodel_all(self):
        """Flag every record for remodelling ('R')."""
        for res in self.bp_data:
            res[3] = 'R'

    def remodel_segment(self,
                        index=None,
                        id=None,
                        index_to_zero=False,
                        loop_edge=True):
        """Flag one segment for remodelling.

        Args:
            index: position of the segment in ``self.segments``. Index 0 is
                a valid selection and takes precedence over ``id``.
            id: segment id, used when ``index`` is not given.
            index_to_zero: also set the record index of each flagged residue
                to 0.
            loop_edge: for every interior loop segment, propagate an 'R' on
                its first/last record to the adjacent record of the
                neighbouring segment.
        """
        # `index is not None` rather than truthiness, so the first segment
        # (index 0) can be selected.
        if index is not None:
            targets = self.segments[index].bp_data
        elif id:
            targets = self.segment_dict[id].bp_data
        else:
            targets = []

        for res in targets:
            if index_to_zero:
                res[0] = 0
            res[3] = 'R'

        if loop_edge:
            for i in range(1, len(self.segments) - 1):
                seg = self.segments[i]
                if seg.sstype != 'L':
                    continue
                if seg.bp_data[0][3] == 'R':
                    self.segments[i - 1].bp_data[-1][3] = 'R'
                if seg.bp_data[-1][3] == 'R':
                    self.segments[i + 1].bp_data[0][3] = 'R'

    def residue_segment(self, pos):
        """Return the id of the segment holding record index ``pos``, or ''."""
        for seg_id, seg in self.segment_dict.items():
            if any(res[0] == pos for res in seg.bp_data):
                return seg_id
        return ''

    def segment_lengths(self):
        """Return '<type><length>' per segment joined with '-', e.g. ``H12-L3``."""
        return '-'.join(s.sstype + str(len(s.bp_data)) for s in self.segments)

    def reindex_blueprint(self, start=1, rebuild_index_to_zero=False):
        """Renumber the records consecutively from ``start``.

        With ``rebuild_index_to_zero``, records flagged 'R' get index 0 and
        do not consume a number.
        """
        indexer = start
        for bp_data in self.bp_data:
            if rebuild_index_to_zero and bp_data[3] == 'R':
                bp_data[0] = 0
            else:
                bp_data[0] = indexer
                indexer += 1

    def segment_list(self):
        """Return the segment ids found in the topology string."""
        return self._SEGMENT_ID.findall(self.topology())

    def dump_blueprint(self, filename, header_lines=()):
        '''Write the blueprint to ``filename``.

        header lines are for setting foldinfo, hsstriplet or any other register on the top of the blueprint.
        Surrounding whitespace of each header line is stripped so that a
        line that already ends in a newline is not followed by a blank line.
        '''
        with open(filename, 'w') as out:
            for line in header_lines:
                out.write(line.strip() + '\n')
            for r in self.bp_data:
                out.write("%d    %s    %s    %s\n" % tuple(r))

    def dump_pdb(self, filename):
        """Save ``self.structure`` to ``filename`` in PDB format."""
        io = PDBIO()
        io.set_structure(self.structure)
        io.save(filename)

    def swapp_segments(self, index1, index2):
        '''This function swaps the segments, reindexes the blueprint and PDB file
        and set for remodelling the segments directly conected to the swapped segments.
        The rest of the structure is set frozen.

        The segments must carry structure residues (``segment.residues``).
        '''
        # Freeze everything, then mark the neighbours of both swapped
        # segments for remodelling with index 0 so they are rebuilt.
        self.freeze_all()
        for neighbour in (index1 - 1, index1 + 1, index2 - 1, index2 + 1):
            self.remodel_segment(neighbour, index_to_zero=True)

        # Swap the segments.
        self.segments[index1], self.segments[index2] = (
            self.segments[index2], self.segments[index1])

        # Renumber the blueprint records and the residues.
        indexer = 1
        residues_to_detach = set()
        for segment in self.segments:
            for i, record in enumerate(segment.bp_data):
                residue = segment.residues[i]
                if record[0] == 0:
                    residues_to_detach.add(residue)
                    continue
                record[0] = indexer
                old_id = residue.id
                residue.id = (old_id[0], indexer, old_id[2])
                indexer += 1

        # Detach the residues directly connected to the swap; this is done
        # to avoid clashes during the remodelling.
        for res in residues_to_detach:
            res.get_parent().detach_child(res.id)

        # Sort the residues in the structure according to the new indexing.
        for chain in self.structure.get_chains():
            chain.child_list = sorted(chain.child_list, key=lambda r: r.id[1])

        # Now that the elements have been reindexed, self.bp_data and
        # self.residues must be rebuilt in the new segment order.
        self.bp_data = [rec for s in self.segments for rec in s.bp_data]
        self.residues = [res for s in self.segments for res in s.residues]
class Family(object):
    '''A class that compiles information about a protein structure and
    related sequences. This information is meant to be sufficient to filter
    the residues and calculate an ez-beta moment from those that remain.

    Attributes:
    stru_name: Name of the structure
    stru_path: Path of the PDB format structure file
    stru: the structure, as a Biopython entity
    msa: a dictionary mapping sequence identifiers to rows in a multiple
        sequence alignment
    template_seq: the row of the MSA containing the sequence of the
        structure
    res_to_pos: a dictionary mapping residues from structures to their
        column number in the MSA (assuming the first column is numbered 0)
    dssp: a Biopython DSSP object with a DSSP for the structure
    calc: an Ez-beta calculator'''

    def __init__(self, stru_name, stru_path, msa_path, template_name,
                 param_path):
        '''Requires a name for the structure (your choice), a path to a
        PDB format structure file, a path to a multiple sequence alignment
        containing a row with exactly the same sequence as the structure,
        the sequence identifier of this row, and a path to a CSV file
        of Ez-beta parameters (see zenergy.Calculator for how to make
        these files)'''
        self.stru_name = stru_name
        self.stru_path = stru_path

        # Parser warnings are silenced for this one call only.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            self.stru = PDBParser().get_structure(stru_name, stru_path)

        # When Daniel created the aligned structures, he removed heteroatoms
        # (though the procedure he used seems to have removed anything
        # without the residue identifier of one of the 20 standard amino
        # acids, leading to main chain selenomethionines being removed from
        # the 1FEP structure). However, he also added a sort of box of
        # water atoms (perhaps as a visual aid, so you can tell how the
        # coordinate system is defined?)
        # Therefore, remove all waters
        waters = [i for i in self.stru.get_residues() \
                  if i.get_resname() == 'HOH']
        for chain in self.stru.get_chains():
            for water in waters:
                try:
                    chain.detach_child(water.get_id())
                # Maybe it's not in this chain
                except KeyError:
                    pass

        # The alignment is parsed completely by read(), so the handle can be
        # closed as soon as the call returns.
        with open(msa_path) as msa_handle:
            msa = Bio.AlignIO.read(msa_handle, 'clustal')
        self.msa = dict((seq.id, seq) for seq in msa)
        self.template_seq = self.msa[template_name]

        self.res_to_pos = map_res_to_pos(self.stru.get_residues(),
                                         self.template_seq)

        # DSSP is computed for the model stored under key 0.
        self.dssp = DSSP_win.DSSP(self.stru.child_dict[0], stru_path)

        # NOTE(review): the parameter file is deliberately left open, because
        # the csv reader is handed to zenergy.Calculator and it is not visible
        # here whether the rows are consumed before the constructor returns.
        # The 'rb' mode matches Python 2's csv module; under Python 3 the
        # file would have to be opened in text mode -- confirm the target
        # interpreter.
        params = csv.reader(open(param_path, 'rb'))
        self.calc = zenergy.Calculator(params)
    '-mute basic -mute core -ignore_zero_occupancy false -rebuild_disulf false -detect_disulf false'
)

# Pass every structure file through pose_from_pdb and write the result next
# to the input as <name>.rosetta.pdb, then rewrite that file so it holds
# residues 1..N+1 of its single chain.
for pdb_file in glob.glob('../Data/structures/*.pdb'):
    print(pdb_file)
    # Skip files that are themselves the output of an earlier run.
    if '.rosetta' in pdb_file:
        continue
    pdb_file_clean = pdb_file.replace('.pdb', '.rosetta.pdb')

    # Round-trip through a pose; presumably this normalises the file for
    # Rosetta -- confirm against the init flags set above this loop.
    initial_pose = pose_from_pdb(pdb_file)
    initial_pose.dump_pdb(pdb_file_clean)

    # NOTE(review): `io` is only referenced by the commented-out lines below.
    io = PDBIO()
    # The structure id is the file name up to its first dot.
    pdb = PDBParser().get_structure(
        pdb_file.split('/')[-1].split('.')[0], pdb_file_clean)
    chains = list(pdb.get_chains())
    # The script only supports single-chain inputs.
    assert len(chains) == 1
    residues = list(chains[0].get_residues())
    # Overwrites the cleaned file in place with the extracted range.
    Dice.extract(pdb, chains[0].get_id(), 1, len(residues) + 1, pdb_file_clean)
    #io.set_structure(chains[0])
    #io.save(pdb_file_clean)
    #io = PDBIO()

    ##Set up ScoreFunction
    #sf = get_fa_scorefxn()

    ##Set up MoveMap.
    #mm = MoveMap()
    #mm.set_bb(True)
    #mm.set_chi(True)