def split_chains(pdbid):
    """Split an mmCIF file into one gzip-compressed mmCIF file per chain.

    Every chain of every model is written to ``<structure id><chain id>.cif``
    and then compressed in place with ``gzip -f``. The structure id is the
    input path with its extension stripped, so the output lands next to the
    input file.

    :param pdbid: path to the input file; a path ending in ``.gz`` is read
                  through gzip and is expected to end in ``.cif.gz``,
                  anything else is expected to end in ``.cif``
    :return: None. If the input file does not exist an error is printed and
             nothing is written.
    """
    if not os.path.isfile(pdbid):
        print("Error: Function split_chains: file does not exist:" + pdbid +
              "\n")
        return

    # Only the real file suffix decides whether the input is compressed; a
    # ".gz" somewhere else in the path (e.g. a directory name) must not.
    if pdbid.endswith('.gz'):
        opener, suffix_length = gzip.open, 7  # strips ".cif.gz"
    else:
        opener, suffix_length = open, 4  # strips ".cif"

    parser = PDB.MMCIFParser(QUIET=True)
    # The handle is only needed while parsing, so close it right afterwards.
    with opener(pdbid, "rt") as handle:
        structure = parser.get_structure(pdbid[:-suffix_length], handle)

    for model in structure:
        for chain in model:
            writer = PDB.MMCIFIO()
            writer.set_structure(chain)
            chain_filename = structure.id + chain.id + ".cif"
            writer.save(chain_filename)
            # Argument list instead of a shell string: file names containing
            # spaces or shell metacharacters are passed through verbatim.
            subprocess.call(['gzip', '-f', chain_filename])
示例#2
0
def align_structs(id1, chain1, id2, chain2):
    """
    Superimpose the second structure onto the first and report the RMSD.

    Both entries are fetched with PDBList.retrieve_pdb_file and parsed as
    mmCIF. The atom lists to align come from create_atoms_list; when their
    lengths differ, the lists returned by bonus_9_2 are used instead. The
    resulting transformation is applied to the atoms of the first model of
    the second structure, the RMSD is printed, and both structures are
    handed to saving_file.

    :param id1: the first file id
    :param chain1: the first protein's chain
    :param id2: the second file id
    :param chain2: the second protein's chain
    """
    # fetch and parse both entries
    downloader = pdb.PDBList()
    path1 = downloader.retrieve_pdb_file(id1)
    path2 = downloader.retrieve_pdb_file(id2)
    cif_parser = pdb.MMCIFParser()
    struct1 = cif_parser.get_structure("p1", path1)
    struct2 = cif_parser.get_structure("p2", path2)

    # atom lists that drive the superposition
    fixed_atoms = create_atoms_list(struct1, chain1)
    moving_atoms = create_atoms_list(struct2, chain2)
    if len(fixed_atoms) != len(moving_atoms):
        fixed_atoms, moving_atoms = bonus_9_2(chain1, chain2, struct1,
                                              struct2)

    # compute the transformation and move the second structure
    aligner = pdb.Superimposer()
    aligner.set_atoms(fixed_atoms, moving_atoms)
    aligner.apply(struct2[0].get_atoms())
    print(aligner.rms)

    # write out both structures
    for file_id, structure in ((id1, struct1), (id2, struct2)):
        saving_file(file_id, structure)
示例#3
0
    def test_from_mmcif(self):
        """Loading 1Y26 from mmCIF and from PDB must give the same defines and coords."""
        import Bio.PDB as bpdb

        from_cif = ftmc.from_pdb('test/forgi/threedee/data/1Y26.cif',
                                 parser=bpdb.MMCIFParser())
        from_pdb = ftmc.from_pdb('test/forgi/threedee/data/1y26.pdb')

        self.assertEqual(from_cif.defines, from_pdb.defines)
        self.assertGreater(len(from_cif.defines), 3)
        for element in from_cif.defines:
            nptest.assert_almost_equal(from_cif.coords[element],
                                       from_pdb.coords[element])
def _read_structure(path, pdb_id='pdb', cif_id='cif' ):
    file_name = os.path.basename(path).split('.')[0]
    file_sufix = os.path.basename(path).split('.')[1]
    dir_path = os.path.dirname(path)
    if file_sufix == 'pdb':
        parser = struct.PDBParser(QUIET=True)
        structure = parser.get_structure(pdb_id, path)
    elif file_sufix == 'cif':
        parser = struct.MMCIFParser()
        structure = parser.get_structure(cif_id, path)
    else:
        print("ERROR: Unreognized file type " + file_sufix + " in " + file_name)
        sys.exit(1)
    return structure, dir_path, file_name
示例#5
0
文件: pdb.py 项目: alisterburt/ABTT
def read(pdb_file):
    """
    Parse a structure file into a structure object.

    Files whose name ends in '.cif' (case-insensitive) are read with the
    mmCIF parser, everything else with the PDB parser. The file name is also
    used as the structure id.

    :param pdb_file: path to a PDB or mmCIF format file
    :return: structure
    """
    logging.info(f'reading pdb file: {pdb_file}')
    if pdb_file.lower().endswith('.cif'):
        logging.info(f'switched to cif modus for file: {pdb_file}')
        parser = PDB.MMCIFParser()
    else:
        parser = PDB.PDBParser()

    return parser.get_structure(pdb_file, pdb_file)
示例#6
0
    def mmcif_to_graph():
        """Convert the uploaded 'pdb_file' (mmCIF format) to JSON.

        The upload is read from ``request.files['pdb_file']`` and passed to
        ``forna.pdb_to_json`` together with its sanitized file name and an
        MMCIFParser.

        :return: tuple (JSON string of the result, 201). Any exception raised
                 during conversion is logged and answered with
                 ``abort(400, ...)``.
        """
        # secure_filename lives in werkzeug.utils; the top-level
        # `from werkzeug import secure_filename` shortcut no longer exists in
        # Werkzeug >= 1.0, while this import works on old and new versions.
        from werkzeug.utils import secure_filename

        upload = request.files['pdb_file']
        name = secure_filename(upload.filename)

        try:
            result = forna.pdb_to_json(upload.read(),
                                       name,
                                       parser=bpdb.MMCIFParser())
        except Exception as ex:
            app.logger.exception(ex)
            abort(400, "PDB file parsing error: {}".format(str(ex)))

        return json.dumps(result), 201
示例#7
0
def get_all_chains(in_filename, parser=None):
    '''
    Parse the structure file and return the RNA chains of its first model.

    When no parser is supplied, one is chosen from the file name: ".pdb"
    selects the PDB parser and ".cif" the mmCIF parser. For any other name
    the first (at most 20) characters of the file decide: a start of
    "HEADER" or "ATOM" selects the PDB parser, anything else the mmCIF
    parser.

    :param in_filename: The location of the original file.
    :param parser: Optional parser object to use instead of auto-detection.
    :return: A list of the chains of the first model for which
             contains_rna(chain) is true.
    '''
    if parser is None:
        if in_filename.endswith(".pdb"):
            parser = bpdb.PDBParser()
        elif in_filename.endswith(".cif"):
            parser = bpdb.MMCIFParser()
        else:
            # The extension does not tell the file type, so peek at the start
            # of the file. According to page 10 of
            # ftp://ftp.wwpdb.org/pub/pdb/doc/format_descriptions/Format_v33_A4.pdf
            # a HEADER entry is mandatory, but Biopython output sometimes
            # starts directly with ATOM.
            with open(in_filename) as infile:
                first_chars = infile.readline(20)
            if first_chars.startswith(("HEADER", "ATOM")):
                parser = bpdb.PDBParser()
            else:
                parser = bpdb.MMCIFParser()

    # Parser warnings are silenced while reading the file.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        structure = parser.get_structure('temp', in_filename)

    if len(structure) > 1:
        warnings.warn("Multiple models in file. Using only the first model")
    return [chain for chain in structure[0] if contains_rna(chain)]
示例#8
0
 def __init__(self,path, pdb_id='pdb', cif_id='cif'):
     """Load a structure file and initialise the bookkeeping attributes.

     The base name of *path* is split on '.'; the first component becomes
     ``file_name`` and the second one ``file_sufix`` ('pdb' or 'cif').

     :param path: path to the structure file
     :param pdb_id: structure id passed to the parser for 'pdb' files
     :param cif_id: structure id passed to the parser for 'cif' files

     Prints an error and exits the process with status 1 when the suffix is
     neither 'pdb' nor 'cif' (including when the name has no suffix at all).
     """
     # copied from input_output.py _read_structure
     name_parts = os.path.basename(path).split('.')
     self.file_name = name_parts[0]
     # A base name without any dot has no suffix; treat it as unrecognized
     # instead of failing with an IndexError.
     self.file_sufix = name_parts[1] if len(name_parts) > 1 else ''
     self.dir_path = os.path.dirname(path)
     self.params = CP.read_charmm_FF()
     self.chains = []
     self.models = {}
     if self.file_sufix == 'pdb':
         self.header = struct.parse_pdb_header(path)
         self.structure = struct.PDBParser(QUIET=True).get_structure(pdb_id, path)
         self.has_sequence = False
     elif self.file_sufix == 'cif':
         # NOTE(review): MMCIF2Dict is called without the file path here;
         # if `struct` is Bio.PDB this presumably needs `path` - confirm.
         self.header = struct.MMCIF2Dict()
         self.structure = struct.MMCIFParser().get_structure(cif_id, path)
         self.has_sequence = True
     else:
         print("ERROR: Unreognized file type " + self.file_sufix + " in " + self.file_name)
         sys.exit(1)
def compute_distance(input_dir, filename, res1, atm1, res2, atm2):
    """Return the distance between two atoms of a gzipped mmCIF structure.

    The structure is read from ``<input_dir>/<filename>.cif.gz``. A residue
    is considered when its sequence number equals *res1* (or *res2*) and it
    is either a standard residue (blank hetero flag) or its hetero flag,
    minus the first two characters, is listed in
    ``List_modified_aminoacid.txt`` (one name per line, read from the current
    working directory). All models and chains are scanned; the last matching
    residue wins.

    :param input_dir: directory containing the structure file
    :param filename: file name without the ``.cif.gz`` extension
    :param res1: sequence number of the first residue (int or numeric string)
    :param atm1: atom name in the first residue
    :param res2: sequence number of the second residue (int or numeric string)
    :param atm2: atom name in the second residue
    :return: the distance rounded to two decimals when exactly two matching
             atoms were found (one for each residue), otherwise 999
    """
    parser = PDB.MMCIFParser(QUIET=True)
    with gzip.open((input_dir + '/' + filename + '.cif.gz'), 'rt') as handle:
        structure = parser.get_structure("PDB", handle)

    # Read the list once instead of re-reading the file for every residue.
    # Lines keep their trailing newline, so lookups append '\n' as well.
    with open('List_modified_aminoacid.txt', 'r') as modified_file:
        modified_lines = set(modified_file.readlines())

    first_number = int(res1)
    second_number = int(res2)
    res1_object = None
    res2_object = None
    atom_present = 0
    for model in structure:
        for chain in model:
            for residue in chain:
                number = int(residue.id[1])
                if number != first_number and number != second_number:
                    continue
                hetero_flag = residue.id[0]
                if hetero_flag != ' ' and (hetero_flag[2:] +
                                           '\n') not in modified_lines:
                    continue
                if number == first_number and residue.has_id(atm1):
                    res1_object = residue
                    atom_present = atom_present + 1
                if number == second_number and residue.has_id(atm2):
                    res2_object = residue
                    atom_present = atom_present + 1

    # Two hits for the same residue number (e.g. in two chains) also give a
    # count of 2; without both residues there is no distance to compute.
    if atom_present == 2 and res1_object is not None and res2_object is not None:
        # round works only on type float, so first convert the number to float
        return round(float(res1_object[atm1] - res2_object[atm2]), 2)
    return 999
示例#10
0
def get_all_chains(in_filename,
                   parser=None,
                   no_annotation=False,
                   assembly_nr=None):
    '''
    Load the PDB/mmCIF file located at in_filename, read all RNA chains of
    the first model and return them together with annotation data.

    Before the chains are collected, water residues ("HOH") are detached
    from every chain of the first model and residue names written by Rosetta
    and iFoldRNA are renamed to the standard one-letter nucleotide names.

    :param in_filename: The location of the original file.
    :param parser: Optional parser object. If None, the parser is chosen from
                   the file extension (".pdb"/".cif") or, failing that, from
                   the first characters of the file.
    :param no_annotation: If True, the protein-interaction search is skipped
                   (an empty set is returned instead) and no missing residues
                   are collected from the mmCIF dictionary.
    :param assembly_nr: Only checked for being not None, in which case a
                   warning is emitted when a PDB-format header is available.
                   The mmCIF assembly code path is currently disabled.
    :return: a tuple chains, missing_residues, interacting_residues

             * chains: A list of Bio.PDB chain structures corresponding to all
                       RNA structures stored in in_filename
             * missing_residues: A list of dictionaries, describing the missing residues.
             * interacting residues: The result of enumerate_interactions_kdtree
                       for the first model, or an empty set.
    '''
    if parser is None:
        if in_filename.endswith(".pdb"):
            parser = bpdb.PDBParser()
        elif in_filename.endswith(".cif"):
            parser = bpdb.MMCIFParser()
        else:  # Cannot determine filetype by extension. Try to read first line.
            with open(in_filename) as pdbfile:
                line = pdbfile.readline(20)
                # According to
                # page 10 of ftp://ftp.wwpdb.org/pub/pdb/doc/format_descriptions/Format_v33_A4.pdf
                # a HEADER entry is mandatory. Biopython sometime starts directly with ATOM
                if line.startswith("HEADER") or line.startswith("ATOM"):
                    parser = bpdb.PDBParser()
                else:
                    parser = bpdb.MMCIFParser()

    # Warnings raised while parsing are suppressed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        s = parser.get_structure('temp', in_filename)
    if len(s) > 1:
        warnings.warn("Multiple models in file. Using only the first model")

    # Let's detach all H2O, to speed up processing.
    for chain in s[0]:
        log.debug("Before detaching water from %s: chain has %s residues",
                  chain.id, len(chain))
        for r in chain.child_list[:]:  # We need a copy here, because we are modifying it during iteration
            if r.resname.strip() == "HOH":
                chain.detach_child(r.id)
        log.debug("After detaching water from %s: chain has %s residues",
                  chain.id, len(s[0][chain.id]))

    # Rename residues from other programs
    for chain in s[0]:
        for r in chain:
            # rename rosetta-generated structures
            if r.resname == ' rA':
                r.resname = '  A'
            elif r.resname == ' rC':
                r.resname = '  C'
            elif r.resname == ' rG':
                r.resname = '  G'
            elif r.resname == ' rU':
                r.resname = '  U'

            # rename iFoldRNA-generated structures
            if r.resname == 'ADE':
                r.resname = '  A'
            elif r.resname == 'CYT':
                r.resname = '  C'
            elif r.resname == 'GUA':
                r.resname = '  G'
            elif r.resname == 'URI':
                r.resname = '  U'
    # Now search for protein interactions.
    if not no_annotation:
        interacting_residues = enumerate_interactions_kdtree(s[0])
    else:
        interacting_residues = set()

    # The chains containing RNA
    chains = list(chain for chain in s[0] if contains_rna(chain))

    # Collect the missing residues. The three outcomes of the try block are:
    #   * parser has a header with "missing_residues"  -> else branch below
    #   * parser has no `header` attribute (AttributeError) -> mmCIF handling
    #   * header lacks "missing_residues" (KeyError) -> scan REMARK 465 lines
    try:
        log.debug("PDB header %s", parser.header)
        mr = parser.header["missing_residues"]
        if assembly_nr is not None:
            warnings.warn(
                "Getting an assembly is not supported for the old PDB format.")
    except AttributeError:  # A mmCIF parser
        cifdict = parser._mmcif_dict  # We read a private attribute here, because parsing the mmcif dictionary a second time would cause a performance penalty.
        # Generate an assembly
        try:
            operation_mat, operation_vec = _extract_symmetrymatrices_from_cif_dict(
                cifdict)
        except KeyError:
            pass
        else:
            if False:  # Still experimental and not working correctly.
                chains = _get_assemblies(chains, cifdict)
        mr = []
        try:
            # Rows of _pdbx_poly_seq_scheme whose pdb_mon_id is "?" are
            # selected by the mask and reported as missing residues below.
            mask = np.array(cifdict["_pdbx_poly_seq_scheme.pdb_mon_id"],
                            dtype=str) == "?"
            int_seq_ids = np.array(
                cifdict["_pdbx_poly_seq_scheme.pdb_seq_num"], dtype=int)[mask]
            cs = np.array(cifdict["_pdbx_poly_seq_scheme.pdb_strand_id"],
                          dtype=str)[mask]
            insertions = np.array(
                cifdict["_pdbx_poly_seq_scheme.pdb_ins_code"], dtype=str)[mask]
            # "." insertion codes are replaced by a blank.
            insertions[insertions == "."] = " "
            symbol = np.array(cifdict["_pdbx_poly_seq_scheme.mon_id"],
                              dtype=str)[mask]
        except KeyError:
            # One of the _pdbx_poly_seq_scheme columns is absent: mr stays empty.
            pass
        else:
            if not no_annotation:
                for i, sseq in enumerate(int_seq_ids):
                    mr.append({
                        "model": None,
                        "res_name": symbol[i],
                        "chain": cs[i],
                        "ssseq": sseq,
                        "insertion": insertions[i]
                    })
    except KeyError:
        # The header has no "missing_residues" entry: parse the REMARK 465
        # lines of the file directly.
        mr = []
        with open(in_filename) as f:
            for wholeline in f:
                if wholeline.startswith("REMARK 465"):
                    line = wholeline[10:].strip()
                    mr_info = _parse_remark_465(line)
                    if mr_info is not None:
                        mr.append(mr_info)
                else:
                    continue
    else:
        if mr:
            log.info("This PDB has missing residues")
        elif not no_annotation:
            log.info("This PDB has no missing residues")
    # The string literal below is a disabled, older pairwise interaction
    # search; it is an expression statement without any effect.
    '''for res1, res2 in itertools.combinations(s[0].get_residues(), 2):
        rna_res=None
        other_res=None
        if res1.resname.strip() in RNA_RESIDUES:
            rna_res=res1
        else:
            other_res=res1
        if res2.resname.strip() in RNA_RESIDUES:
            rna_res=res2
        else:
            other_res=res2
        if rna_res is None or other_res is None:
            continue
        if other_res.resname.strip()=="HOH":
            continue
        if residues_interact(rna_res, other_res):
            log.error("%s and %s interact", rna_res, other_res)
            interacting_residues.add(rna_res)'''
    log.debug("LOADING DONE: chains %s, mr %s, ir: %s", chains, mr,
              interacting_residues)
    return chains, mr, interacting_residues
示例#11
0
def get_pdb_STR(pdbPath):
    """Parse the mmCIF file at pdbPath quietly and return the structure (id "pdb")."""
    cif_parser = PDB.MMCIFParser(QUIET=True)
    return cif_parser.get_structure("pdb", pdbPath)
示例#12
0
    def _exp_file_to_data(self, file_path, params):
        """
            _exp_file_to_data:
                Do the PDB conversion--parse the experiment mmCIF file for creating a pdb data object

            :param file_path: path of the mmCIF file to parse
            :param params: parameter dict; it is passed through
                           self._match_features and its 'pdb_info' entry
                           decides whether structure data is produced
            :return: tuple (mmcif_data, pp_no, params) where mmcif_data is
                     the structure data dict, {} when 'pdb_info' has no
                     'sequence_identities', or None when an exception
                     occurred; pp_no is the number of peptides built from
                     the structure
        """
        logging.info(
            f'Parsing pdb file {file_path} to a pdb structure with params: {params}'
        )

        parser = PDB.MMCIFParser()
        pp_no = 0
        mmcif_data = None

        try:
            structure = parser.get_structure("PHA-L", file_path)
        except (RuntimeError, TypeError, KeyError, ValueError) as e:
            # Python 3 exceptions have no `.message` attribute; formatting the
            # exception itself keeps this handler from failing while logging.
            logging.info(f'MMCIFParser errored with message: {e}')
            raise
        else:
            pp_no = sum(1 for _ in PPBuilder().build_peptides(structure))

            header = structure.header
            struc_name = header.get('name', '')
            hd = self._upload_to_shock(file_path)

            (cpd, src) = self._get_compound_source(structure)
            (num_models,
             model_ids) = self._get_models_from_structure(structure)
            (num_chains,
             chain_ids) = self._get_chains_from_structure(structure)
            (num_residues,
             residue_ids) = self._get_residues_from_structure(structure)
            (num_atoms, atom_ids) = self._get_atoms_from_structure(structure)
            protein_data = self._get_proteins_by_structure(
                structure, model_ids[0], file_path)
            (protein_data, params) = self._match_features(params, protein_data)

            pdb_info = params.get('pdb_info', None)
            if pdb_info and pdb_info.get('sequence_identities', None):
                mmcif_data = {
                    'name': struc_name,
                    'head': header.get('head', ''),
                    'rcsb_id': header.get('rcsb_id', ''),
                    'deposition_date': header.get('deposition_date', ''),
                    'release_date': header.get('release_date', ''),
                    'structure_method': header.get('structure_method', ''),
                    'resolution': header.get('resolution', 0.0),
                    'structure_reference': header.get('structure_reference',
                                                      []),
                    'keywords': header.get('keywords', ''),
                    'author': header.get('author', ''),
                    'compound': cpd,
                    'source': src,
                    'num_models': num_models,
                    'num_chains': num_chains,
                    'num_residues': num_residues,
                    'num_atoms': num_atoms,
                    'num_het_atoms': header.get('num_het_atoms', 0),
                    'num_water_atoms': header.get('num_water_atoms', 0),
                    'num_disordered_atoms': header.get(
                        'num_disordered_atoms', 0),
                    'num_disordered_residues': header.get(
                        'num_disordered_residues', 0),
                    # the same uploaded file handle is stored under all
                    # three handle keys
                    'pdb_handle': hd,
                    'mmcif_handle': hd,
                    'xml_handle': hd,
                    'proteins': protein_data,
                }
            else:
                mmcif_data = {}
                logging.info(
                    f'Parsing pdb file {file_path} failed to match KBase genome/features!'
                )
        finally:
            # NOTE(review): returning from `finally` discards any in-flight
            # exception, including the re-raise above, so callers receive
            # mmcif_data=None instead of an exception. Kept unchanged for
            # caller compatibility - confirm this is intended.
            return mmcif_data, pp_no, params
示例#13
0
    def test_from_mmcif(self):
        """Load the 1Y26 mmCIF test file through ftmc.from_pdb with an MMCIFParser."""
        import Bio.PDB as bpdb

        cif_parser = bpdb.MMCIFParser()
        ftmc.from_pdb('test/forgi/threedee/data/1Y26.cif', parser=cif_parser)
示例#14
0
def _anchor_atom(residue):
    """Return the residue's 'CA' atom, else its 'P' atom, else None."""
    atom_ids = [atom.id for atom in residue.get_atoms()]
    if 'CA' in atom_ids:
        return residue['CA']
    if 'P' in atom_ids:
        return residue['P']
    return None


def _chains_in_contact(chain1, chain2, cutoff=7):
    """Tell whether any pair of anchor atoms of the two chains is closer than cutoff."""
    for residue1 in chain1:
        atom1 = _anchor_atom(residue1)
        if atom1 is None:
            # Without an anchor atom this residue cannot contribute a distance.
            continue
        for residue2 in chain2:
            if residue1 != residue2:
                atom2 = _anchor_atom(residue2)
                if atom2 is not None and atom1 - atom2 < cutoff:
                    return True
    return False


def generate_pairwise_subunits_from_pdb(pdb_file_path, templates_path,
                                        file_type, verbose):
    """Take an existing complex and fragment it into each of the pairwise interactions between subunits.

    Two chains are considered interacting when any pair of their anchor
    atoms ('CA', or 'P' when there is no 'CA') is closer than 7. Each
    interacting pair that is not already among the saved structures (as
    judged by structure_in_created_structures) is written to
    ``<templates_path><n>.pdb`` with the chain ids taken from chain_alphabet.

    Keyword arguments:
    pdb_file_path -- path where the complex PDB is
    templates_path -- folder where the resulting files will be saved; it is
                      used as a plain string prefix, and everything matching
                      ``templates_path + '*'`` is deleted first
    file_type -- 'PDB' to read with the PDB parser, anything else for mmCIF
    verbose -- if True, print a line for every file written

    Considerations:
    Does not consider nucleic acid sequences, it is only for testing the program on different complexes"""
    import glob
    import shutil

    num_file = 0

    if file_type == 'PDB':
        parser = pdb.PDBParser(PERMISSIVE=1)
    else:
        parser = pdb.MMCIFParser()

    structure = parser.get_structure('pdb_name', pdb_file_path)

    # give unique chain identifiers to a structure, it has to be similar to the ids of the chains used in build_complex, to be able to use further the structure_in_created_structures() function
    for id_nch, chain in enumerate(structure.get_chains()):
        chain.id = (complete_chain_alphabet[id_nch] + '_', chain.id)

    # free the ./templates_path/ without going through a shell, so that
    # spaces or shell metacharacters in the path cannot alter the command.
    # Like `rm -rf`, the cleanup is best effort and failures are ignored.
    for leftover in glob.glob(glob.escape(templates_path) + '*'):
        if os.path.isdir(leftover) and not os.path.islink(leftover):
            shutil.rmtree(leftover, ignore_errors=True)
        else:
            try:
                os.remove(leftover)
            except OSError:
                # best effort, matching the ignored exit status of `rm -rf`
                pass

    # initialize the saved pairs and structures
    saved_pairs = set()
    saved_structures = []

    # loop through all possible pairwise files
    for chain1 in structure.get_chains():
        for chain2 in structure.get_chains():

            # the following tuples define the pairs already saved
            comb = tuple(list(chain1.id) + list(chain2.id))
            comb_rev = tuple(list(chain2.id) + list(chain1.id))

            if chain1 is chain2 or comb in saved_pairs:
                continue

            # save the combination in both orders, so the mirrored pair is
            # skipped later on
            saved_pairs.add(comb)
            saved_pairs.add(comb_rev)

            # only interacting chains are written out
            if not _chains_in_contact(chain1, chain2):
                continue

            # create a structure object
            ID = str(num_file)
            num_file += 1
            new_structure = pdb_struct.Structure(ID)

            new_model = pdb_model.Model(0)
            new_model.add(chain1.copy())
            new_model.add(chain2.copy())

            new_structure.add(new_model)

            # move the coordinates of the structure to simulate what would happen if they were coming from different files
            rotation = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
            translation = np.array((0, 0, 1), 'f')
            for atom in new_structure.get_atoms():
                atom.transform(rotation, translation)

            # write to new pdb:
            if structure_in_created_structures(new_structure,
                                               saved_structures) is False:

                # record as a saved structure:
                saved_structures.append(new_structure.copy())

                # give unique chains to a structure (A and B)
                for id_nch, chain in enumerate(new_structure.get_chains()):
                    chain.id = chain_alphabet[id_nch]

                if verbose:
                    print(
                        'writing PDB file with the interaction of %s and %s into %s.pdb'
                        % (chain1.id[1], chain2.id[1], ID))

                writer = pdb.PDBIO()
                writer.set_structure(new_structure)
                writer.save(templates_path + ID + '.pdb')
示例#15
0
def get_distance_matrix(mmcif_file, chain_id):
    """
    Collect per-residue data for one chain of an mmCIF structure and compute
    the pairwise euclidean distances among its residues.

    Only the model with id 0 is processed and residues with a non-blank
    hetero field are ignored. Exactly one chain with the given id must be
    found, otherwise the final assertion fails.

    Returns a dictionary with the keys "residue", "coordinates", "resseq",
    "icode", "distance_matrix", "sequence", "pdb_id" and "chain_id".
    """
    structure = PDB.MMCIFParser().get_structure("_", mmcif_file)

    residue_names = []  # the residue identity (which aminoacid it is)
    coordinates = []  # xyz of the beta carbon (alpha carbon for GLY)
    resseqs = []  # numerical part of the residue number (1 for residue 1A)
    icodes = []  # letteral part of the residue number (A for residue 1A)

    matching_chains = 0
    for model in structure:
        if model.id != 0:
            print("Skipping model:", model.id)
            continue
        print("Processing model:", model.id)

        for chain in model:
            if chain.id != chain_id:
                continue
            matching_chains += 1

            for residue in chain.get_residues():
                # discard HETATM records
                if residue.id[0] != " ":
                    continue
                residue_names.append(residue.resname)
                # GLY does not have a beta carbon
                anchor_name = "CA" if residue.resname == "GLY" else "CB"
                coordinates.append(residue[anchor_name].get_coord())
                resseqs.append(residue.id[1])
                icodes.append(residue.id[2])
    assert matching_chains == 1

    coordinate_array = np.array(coordinates, dtype=float)
    icode_array = np.array(icodes, dtype=str)
    out = {
        "residue": residue_names,
        "coordinates": coordinate_array,
        "resseq": np.array(resseqs, dtype=int),
        # NaN is not defined in a str array, so a blank insertion code is
        # spelled "nan", coherent with the way pandas represents it
        "icode": np.where(icode_array == " ", "nan", icode_array),
    }
    # the Minkowski 2-norm is the euclidean distance
    out["distance_matrix"] = spatial.distance_matrix(coordinate_array,
                                                     coordinate_array,
                                                     p=2)
    out["sequence"] = "".join(
        SeqUtils.IUPACData.protein_letters_3to1[name.capitalize()]
        for name in residue_names)
    out["pdb_id"] = mmcif_file.split(".")[0]
    out["chain_id"] = chain_id

    return out
示例#16
0
def parse(*,
          file_id: str,
          mmcif_string: str,
          catch_all_errors: bool = True) -> ParsingResult:
    """Parse the contents of an mmCIF file into a ParsingResult.

  Args:
    file_id: Identifier for this file; it is used as part of the key of the
      reported errors and stored on the resulting object.
    mmcif_string: The text of the mmCIF file.
    catch_all_errors: When True, any exception is recorded in the errors of
      the returned ParsingResult. When False, the exception is re-raised.

  Returns:
    A ParsingResult holding either the parsed object or the errors.
  """
    errors = {}
    try:
        parser = PDB.MMCIFParser(QUIET=True)
        full_structure = parser.get_structure('', io.StringIO(mmcif_string))
        first_model_structure = _get_first_model(full_structure)
        # The parser's _mmcif_dict carries useful fields that the Biopython
        # structure itself does not expose.
        parsed_info = parser._mmcif_dict  # pylint:disable=protected-access

        # Wrap singleton values so that every value is a list.
        for key, value in parsed_info.items():
            if not isinstance(value, list):
                parsed_info[key] = [value]

        header = _get_header(parsed_info)

        # The protein chains and, per chain, the first residue number in the
        # internal mmCIF numbering (likely but not guaranteed to be 1).
        valid_chains = _get_protein_chains(parsed_info=parsed_info)
        if not valid_chains:
            return ParsingResult(
                None, {(file_id, ''): 'No protein chains found in this file.'})
        seq_start_num = {
            chain_id: min(monomer.num for monomer in seq)
            for chain_id, seq in valid_chains.items()
        }

        # Walk over the atoms that have coordinates and fill two mappings:
        # - mmcif_to_author_chain_id: internal mmCIF chain id -> chain id
        #   used by the authors / Biopython.
        # - seq_to_structure_mappings: index into the sequence ->
        #   ResidueAtPosition.
        mmcif_to_author_chain_id = {}
        seq_to_structure_mappings = {}
        for atom in _get_atom_site_list(parsed_info):
            # Only the first model is processed at the moment.
            if atom.model_num != '1':
                continue

            mmcif_to_author_chain_id[
                atom.mmcif_chain_id] = atom.author_chain_id

            if atom.mmcif_chain_id not in valid_chains:
                continue

            # Biopython gives water the special hetflag W; the same is done
            # here so the hetflag can be used to look up a residue in the
            # Biopython structure by id.
            if atom.hetatm_atom != 'HETATM':
                hetflag = ' '
            elif atom.residue_name in ('HOH', 'WAT'):
                hetflag = 'W'
            else:
                hetflag = 'H_' + atom.residue_name

            if _is_set(atom.insertion_code):
                insertion_code = atom.insertion_code
            else:
                insertion_code = ' '

            position = ResiduePosition(chain_id=atom.author_chain_id,
                                       residue_number=int(
                                           atom.author_seq_num),
                                       insertion_code=insertion_code)
            seq_idx = int(
                atom.mmcif_seq_num) - seq_start_num[atom.mmcif_chain_id]
            chain_mapping = seq_to_structure_mappings.setdefault(
                atom.author_chain_id, {})
            chain_mapping[seq_idx] = ResidueAtPosition(
                position=position,
                name=atom.residue_name,
                is_missing=False,
                hetflag=hetflag)

        # Sequence positions without coordinates are recorded as missing.
        for chain_id, seq_info in valid_chains.items():
            chain_mapping = seq_to_structure_mappings[
                mmcif_to_author_chain_id[chain_id]]
            for idx, monomer in enumerate(seq_info):
                if idx in chain_mapping:
                    continue
                chain_mapping[idx] = ResidueAtPosition(position=None,
                                                       name=monomer.id,
                                                       is_missing=True,
                                                       hetflag=' ')

        # One-letter sequence per author chain; anything that does not map to
        # a single letter becomes 'X'.
        author_chain_to_sequence = {}
        for chain_id, seq_info in valid_chains.items():
            letters = (SCOPData.protein_letters_3to1.get(monomer.id, 'X')
                       for monomer in seq_info)
            author_chain_to_sequence[
                mmcif_to_author_chain_id[chain_id]] = ''.join(
                    letter if len(letter) == 1 else 'X' for letter in letters)

        mmcif_object = MmcifObject(
            file_id=file_id,
            header=header,
            structure=first_model_structure,
            chain_to_seqres=author_chain_to_sequence,
            seqres_to_structure=seq_to_structure_mappings,
            raw_string=parsed_info)

        return ParsingResult(mmcif_object=mmcif_object, errors=errors)
    except Exception as e:  # pylint:disable=broad-except
        errors[(file_id, '')] = e
        if not catch_all_errors:
            raise
        return ParsingResult(mmcif_object=None, errors=errors)
示例#17
0
# Script fragment: for every ligand row whose formula mentions iron, open the
# matching gzipped mmCIF file and walk the atoms of the named ligand residues.
# `pd`, `ligFile`, `d`, `gzip`, `os` and `PDB` are not defined in this fragment
# and must be provided by code outside it.
ligDF = pd.read_csv(ligFile)

# Keep only rows whose 'Ligand_Formula' contains the substring "Fe".
# Comparing with `== True` also turns NaN results (missing formulas) into
# False, so those rows are dropped rather than breaking the boolean mask.
feLig = ligDF[ligDF['Ligand_Formula'].str.contains("Fe") == True]


import csv

# Loop through feLig and access the mmCIF file of each row.
with open("dataOut.txt","w") as outFile:
    # CSV writer bound to dataOut.txt; no write call appears in the visible
    # part of this fragment.
    outWriter = csv.writer(outFile)

    for row in feLig.itertuples():
        # The structure file is looked up as <d>/mmcif/<lower-case PDB_ID>.cif.gz
        # and read as text.
        with gzip.open(os.path.join(d,"mmcif",row.PDB_ID.lower()+".cif.gz"),"rt") as handle:
            # print(row.PDB_ID,row.Chain_ID)
            STR = PDB.MMCIFParser(QUIET=True).get_structure(row.PDB_ID,handle)

            # First model of the structure, chain named by the row.
            chain = STR[0][row.Chain_ID]
            # Flat list of every atom ("A" level) in the whole structure.
            # NOTE(review): presumably used for the neighbour search mentioned
            # below; its use is not visible in this fragment.
            atom_list = PDB.Selection.unfold_entities(STR,"A")
            
            # print(type(row.Ligand_ID))
            # Gather the residues of this chain whose residue name equals the
            # row's Ligand_ID.
            ligands = [res for res in chain.get_list() if res.get_resname() == row.Ligand_ID]

            # Loop through the ligand atoms, get Fe coords, get nearest
            # neighbours of the Fe atoms.
            for lig in ligands:

                for atom in lig.get_atoms():
                    # print(atom.get_name()) 
                    # Select atoms whose name contains "FE".
                    # NOTE(review): the body of this `if` is missing from the
                    # fragment, so what happens to the selected atoms cannot
                    # be determined here.
                    if "FE" in atom.get_name():
from Bio import PDB

# Parse the 2DN1 entry from its mmCIF file (Windows-style relative path).
structure = PDB.MMCIFParser().get_structure("2DN1", "dn\\2dn1.cif")

# Residues 2 and 3 of chain A in the first model.
chain = structure[0]['A']
residue_1, residue_2 = chain[2], chain[3]

# Subtracting one Bio.PDB atom from another gives the distance between them.
dist = residue_1['CA'] - residue_2['CA']
print(dist)
示例#19
0
def _side_chain_chis(structure, model, chain, residue):
    """Return [chi1, chi2, chi3, chi4] of *residue* from the compute_chiN helpers."""
    return [
        compute_chi1(structure, model, chain, residue),
        compute_chi2(structure, model, chain, residue),
        compute_chi3(structure, model, chain, residue),
        compute_chi4(structure, model, chain, residue),
    ]


def _print_dihedral_row(label, model, chain, residue, phi, psi, omega, chis):
    """Print one row of dihedral values, every column right-justified to 8 chars."""
    columns = [
        label,
        str(model.id),
        chain.id,
        str(residue.id[1]),
        residue.resname,
        str(phi),
        str(psi),
        str(omega),
    ]
    columns.extend(str(chi) for chi in chis)
    print(''.join(column.rjust(8) for column in columns))


def compute_dihedrals(pdbfilename):
    """Print backbone and side-chain dihedrals for every residue of a structure.

    The file is opened with gzip when its name contains '.gz' (the last three
    characters are then dropped from the name). A name containing '.cif'
    selects the mmCIF parser, otherwise one containing '.pdb' selects the PDB
    parser. Matching is case-insensitive and substring-based.

    For each model and chain, one row is printed per non-hetero residue
    (residue.id[0] == ' '); hetero residues, which includes modified residues
    flagged 'H_xxx', are skipped. Columns are: file label (name minus its last
    four characters), model id, chain id, residue number, residue name, phi,
    psi, omega, chi1, chi2, chi3, chi4. Angles that cannot be defined (phi and
    omega of the first residue, psi of the last) are printed as 999.0. Chains
    with fewer than two such residues print nothing.

    :param pdbfilename: path to a .pdb/.cif file, optionally gzip-compressed
    :raises ValueError: if the name contains neither '.pdb' nor '.cif'
    :return: None
    """
    missing = 999.00  # placeholder for an undefined angle

    if '.gz' in pdbfilename.lower():
        handle = gzip.open(pdbfilename, 'rt')
        pdbfilename = pdbfilename[0:-3]
    else:
        handle = open(pdbfilename, 'r')

    # The structure is fully built by get_structure, so the handle can be
    # closed as soon as parsing is done (or fails).
    with handle:
        lowered = pdbfilename.lower()
        # '.cif' is tested first so that it wins when both markers are present.
        if '.cif' in lowered:
            parser = PDB.MMCIFParser(QUIET=True)
        elif '.pdb' in lowered:
            parser = PDB.PDBParser(QUIET=True)
        else:
            raise ValueError(
                "compute_dihedrals: cannot tell the file format (expected "
                "'.pdb' or '.cif' in the name): " + pdbfilename)
        structure = parser.get_structure("PDB", handle)

    label = pdbfilename[0:-4]

    for model in structure:
        for chain in model:
            residues = [res for res in chain if res.id[0] == ' ']
            if len(residues) < 2:
                continue

            # First residue: only psi is defined.
            head, after_head = residues[0], residues[1]
            psi = compute_psi(structure, model, chain, head, after_head)
            _print_dihedral_row(
                label, model, chain, head, missing, psi, missing,
                _side_chain_chis(structure, model, chain, head))

            # Interior residues: phi, psi and omega are all defined.
            for before, current, after in zip(residues, residues[1:],
                                              residues[2:]):
                phi = compute_phi(structure, model, chain, before, current)
                psi = compute_psi(structure, model, chain, current, after)
                omega = compute_omega(structure, model, chain, before, current)
                _print_dihedral_row(
                    label, model, chain, current, phi, psi, omega,
                    _side_chain_chis(structure, model, chain, current))

            # Last residue: psi is undefined.
            before, current = residues[-2], residues[-1]
            phi = compute_phi(structure, model, chain, before, current)
            omega = compute_omega(structure, model, chain, before, current)
            _print_dihedral_row(
                label, model, chain, current, phi, missing, omega,
                _side_chain_chis(structure, model, chain, current))
    return