Exemplo n.º 1
0
def get_missing_sidechains(pdb_dataset, output_scwrl):
    """Write a one-letter sequence flagging residues with an unexpected atom count.

    Every structure file listed by ``db.get_structures_filenames(pdb_dataset)``
    is parsed and walked model by model, chain by chain. A residue whose name
    is not a key of the module-level ``expected`` lookup is skipped with a
    warning. Every other residue contributes its one-letter code: upper case
    when its atom count differs from ``expected[resname]``, lower case when it
    matches.

    Note that letters from all models and chains of a structure are
    concatenated into a single string, and that ``output_scwrl`` is reopened
    in write mode for each structure, so it ends up holding only the sequence
    of the last structure processed.

    Args:
        pdb_dataset: Dataset handle passed to ``db.get_structures_filenames``.
        output_scwrl: Path of the text file the sequence is written to.
    """
    for structure_path in db.get_structures_filenames(pdb_dataset):
        structure = db.parse_biopython_structure(structure_path)
        structure_name = db.get_pdb_name(structure_path)
        logging.info("Processing {:}".format(structure_name))

        incomplete_total = 0
        letters = []
        for model in structure:
            for chain in model:
                # The position index restarts at 0 for every chain.
                for position, residue in enumerate(chain):
                    name3 = residue.resname
                    if name3 not in expected:
                        logging.warning("Non-standard residue found: {:}. "
                                        "Skipping.".format(name3))
                        continue

                    letter = poly.three_to_one(name3)
                    seq_number = residue.id[1]
                    atom_total = len(
                        Bio.PDB.Selection.unfold_entities(residue, 'A'))
                    wanted_total = expected[name3]

                    if atom_total == wanted_total:
                        # Complete residue: lower-case letter.
                        letters.append(letter.lower())
                        continue

                    # Atom count mismatch: report it and emit an upper-case
                    # letter.
                    logging.debug(
                        "Missing residue {:} at position {:} (with id {:})"
                        " which has {:} instead of the expected {:} atoms."
                        .format(name3, position, seq_number, atom_total,
                                wanted_total))
                    incomplete_total += 1
                    letters.append(letter.upper())

        logging.debug("Missing {:} residue total".format(incomplete_total))
        with open(output_scwrl, 'w') as f:
            f.write("".join(letters))
Exemplo n.º 2
0
def _generate_reference(pdb_filename, s2r_chain, s2r_res, output_filename,
                        style):
    """Transform PDB structure to a reference structure.

    Each (chain, residues) pair yielded by
    ``struct.get_chain_to_valid_residues`` is copied into a new chain whose id
    is ``s2r_chain[chain]``. Segment ids are blanked on every residue. When
    ``s2r_res`` has an entry for the chain, residues are additionally
    renumbered from it and hetero residues are dropped. The result is saved
    as a single-model structure to ``output_filename``.

    Args:
        pdb_filename: Path of the structure file to read.
        s2r_chain: Mapping from structure chain to reference chain id.
        s2r_res: Mapping from structure chain to an indexable of new residue
            numbers, indexed by the residue's position in ``residues``.
        output_filename: Path the transformed PDB file is written to.
        style: Dataset style; with ``'dockground'``, chains absent from
            ``s2r_chain`` are silently dropped.

    Raises:
        KeyError: If a chain is missing from ``s2r_chain`` and ``style`` is
            not ``'dockground'``.
    """
    biopy_structure = db.parse_biopython_structure(pdb_filename)
    pdb_name = db.get_pdb_name(pdb_filename)

    new_model = Bio.PDB.Model.Model('0')
    new_structure = Bio.PDB.Structure.Structure('')
    for (chain, residues) in \
            struct.get_chain_to_valid_residues(biopy_structure, pdb_name):
        if style == 'dockground' and chain not in s2r_chain:
            # If we are in dockground, we allow ourselves to remove unmapped
            # chains.
            continue
        ref_chain = s2r_chain[chain]
        new_chain = Bio.PDB.Chain.Chain(ref_chain)

        if chain in s2r_res:
            # If we have an alignment for this chain, renumber residues.
            for i, residue in enumerate(residues):
                if residue.id[0] != ' ':
                    # Hetero residue: skipped, but it still consumes index i
                    # of the alignment.
                    continue
                residue.segid = ""
                residue.id = (' ', s2r_res[chain][i], residue.id[2])
                new_chain.add(residue)
        else:
            # Else, just remove segment ID. The residues must still be added
            # to the new chain, otherwise an empty chain would be written.
            for residue in residues:
                residue.segid = ""
                new_chain.add(residue)
        new_model.add(new_chain)

    new_structure.add(new_model)
    w = Bio.PDB.PDBIO()
    w.set_structure(new_structure)
    w.save(output_filename)
Exemplo n.º 3
0
def parse_structure(structure_filename, concoord=False, one_model=False):
    """Parse a structure file into a per-atom pandas DataFrame.

    Two input kinds are supported, selected by file extension:

    * ``.pkl``: a pickled DataFrame (as written by protprep). Its ``model``
      column is overwritten with ``get_model(structure_filename)`` and rows
      whose ``maestro_atom_name`` starts with ``'H'`` are dropped. The
      ``concoord`` and ``one_model`` flags are ignored for this kind.
    * anything else: parsed through ``db.parse_biopython_structure`` and
      flattened to one row per atom of every non-hetero residue that has a
      ``CA`` atom.

    Args:
        structure_filename: Path of the file to parse.
        concoord: If True, relabel every model with a model id derived from
            the file name: the first run of digits in the second
            ``'_'``-separated field of the PDB name, plus one.
        one_model: If True, keep only the model with id ``0``.

    Returns:
        pandas.DataFrame. For non-pickle input the columns are ``pdb_name``,
        ``model``, ``chain``, ``residue`` (sequence number concatenated with
        the insertion code), ``resname``, ``x``, ``y``, ``z``, ``element``
        (first character of the atom id), ``atom_name`` and ``aid``.
    """
    _, ext = os.path.splitext(structure_filename)
    if ext == '.pkl':
        # Detailed input: a pandas pickle file outputted by protprep.
        # NOTE: unpickling can execute arbitrary code; only load trusted
        # files.
        df = pd.read_pickle(structure_filename)
        # The model label is taken from get_model() rather than from the
        # pickle, because a multi-model file was either already split into
        # separate files (using the split command) or was pared down to a
        # single model by the autodock portion of the protprep pipeline.
        # This might need to be revisited if/when autodock is removed from
        # pipeline or we decide to actually keep track of correct model.
        df['model'] = get_model(structure_filename)
        # Remove hydrogens, for now, to maintain compatibility.
        df = df[df['maestro_atom_name'].apply(lambda x: x.strip()[0]) != 'H']
        return df

    # BioPython.PDB Structure extracted from PDB file.
    biopy_structure = db.parse_biopython_structure(structure_filename)
    pdb_name = db.get_pdb_name(structure_filename)
    if concoord:
        # Need to set model number to be correct (drawn from filename).
        # TODO: I (Raphael) moved this out of core Structure code, need to
        # make sure it is correct still for CONCOORD.
        chainmodel = pdb_name.split('_')[1]
        model_id = str(int(re.split(r'(\d+)', chainmodel)[1]) + 1)

        # Build the relabelled copy in a separate structure: iterating the
        # parsed structure while replacing it would lose all of its models.
        relabeled = Bio.PDB.Structure.Structure(biopy_structure.id)
        for model_obj in biopy_structure:
            # Every model receives the same id, so this presumably expects a
            # single-model file -- TODO confirm.
            new_model = Bio.PDB.Model.Model(model_id)
            for chain in model_obj:
                new_model.add(chain)
            relabeled.add(new_model)
        biopy_structure = relabeled

    if one_model:
        new_structure = Bio.PDB.Structure.Structure(biopy_structure.id)
        new_structure.add(biopy_structure[0])
        biopy_structure = new_structure

    atoms = []
    for residue in Bio.PDB.Selection.unfold_entities(biopy_structure, 'R'):
        # Prune out things that aren't actually residue atoms.
        if 'CA' in residue and residue.get_id()[0] == ' ':
            atoms.extend(residue)

    rows = []
    for atom in atoms:
        residue = atom.get_parent()
        res_id = residue.get_id()
        coord = atom.get_coord()
        rows.append((
            pdb_name,
            # residue -> chain -> model
            str(residue.get_parent().get_parent().serial_num),
            residue.get_full_id()[2],
            str(res_id[1]) + res_id[2],
            residue.get_resname(),
            coord[0], coord[1], coord[2],
            atom.get_id()[0], atom.get_name(), str(atom.serial_number)))

    return pd.DataFrame(
        rows,
        columns=[
            'pdb_name', 'model', 'chain', 'residue', 'resname', 'x', 'y',
            'z', 'element', 'atom_name', 'aid'
        ])