Пример #1
0
def load_from_one_cath_pml_file(pml_file, scratch_path, superfamilies,
                                dssp_path):
    '''Load data from a .pml file of superposed
  homologous superfamilies from the CATH database.

    A new (initially empty) list is appended to ``superfamilies``; every
    structure of the file that has no chain breaks and whose secondary
    structures can be determined is added to that list as a dict with the
    keys ``'structure'`` and ``'path'``.

    Args:
        pml_file: path of the .pml file to read.
        scratch_path: directory where one ``<pdb_id>.pdb`` file is written
            per structure without chain breaks.
        superfamilies: list that is extended in place with the new family.
        dssp_path: forwarded unchanged to ``find_secondary_structures``.
    '''
    family = []
    superfamilies.append(family)
    candidate_proteins = []

    with open(pml_file, 'r') as f:
        for line in f:
            # Each structure starts with a `cmd.read_pdbstr("""...` line
            if not line.strip().startswith('cmd.read_pdbstr'):
                continue

            # Text after the 19-character `cmd.read_pdbstr("""` prefix is
            # the first PDB line of the record.
            pdb_lines = [line.strip()[19:].strip('\\')]
            pdb_id = None

            # The inner loop consumes lines from the same file iterator
            for line in f:
                stripped = line.strip()
                if stripped.startswith('"""'):
                    # Closing line of the record carries the identifier
                    pdb_id = stripped[5:12]
                    break

                pdb_line = stripped.strip('\\')
                if len(pdb_line) > 17:
                    # Blank out column 17 to remove all altLoc flags
                    pdb_line = pdb_line[0:16] + ' ' + pdb_line[17:]

                pdb_lines.append(pdb_line)

            if pdb_id is None:
                # End of file reached before the closing `"""`: the record
                # is truncated, so discard it instead of looping forever.
                break

            structure = structure_from_pdb_string('\n'.join(pdb_lines),
                                                  pdb_id)

            # Store structures without chain breaks

            if len(topology.find_structure_chain_breaks(structure)) == 0:
                # Make a pdb file of the structure for DSSP analysis
                structure_path = os.path.join(scratch_path,
                                              pdb_id + '.pdb')

                io = PDB.PDBIO()
                io.set_structure(structure)
                io.save(structure_path)

                candidate_proteins.append({
                    'structure': structure,
                    'path': structure_path
                })

    for p in candidate_proteins:
        try:
            find_secondary_structures(p, dssp_path)
        except Exception:
            # Best effort: skip proteins whose analysis fails, but let
            # KeyboardInterrupt / SystemExit propagate.
            continue
        # Add a protein to a superfamily if there's no exception
        family.append(p)
Пример #2
0
def preparePdb(pdb_fname, out_pdb_fname):
    ''' Prepare the PDB file with only first model and redundancies cut out.

    The input (optionally gzip-compressed, recognised by a ``.gz`` suffix)
    is copied to a working file, parsed with Bio.PDB, reduced to its first
    model, written back and moved to ``out_pdb_fname``.

    Args:
        pdb_fname: path of the input PDB file (plain or ``.gz``).
        out_pdb_fname: path where the curated PDB file is written.

    Returns:
        The absolute path of the written output file.

    Raises:
        IOError: if ``pdb_fname`` does not exist.
    '''
    # 'Absolutize' the path names - rest is done in the temporary dir
    pdb_fname = os.path.abspath(pdb_fname)
    if not os.path.exists(pdb_fname):
        raise IOError('%s does not exist' % pdb_fname)
    out_pdb_fname = os.path.abspath(out_pdb_fname)
    # Inside the temporary dir
    # NOTE(review): presumably tempDir() changes the working directory,
    # since the working file below is addressed by a relative name - confirm.
    with tempDir():
        # Temporary name for the curated input file
        new_pdb_fname = 'query.pdb'
        # Copy the original PDB byte-for-byte, unpacking it if it is gzipped.
        # Both ends are binary so that gzip's bytes are never written to a
        # text-mode file.
        opener = gzip.open if pdb_fname.endswith('.gz') else open
        with opener(pdb_fname, 'rb') as rfh, open(new_pdb_fname, 'wb') as wfh:
            shutil.copyfileobj(rfh, wfh)
        # Parse structure
        # Redirect standard output/error to a StringIO,
        # so that PDBParser stops messing the output
        parser = Bio.PDB.PDBParser()
        err_fh = io.StringIO()
        saved_streams = sys.stdout, sys.stderr
        sys.stdout = sys.stderr = err_fh
        try:
            struct = parser.get_structure('query', new_pdb_fname)
        finally:
            # Always restore the streams, even if parsing fails
            sys.stdout, sys.stderr = saved_streams
        # Output formatted info about PDBParser's work to a log
        s = err_fh.getvalue()
        if s.strip():
            logging.info(
                "Structure parsing generated following error message(s): \n%s\n%s\n%s",
                '-' * 120, s, '-' * 120)
        # By default use only first model
        model = struct[0]
        del struct.child_list[1:]
        # Check for discontinuities greater than 5 residues - warn about this _specifically_
        for chain in model:
            chid = chain.id
            last_rid = None
            for residue in chain:
                rid = residue.id[1]
                if last_rid is not None and rid > last_rid + 5:
                    logging.warning(
                        "Residues %s:%s-%s:%s. Results might be inaccurate, as the break in a protein chain numbers more than 5 residues.",
                        last_rid, chid, rid, chid)
                # Track every residue so consecutive numbers are compared
                last_rid = rid
        # Save the curated structure.  The writer is not called `io`: that
        # would make `io` a local name and break `io.StringIO()` above.
        pdb_io = Bio.PDB.PDBIO()
        pdb_io.set_structure(struct)
        pdb_io.save(new_pdb_fname)
        shutil.move(new_pdb_fname, out_pdb_fname)
        return out_pdb_fname
Пример #3
0
def save_structure(struct, name):
    """Write ``struct`` to ``<name>.pdb`` and prepend the text from ``header()``.

    The structure is first saved with ``PDBIO``; the file is then read back
    and rewritten as ``header()`` followed by the saved contents.
    """
    out_path = '{}.pdb'.format(name)

    writer = PDBIO()
    writer.set_structure(struct)
    writer.save(out_path)
    del writer

    with open(out_path, 'r') as fh:
        body = fh.read()

    # Build the full text before reopening, so the file is only truncated
    # once the new contents are ready.
    content = header() + body

    with open(out_path, 'w') as fh:
        fh.write(content)
Пример #4
0
def get_sequence(pdb, chain):
    """Save a positional window of one chain to ``<sys.argv[5]>_segment.pdb``.

    The structure is read from ``<pdb>.pdb``.  Residues of ``chain`` in the
    first model are counted from 1 in iteration order (not by residue
    number); those whose position lies outside the inclusive window
    ``[int(sys.argv[3]), int(sys.argv[4])]`` are detached before saving.

    Args:
        pdb: structure identifier; also the input file name without ".pdb".
        chain: identifier of the chain to extract.
    """
    # PERMISSIVE=0 is the strict mode: problems found while building the
    # structure raise instead of being tolerated.
    pdb_parser = PDBParser(PERMISSIVE=0)
    pdb_structure = pdb_parser.get_structure(pdb, pdb + ".pdb")
    pdb_chain = pdb_structure[0][chain]

    # Parse the window bounds once instead of on every iteration.
    first = int(sys.argv[3])
    last = int(sys.argv[4])

    # Collect the ids first: detaching while iterating would mutate the
    # chain that is being traversed.
    outside = [
        residue.get_id()
        for position, residue in enumerate(pdb_chain, start=1)
        if position < first or position > last
    ]
    for residue_id in outside:
        pdb_chain.detach_child(residue_id)

    io = PDBIO()
    io.set_structure(pdb_chain)
    output = sys.argv[5] + "_segment.pdb"
    io.save(output)
Пример #5
0
def extract_ligands(path):
    """Extract the heteroatom residues of the .pdb files under ``path + 'pdbs/'``.

    Every file ending in ``.pdb`` whose name does not start with ``lig_`` is
    parsed; each residue of its first model for which ``is_het`` is true is
    saved on its own as ``lig_<code>_<n>.pdb`` (a relative path, so it lands
    in the current working directory), with ``n`` counting from 1 per file.
    """
    pdb_dir = path + 'pdbs/'

    for entry in os.listdir(pdb_dir):
        # Skip non-PDB files and previously extracted ligand files
        if not entry.endswith('.pdb') or entry.startswith("lig_"):
            continue

        code = entry[:-4]
        structure = PDBParser().get_structure(code, pdb_dir + entry)
        writer = PDBIO()
        writer.set_structure(structure)

        counter = 1
        # Only the first model is inspected
        for chain in structure[0]:
            for residue in chain:
                if is_het(residue):
                    print(f"saving {chain} {residue}")
                    writer.save(f"lig_{code}_{counter}.pdb",
                                ResidueSelect(chain, residue))
                    counter += 1
Пример #6
0
    def download_pdb(self, info):
        """Download a PDB entry and write one of its chains to its own file.

        Args:
            info: a ``(pdb_id, chain_id)`` pair.

        Returns:
            The path ``<work_dir>/<pdb_id>_<chain_id>.pdb`` of the written
            chain file, or ``None`` if the entry could not be downloaded or
            parsed.

        Side effects:
            Files are created in ``self.work_dir``: the downloaded entry, the
            ``.mod.ent`` file written by ``ReplacePDBModifiedAA``, a
            ``.mod.ent.remark`` file holding its ``REMARK`` lines, and the
            chain file (to which those ``REMARK`` lines are appended).
        """
        pdb_id, chain_id = info

        ## Check if atom has alternative position, if so, keep 'A' position and remove the flag
        ## but somehow this class doesn't seem to function well
        class NotDisordered(Select):
            def accept_atom(self, atom):
                if not atom.is_disordered() or atom.get_altloc() == 'A':
                    atom.set_altloc(' ')
                    return True
                return False

        ## BioPython downloads PDB but it gives a lowercase name in pdb{}.ent format
        biopdb_name = '{0}/pdb{1}.ent'.format(self.work_dir, pdb_id.lower())
        biopdb_modf = '{0}/pdb{1}.mod.ent'.format(self.work_dir,
                                                  pdb_id.lower())
        remark_name = '{0}.remark'.format(biopdb_modf)
        chain_pdb = '{0}/{1}_{2}.pdb'.format(self.work_dir, pdb_id, chain_id)

        if not os.path.isfile(biopdb_modf):
            try:
                PDB.PDBList(verbose=False).retrieve_pdb_file(
                    pdb_id,
                    pdir=self.work_dir,
                    obsolete=False,
                    file_format='pdb')
            except FileNotFoundError:
                print(
                    '  \033[31m> ERROR: BioPython cannot download PDB: \033[0m'
                    + pdb_id)
                return None

        ## Replace modified AA to avoid mis-recognition in biopython readin
        # NOTE(review): this runs even when the .mod.ent file already exists
        # and the download was skipped - confirm that is intended.
        ReplacePDBModifiedAA(biopdb_name, biopdb_modf)

        # Collect the lines containing "REMARK  " in pure Python rather than
        # through a shell `grep`, so paths with spaces or shell
        # metacharacters are handled and no external tools are needed.
        with open(biopdb_modf, 'r') as fi:
            remark_lines = [
                l if l.endswith('\n') else l + '\n' for l in fi
                if 'REMARK  ' in l
            ]
        with open(remark_name, 'w') as fo:
            fo.writelines(remark_lines)

        ## Read the PDB file and extract the chain from structure[0]
        try:
            model = PDB.PDBParser(PERMISSIVE=1,
                                  QUIET=1).get_structure(pdb_id,
                                                         biopdb_modf)[0]
        except KeyError:
            print('  \033[31m> ERROR: BioPython cannot read in PDB: \033[0m' +
                  biopdb_modf)
            return None
        except ValueError:
            print('  \033[31m> ERROR: PDB file is empty: \033[0m' +
                  biopdb_modf)
            return None

        ### Bug alert: as of 20.02.18, Biopython dev hasn't come up with good
        ### strategy to fix the 'atom.disordered_get_list()' issue with alternative
        ### position of residue side chains. To go around this, will physically
        ### remove "B" variant and keep only "A" variant in
        io = PDB.PDBIO()
        io.set_structure(model[chain_id])
        io.save(chain_pdb, select=NotDisordered())

        # Attach REMARK to end of PDB as safekeeping.  Appending in place
        # replaces the former `cat`/`mv` round trip through a temporary file
        # in the current working directory.
        with open(chain_pdb, 'a') as fo:
            fo.writelines(remark_lines)

        return chain_pdb
Пример #7
0
def prepareWithHydrogens(pdb_fname, out_pdb_fname="wth_hydro.pdb"):
    ''' Prepare the PDB file with hydrogen data (clean up and create a new one).

    The input (plain or gzip-compressed, recognised by a ``.gz`` suffix) is
    parsed with Bio.PDB, reduced to its first model, passed to
    ``remakeHydrogens`` and written to ``out_pdb_fname`` (gzip-compressed
    when that name ends in ``.gz``).

    Args:
        pdb_fname: path of the input PDB file.
        out_pdb_fname: path of the output PDB file.

    Returns:
        The absolute path of the written output file.

    Raises:
        IOError: if ``pdb_fname`` does not exist or is not a file.
    '''
    # 'Absolutize' the path names
    pdb_fname = os.path.abspath(pdb_fname)
    if not os.path.isfile(pdb_fname):
        raise IOError('%s does not exist or is not a file.' % pdb_fname)
    out_pdb_fname = os.path.abspath(out_pdb_fname)
    # Parse structure
    parser = Bio.PDB.PDBParser()
    # Redirect standard output/error to a StringIO,
    # so that PDBParser stops messing the output
    err_fh = io.StringIO()
    # Text mode ('rt') so the parser receives str lines from gzip as well
    opener = gzip.open if pdb_fname.endswith('.gz') else open
    with opener(pdb_fname, 'rt') as rfh:
        saved_streams = sys.stdout, sys.stderr
        sys.stdout = sys.stderr = err_fh
        try:
            struct = parser.get_structure('query', rfh)
        finally:
            # Restore streams
            sys.stdout, sys.stderr = saved_streams
    # Output formatted info about PDBParser's work to a logger
    s = err_fh.getvalue()
    if s.strip():
        logging.info(
            "Structure parsing generated following error message(s): \n%s\n%s\n%s",
            '-' * 120, s, '-' * 120)
    # By default use only first model
    # ... delete the rest
    model = struct[0]
    del struct.child_list[1:]
    # Check for discontinuities greater than 5 residues - warn about this _specifically_ (into the logger, again)
    for chain in model:
        chid = chain.id
        last_rid = None
        for residue in chain:
            rid = residue.id[1]
            if last_rid is not None and rid > last_rid + 5:
                logging.warning(
                    "Residues %s:%s-%s:%s. Results might be inaccurate, as the break in a protein chain numbers more than 5 residues.",
                    last_rid, chid, rid, chid)
            # Track every residue so consecutive numbers are compared
            last_rid = rid
    # Prepare the remade hydrogens
    remakeHydrogens(struct)
    # Save structure.  The writer is not called `io`: that would make `io` a
    # local name and break `io.StringIO()` above.
    pdb_io = Bio.PDB.PDBIO()
    pdb_io.set_structure(struct)
    if out_pdb_fname.endswith('.gz'):
        # PDBIO writes str, so the gzip stream must be opened in text mode
        with gzip.open(out_pdb_fname, 'wt') as wfh:
            pdb_io.save(wfh)
    else:
        pdb_io.save(out_pdb_fname)
    return out_pdb_fname
Пример #8
0
def prepareWithHydrogensPrep23(pdb_fname, out_pdb_fname="wth_hydro.pdb"):
    ''' Prepare the PDB file with hydrogen data (clean up and create a new one).

    The input (plain or gzip-compressed, recognised by a ``.gz`` suffix) is
    copied to a working file, parsed with Bio.PDB, reduced to its first
    model, saved through ``NoHydroSelect`` and handed to the executable
    returned by ``_preparePrepExec``.  ATOM lines of that program's output
    get fixed trailing columns appended before the result is moved to
    ``out_pdb_fname``.

    Args:
        pdb_fname: path of the input PDB file.
        out_pdb_fname: path of the output PDB file.

    Returns:
        The absolute path of the written output file.

    Raises:
        IOError: if ``pdb_fname`` does not exist or is not a file.
        RuntimeError: if the preparation executable exits with a non-zero
            status.
    '''
    # 'Absolutize' the path names - rest is done in the temporary dir
    pdb_fname = os.path.abspath(pdb_fname)
    if not os.path.isfile(pdb_fname):
        raise IOError('%s does not exist or is not a file.' % pdb_fname)
    out_pdb_fname = os.path.abspath(out_pdb_fname)
    # Inside the temporary dir
    # NOTE(review): presumably tempDir() changes the working directory,
    # since all working files below use relative names - confirm.
    with tempDir():
        # Temporary names for curated input and output files
        new_pdb_fname = 'query.pdb'
        out_tmp_fname = 'out.pdb'
        # Prepare the sources
        prep_exec = _preparePrepExec()
        # Copy the original file into our temporary directory, unpacking it
        # if it is gzipped.  Both ends are binary so that gzip's bytes are
        # never written to a text-mode file.
        opener = gzip.open if pdb_fname.endswith('.gz') else open
        with opener(pdb_fname, 'rb') as rfh, open(new_pdb_fname, 'wb') as wfh:
            shutil.copyfileobj(rfh, wfh)
        # Parse structure
        # Redirect standard output/error to a StringIO,
        # so that PDBParser stops messing the output
        parser = Bio.PDB.PDBParser()
        err_fh = io.StringIO()
        saved_streams = sys.stdout, sys.stderr
        sys.stdout = sys.stderr = err_fh
        try:
            with open(new_pdb_fname, 'r') as rfh:
                struct = parser.get_structure('query', rfh)
        finally:
            # Always restore the streams, even if parsing fails
            sys.stdout, sys.stderr = saved_streams
        # Output formatted info about PDBParser's work to a log
        s = err_fh.getvalue()
        if s.strip():
            logging.info(
                "Structure parsing generated following error message(s): \n%s\n%s\n%s",
                '-' * 120, s, '-' * 120)
        # By default use only first model
        model = struct[0]
        del struct.child_list[1:]
        # Check for discontinuities greater than 5 residues - warn about this _specifically_
        for chain in model:
            chid = chain.id
            last_rid = None
            # Curate disordered residues keeping only the last
            chain.child_list = list(chain)
            chain.child_dict = {residue.id: residue for residue in chain}
            for residue in chain:
                # Curate disordered atoms keeping only the last
                residue.child_list = list(residue)
                residue.child_dict = {a.id: a for a in residue}
                rid = residue.id[1]
                if last_rid is not None and rid > last_rid + 5:
                    logging.warning(
                        "Residues %s:%s-%s:%s. Results might be inaccurate, as the break in a protein chain numbers more than 5 residues.",
                        last_rid, chid, rid, chid)
                # Track every residue so consecutive numbers are compared
                last_rid = rid
        # Save structure without hydrogens.  The writer is not called `io`:
        # that would make `io` a local name and break `io.StringIO()` above.
        pdb_io = Bio.PDB.PDBIO()
        pdb_io.set_structure(struct)
        pdb_io.save(new_pdb_fname, NoHydroSelect())
        # Run the preparation executable on the newly created PDB file
        # NOTE(review): kept as a shell string because prep_exec may itself
        # be a command line with arguments; all file names here are fixed.
        if (subprocess.call("%s %s %s 1>tmp.out 2>tmp.err" %
                            (prep_exec, new_pdb_fname, out_tmp_fname),
                            shell=True) != 0):
            raise RuntimeError(
                'Could not prepare corrected structure file for %s' %
                pdb_fname)
        # Fix the occupancies (creating the last and final temporary PDB file)
        final_fn = "final.pdb"
        with open(out_tmp_fname, 'r') as rfh, open(final_fn, 'w') as wfh:
            for line in rfh:
                if line.startswith('ATOM'):
                    # Drop the line terminator, then append the fixed columns
                    wfh.write(line[:-1] + "  0.00  0.00           C\n")
                else:
                    wfh.write(line)
        # Move the output file to the desired location
        shutil.move(final_fn, out_pdb_fname)
        return out_pdb_fname