Example #1
def main():
    """Run a prodigy_lig prediction for a single input structure."""
    args = _parse_arguments()
    fname, s_ext = splitext(basename(args.input_file))
    # pick a Biopython parser based on the input file extension
    if s_ext in {'.pdb', '.ent'}:
        parser = PDBParser(QUIET=1)
    elif s_ext == '.cif':
        parser = FastMMCIFParser(QUIET=1)
    else:
        raise ValueError('Unsupported file extension: {0}'.format(s_ext))

    with open(args.input_file) as in_file:
        # try to set electrostatics from input file if not provided by user
        electrostatics = args.electrostatics \
            if args.electrostatics or s_ext == '.cif' \
            else extract_electrostatics(in_file)
        prodigy_lig = ProdigyLig(parser.get_structure(fname, in_file),
                                 chains=args.chains,
                                 electrostatics=electrostatics,
                                 cutoff=args.distance_cutoff)

    prodigy_lig.predict()
    prodigy_lig.print_prediction('', args.verbose)

    if args.output_file is not None:
        output_file_name = splitext(prodigy_lig.structure.id)[0]
        output_file_name += "-processed.pdb"
        prodigy_lig.print_structure(output_file_name)
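The extension-based dispatch above is a common pattern around FastMMCIFParser. A minimal standalone sketch assuming only Biopython (the helper name get_parser and the path are illustrative, not part of prodigy_lig):

from os.path import basename, splitext
from Bio.PDB import PDBParser, FastMMCIFParser

def get_parser(path):
    """Return a quiet Biopython parser matching the file extension."""
    ext = splitext(basename(path))[1].lower()
    if ext in {'.pdb', '.ent'}:
        return PDBParser(QUIET=True)
    if ext == '.cif':
        return FastMMCIFParser(QUIET=True)
    raise ValueError('Unsupported structure format: {0}'.format(ext))

# usage (placeholder path):
# structure = get_parser('1abc.pdb').get_structure('1abc', '1abc.pdb')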
Example #2
    @classmethod
    def from_file(cls, filename, file_format="pdb"):
        """
        Initialize structure from PDB/mmCIF file

        Parameters
        ----------
        filename : str
            Path of file
        file_format : {"pdb", "cif"}, optional (default: "pdb")
            Format of structure (old PDB format or mmCIF)

        Returns
        -------
        ClassicPDB
            Initialized PDB structure
        """
        try:
            if file_format == "pdb":
                from Bio.PDB import PDBParser
                parser = PDBParser(QUIET=True)
            elif file_format == "cif":
                from Bio.PDB import FastMMCIFParser
                parser = FastMMCIFParser(QUIET=True)
            else:
                raise InvalidParameterError(
                    "Invalid file_format, valid options are: pdb, cif"
                )

            structure = parser.get_structure("", filename)
            return cls(structure)
        except FileNotFoundError as e:
            raise ResourceError(
                "Could not find file {}".format(filename)
            ) from e
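A hedged usage sketch for the classmethod above; ClassicPDB is the class named in the docstring, and the filename is a placeholder:

structure = ClassicPDB.from_file("model.cif", file_format="cif")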
Example #3
def call_fast_mmcif(f):
    '''
    Call function for mmCIF files (using the fast parser).
    '''

    if ".cif" in f:
        name = f.split('/')[-1].split('.')[0].upper()
        # open gzip-compressed files as a text-mode handle; Biopython accepts handles
        if ".gz" in f:
            f = gzip.open(f, 'rt')
        parser = FastMMCIFParser()
        structure = parser.get_structure(name, f)
        mmtf_encoder = MMTFEncoder()
        pass_data_on(input_data=structure,
                     input_function=biopythonInputFunction,
                     output_data=mmtf_encoder)
        return (name, mmtf_encoder)
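Biopython parsers accept open text handles as well as file paths, which is what makes the gzip branch above work. A minimal sketch, assuming a placeholder .cif.gz path:

import gzip
from Bio.PDB import FastMMCIFParser

parser = FastMMCIFParser(QUIET=True)
with gzip.open('1abc.cif.gz', 'rt') as handle:  # 'rt' yields a text-mode handle
    structure = parser.get_structure('1ABC', handle)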
Example #4
def retrieve_struct(pdb_id, chain_id, load_local=False):
    '''Retrieve and parse a structure via mmCIF (which covers more entries than the legacy PDB format).
    :param pdb_id: PDB ID of the protein
    :param chain_id: alphabetic chain identifier
    :return: CA coordinates and the structured (resolved) sequence
    '''
    import os
    import numpy as np
    from Bio.PDB import PDBList, FastMMCIFParser
    from Bio.PDB.Polypeptide import three_to_one
    pdbl = PDBList()
    pdbl.retrieve_pdb_file(pdb_id, file_format='mmCif', pdir='./')
    parser = FastMMCIFParser()
    structure = parser.get_structure(pdb_id, pdb_id.lower() + '.cif')
    os.remove(pdb_id.lower() + '.cif')
    chain = structure[0][chain_id]
    coords = []
    structured_sequence = ''
    for residue in chain:
        if 'CA' in residue and residue['CA'].is_disordered() == 0:
            coords.append(residue['CA'].get_coord())
            structured_sequence += three_to_one(residue.resname)
        else:
            print((residue.is_disordered(), residue.id))
    return np.array(coords), str(structured_sequence)
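A hedged call sketch; 1CRN (crambin) is used only as a small placeholder entry:

coords, seq = retrieve_struct('1CRN', 'A')
print(coords.shape, len(seq))  # one CA coordinate per structured residue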
Example #5
def parse_structure(spath):
    """Parses a PDB/cif structure"""

    if not os.path.isfile(spath):
        raise IOError('File not found: {0}'.format(spath))

    if spath.endswith(('pdb', 'ent')):
        parser = PDBParser(QUIET=True)
    elif spath.endswith('cif'):
        parser = FastMMCIFParser()
    else:
        raise ValueError('Format not supported ({0}). Must be .pdb/.ent or .cif'.format(spath))

    sname = os.path.basename(spath.split('.')[0])
    return parser.get_structure(sname, spath)
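FastMMCIFParser gains its speed by parsing essentially only the coordinate records; if header metadata matters, the slower MMCIFParser is the usual drop-in alternative. A minimal sketch:

from Bio.PDB import MMCIFParser

parser = MMCIFParser(QUIET=True)  # slower, but parses more than the atom records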
Example #6
def parse(filename, quiet=False):
    '''Parses a PDB file and returns an emap object.

    Parameters
    ----------
    filename: str
        Full path to the file to be parsed
    quiet: bool, optional
        Suppresses output when set to True

    Returns
    -------
    my_emap: :class:`~pyemap.emap`
        emap object ready for analysis
    '''
    try:
        parser = PDBParser()
        structure = parser.get_structure("protein", filename)
    except Exception:
        # fall back to mmCIF; write out a .pdb copy and re-parse it so the
        # rest of the pipeline works from a PDB representation
        parser = FastMMCIFParser()
        structure = parser.get_structure("protein", filename)
        io = PDBIO()
        fn = filename[:-4] + ".pdb"
        io.set_structure(structure)
        io.save(fn)
        parser = PDBParser()
        structure = parser.get_structure("protein", fn)
    chain_list = []
    num_models = len(list(structure.get_models()))
    if num_models < 1:
        raise RuntimeError("Unable to parse file.")
    for chain in structure[0].get_chains():
        chain_list.append(chain.id)
    non_standard_residue_list = []
    for res in structure[0].get_residues():
        if res.resname not in res_name_to_char:
            res.get_full_id()
            arom_res = res.copy()
            non_standard_residue_list.append(arom_res)
    custom_residue_list = process_custom_residues(non_standard_residue_list)
    if not quiet:
        print("Identified " + str(len(custom_residue_list)) +
              " non-protein ET active moieties.")
    my_emap = emap(filename, structure, custom_residue_list, chain_list)
    return my_emap
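Since parse is pyemap's top-level entry point, usage reduces to a single call (the filename is a placeholder):

import pyemap

my_emap = pyemap.parse('protein.pdb')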
Example #7
    def parse_structure(self, filepath_):
        """Parse a PDB/cif structure."""

        try:
            filepath = pathlib.Path(filepath_)
            filepath = filepath.resolve(strict=True)
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found: {filepath_}")

        if filepath.suffix in {".pdb", ".ent"}:
            parser = PDBParser()
        elif filepath.suffix in {".cif", ".mmcif"}:
            parser = FastMMCIFParser()
        else:
            raise ValueError(
                f"Unsupported input structure format: {filepath.suffix}")

        return parser.get_structure(filepath.name, str(filepath))
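The same suffix dispatch can be written as a lookup table. A minimal sketch under the assumption that only these four suffixes are supported (PARSER_BY_SUFFIX and make_parser are illustrative names, not from the original project):

from Bio.PDB import PDBParser, FastMMCIFParser

PARSER_BY_SUFFIX = {
    '.pdb': PDBParser,
    '.ent': PDBParser,
    '.cif': FastMMCIFParser,
    '.mmcif': FastMMCIFParser,
}

def make_parser(suffix):
    try:
        return PARSER_BY_SUFFIX[suffix]()  # instantiate on demand
    except KeyError:
        raise ValueError(f'Unsupported input structure format: {suffix}')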
Example #8
def loop_parsing(file_type, proteins, rep=10):
    """Benchmark structure parsing for a list of protein IDs with the parser matching file_type."""
    cwd = os.getcwd()
    if file_type == 'mmtf':
        parser = MMTFParser()
    elif file_type == 'fast_cif':
        parser = FastMMCIFParser()
    elif file_type == 'cif':
        parser = MMCIFParser()
    else:
        parser = PDBParser()
    for p in proteins:
        if file_type == "fast_cif": file_type = "cif"
        directory = "%s/%s/%s.%s" % (cwd, file_type, p, file_type)
        try:
            if file_type == 'mmtf':
                protein = parser.get_structure(directory)
            else:
                protein = parser.get_structure(random.randint(0, 100),
                                               directory)
        except Exception:
            print("Having trouble parsing %s" % (p))
            break
    return
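A hedged driver sketch for the benchmark above; the protein IDs are placeholders and the per-format directories are assumed to exist:

for fmt in ('pdb', 'cif', 'fast_cif', 'mmtf'):
    loop_parsing(fmt, ['1crn', '4hhb'])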
Example #9
def pdb_process(task, core, output):
    process_start_time = time.time()  # for simple profiling

    chunksize = task.chunksize + task.chunksize_offset * core  # chunksize: how many files to analyse before committing data to the database

    # assigning SQLalchemy connection
    engine = create_engine(task.dc_db)
    Base.metadata.bind = engine
    db_session = sessionmaker(bind=engine)
    session = db_session()
    str_parser = FastMMCIFParser(QUIET=1)

    print("Connected core %i, chunksize %i, offset %i" % (core, task.chunksize, task.chunksize_offset * core))

    # detail of console output
    verbose = task.verbose
    quiet = task.quiet

    # buffer holding data for submission to the database, plus a flag tracking whether the last submission succeeded. Handles the database potentially being locked by a parallel writer process.
    buffer = []
    flush = True
    flush_offset = task.flush_offset
    flush_offset_count = 0
    file_queue = task.file_queue[core]

    # loop to analyse individual structure files from file queue
    for pdb_count, (root, file) in enumerate(file_queue):
        # generate a PDB object holding all relevant information for a single parsed structure file
        pdb = PDB(os.path.join(root, file), core)
        task_summary = ""
        task_summary = task_summary + ("Processing file %s on core %i" % (pdb.path, core))

        # checks before structure analysis, including successful fast structure parsing
        if pdb.filesize > task.filesize_limit:  # throw out large files. TODO: separate filesize due to large assembly from structure factors
            task_summary = task_summary + "\n\tSize abort"
            pdb.pass_filesize = False
            pdb.abort = True
            print(task_summary)
        else:
            if task.verbose: task_summary = task_summary + "\n\tSize pass"
            pdb.pass_filesize = True
            try:
                pdb.structure = str_parser.get_structure(pdb.id, pdb.path)
                if task.verbose: task_summary = task_summary + "\n\tMMCIFParser pass"
                pdb.pass_structure = True
            except Exception:
                task_summary = task_summary + "\n\tMMCIFParser abort"
                pdb.pass_structure = False
                pdb.abort = True
                print(task_summary)

        # PDB object functions analysing distance data in a structure file and assigning relevant metadata for the database
        pdb.analyse(task)
        pdb.generate_dictionary()
        pdb.assign_dictionary_data()
        pdb.sqla_convert_distances()

        # hits from filtered_distances are preferred for the summary table (here: intermolecular hits)
        if len(pdb.filtered_distances) > 0:
            representative_distance = pdb.representative_distance_filtered()
        else:  # no intermolecular hits found or no hits found at all
            representative_distance = pdb.representative_distance_unfiltered()
        if representative_distance:
            pdb.top_hit_sqla(representative_distance)  # populate class with information for summary table

        pdb.sqla_summary()  # generate entry for summary table

        pdb_entries_wrap = pdb.alchemy_distances, pdb.alchemy_sum  # results for all distances per structure and single distance for summary table

        buffer.append(pdb_entries_wrap)  # append result to buffer holding data for submission to database
        if len(buffer) == chunksize:
            print("Core %i - %i tasks in %i seconds" % (core, pdb_count + 1, (time.time() - process_start_time)))
            flush = buffer_SQLal_dc_submission(session, buffer, core)  # returns if submission to database successful
            chunksize = task.chunksize  # remove offset after first time. This might not be needed

            if flush:
                flush_offset_count = 0
                buffer = []
                print("Core %i - commit successful" % core)

        if not flush:  # Handles incomplete submission of buffer due to busy database. This fix likely already made chunk_offset obsolete
            if flush_offset_count == flush_offset:  # Tries to submit again every flush_offset until it succeeds
                print("Core %i - not flush at %i tasks with %i tasks in buffer" % (core, pdb_count + 1, len(buffer)))
                flush_offset_count = 0
                flush = buffer_SQLal_dc_submission(session, buffer, core)
                if flush:
                    buffer = []
            flush_offset_count += 1

        # # # some text for console
        if not verbose:
            task_summary = task_summary + "\nFinished task %i on core %i\n" % (pdb_count, core)
        if verbose:
            task_summary = (task_summary + ("\n\t\t%i\tdistances\t" % len(pdb.distances) + str(pdb.distances)))
            task_summary = task_summary + "\n\t\t\t%i of %i distances intermolecular" % (
                pdb.inter_count, len(pdb.distances))
            task_summary = task_summary + "\n\t\t\t%i of %i distances below cutoff" % (
                pdb.dist_count, len(pdb.distances))
            task_summary = task_summary + "\n\t\t\t%i distances below cutoff and intermolecular" % (pdb.hit_count)
            task_summary = task_summary + "\nFinished task %i on core %i\n" % (pdb_count, core)
        if not quiet: print(task_summary)

    #  Final commit for incomplete buffer at end of queue (remainder from chunk size)
    for i in range(0, 10):
        flush = buffer_SQLal_dc_submission(session, buffer, core)  # final commit attempted 10 times
        if not flush:
            print("Failed to commit final chunk of size %i on core %i. Waiting 60s" % (len(buffer), core))
            time.sleep(60)
        else:
            print("Committed final chunk of size %i on core %i." % (len(buffer), core))
            break
    if not flush:
        print("Failed to flush after 10 min. Attempting single submission")
        for i in range(0, 10):
            buffer = single_buffer_SQLal_dc_submission(session, buffer, core)  # final commit attempted 10 times
            flush = len(buffer) == 0

            if not flush:
                print("Failed single commit of final chunk of size %i on core %i. Waiting 60s" % (len(buffer), core))
                time.sleep(60)
            else:
                print("Committed all chunks on core %i." % (core))
                break
    if not flush:  # writes error to logfile upon failure to commit final entries after 2*10 attempts
        with open(task.dc_error_log, "a+") as outfile:
            for entry in buffer:
                outfile.write("{}\t{}\n".format(entry[1].pdb_id, core))

    output.put("--- Core %s finished %s tasks in %s seconds ---" % (
        core, len(file_queue), (time.time() - process_start_time)))
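The buffered-commit-with-retry pattern above, reduced to a standalone sketch; submit stands in for buffer_SQLal_dc_submission and is assumed to return True on a successful commit:

import time

def flush_with_retry(submit, buffer, attempts=10, wait=60):
    """Try to commit a buffer several times, sleeping between failures."""
    for _ in range(attempts):
        if submit(buffer):
            return True
        time.sleep(wait)
    return False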
Example #10
import sys
import os
import json
import re
from pathlib import Path
from Bio.PDB.ResidueDepth import residue_depth
from Bio.PDB.Structure import Structure
from Bio.PDB.Model import Model
from Bio.PDB.Atom import Atom
from Bio.PDB.Chain import Chain
from Bio.PDB.Residue import Residue
from ciftools.Structure import fetchStructure
from dotenv import load_dotenv
from Bio.PDB import FastMMCIFParser

from functools import reduce

parser = FastMMCIFParser(QUIET=True)
struct: Model = parser.get_structure('3j7z', '3J7Z.cif')[0]  # [0] selects the first model

with open("3J7Z_TUNNEL_REPORT.json") as infile:
    tunnel = json.load(infile)

strands = tunnel['adjacent_strands']

charge_from_P = 0
charge_from_GLU_ASP = 0
charge_from_ARG_LYS = 0

p_global = []

for strand in strands:
    chain: Chain = struct[strand]
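    # (snippet truncated in the source; what follows is a hedged sketch of a
    #  plausible loop body, based only on the counters defined above --
    #  the per-residue charge rules are assumptions, not the original code)
    for residue in chain:
        resname = residue.get_resname()
        if resname in ('GLU', 'ASP'):
            charge_from_GLU_ASP -= 1  # acidic side chains: assumed -1 each
        elif resname in ('ARG', 'LYS'):
            charge_from_ARG_LYS += 1  # basic side chains: assumed +1 each
        for atom in residue:
            if atom.element == 'P':
                charge_from_P -= 1    # phosphate groups: assumed -1 each
                p_global.append(atom.get_coord())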