def main():
    """
    main for getnrpdblist.py

    Usage: getnrpdblist.py
              Input is CD-HIT cluster file on stdin
              Output is list of PDB identifiers on stdout
              Each has format:
              <repr-id>: <list of ids in cluster>

           where <repr-id> is the PDB (+ chain e.g. 2ssp_B) id in pdb_seqres
           format that has been chosen as the representative for the
           cluster and following is list of pdb_seqres identifiers in that
           cluster. Only proteins are output, not DNA, RNA, etc.
           (see module docstring at top of file).

    The script takes no command line arguments; if any are given,
    usage() is called (presumably it prints a message and exits --
    its definition is not visible here).
    """
    # Function-scope import so this block does not depend on the module's
    # import list.
    import tempfile

    if len(sys.argv) > 1:
        usage(sys.argv[0])

    # mkdtemp() picks the name and creates the directory in one step, so
    # there is no window between choosing the name and creating it as there
    # was with os.tempnam() followed by os.mkdir().
    TMPDIR = tempfile.mkdtemp(prefix="pdbgz")
    try:
        get_nr_pdb_list(TMPDIR)
    finally:
        # always remove the working directory, even if processing failed
        cleanup_tmpdir(TMPDIR)
Пример #2
0
def get_tableau_from_pdbstruct(pdbid, domain,
                               pdb_structure, ptnode_list):
    """
    Build a PTTableau object for the tableau by first creating a
    simple PDB file with only the ATOM records for residues in the
    domain we are processing, and also a .SSEsInfo file containing the
    secondary structure assignments we already have, then running
    TableauCreator on it (using our simple PDB file and SSEsInfo) and
    parsing the output.

    Parameters:
        pdbid - PDB identifier of the structure
        domain - The PTDomain object for our current domain
        pdb_structure - parsed Bio.PDB structure
        ptnode_list - list of PTNode objects (ie iterable of PTNode)
                         representing the SSEs (helices,strands) the
                         tableau is for.
    Return value:
       PTTableau object built from TableauCreator output
       (whatever read_tableau_from_tableaucreator() returns).

    """
    # Function-scope import so this block does not depend on the module's
    # import list.
    import tempfile

    # mkdtemp() creates the directory atomically; the previous
    # os.tempnam() + os.mkdir() pair left a window in which another
    # process could claim the name.
    TMPDIR = tempfile.mkdtemp(prefix="pttabin")
    try:
        # file is named <pdbid>[-<domainid>].pdb inside the temp directory
        filename = pdbid
        if domain.domainid is not None:
            filename += '-' + domain.domainid
        filename += '.pdb'
        domain_pdb_filename = os.path.join(TMPDIR, filename)

        # write only the part of the structure selected by DomainSelect
        io = PDBIO()
        io.set_structure(pdb_structure)
        io.save(domain_pdb_filename, DomainSelect(domain))

        ssesinfo_filename = os.path.join(TMPDIR, filename + ".input-SSEsInfo")
        write_ssesinfo(ssesinfo_filename, ptnode_list)

        tableau = read_tableau_from_tableaucreator(domain_pdb_filename,
                                                   ptnode_list,
                                                   ssesinfo_filename)
        os.unlink(domain_pdb_filename)
        os.unlink(ssesinfo_filename)
    finally:
        cleanup_tmpdir(TMPDIR)
    return tableau
Пример #3
0
def get_tableau_from_pdbstruct(pdbid, domain, pdb_structure, ptnode_list):
    """
    Build a PTTableau object for the tableau by first creating a
    simple PDB file with only the ATOM records for residues in the
    domain we are processing, and also a .SSEsInfo file containing the
    secondary structure assignments we already have, then running
    TableauCreator on it (using our simple PDB file and SSEsInfo) and
    parsing the output.

    Parameters:
        pdbid - PDB identifier of the structure
        domain - The PTDomain object for our current domain
        pdb_structure - parsed Bio.PDB structure
        ptnode_list - list of PTNode objects (ie iterable of PTNode)
                         representing the SSEs (helices,strands) the
                         tableau is for.
    Return value:
       PTTableau object built from TableauCreator output
       (whatever read_tableau_from_tableaucreator() returns).

    """
    # Function-scope import so this block does not depend on the module's
    # import list.
    import tempfile

    # Create the scratch directory atomically instead of generating a name
    # with os.tempnam() and creating it afterwards (race-prone).
    TMPDIR = tempfile.mkdtemp(prefix="pttabin")
    try:
        # <pdbid>[-<domainid>].pdb
        name_parts = [pdbid]
        if domain.domainid is not None:
            name_parts.append('-' + domain.domainid)
        name_parts.append('.pdb')
        filename = ''.join(name_parts)
        domain_pdb_filename = os.path.join(TMPDIR, filename)

        # save only what DomainSelect accepts for this domain
        io = PDBIO()
        io.set_structure(pdb_structure)
        io.save(domain_pdb_filename, DomainSelect(domain))

        ssesinfo_filename = os.path.join(TMPDIR, filename + ".input-SSEsInfo")
        write_ssesinfo(ssesinfo_filename, ptnode_list)

        tableau = read_tableau_from_tableaucreator(domain_pdb_filename,
                                                   ptnode_list,
                                                   ssesinfo_filename)
        os.unlink(domain_pdb_filename)
        os.unlink(ssesinfo_filename)
    finally:
        cleanup_tmpdir(TMPDIR)
    return tableau
Пример #4
0
def get_tableaux(pdb_filename,
                 secstruct_program='dssp',
                 domain_program='none',
                 include_310_helices=True,
                 include_pi_helices=True,
                 sse_id_list=None,
                 min_sse_len=None,
                 use_numeric=False,
                 use_hk=False,
                 build_dist_matrix=False):
    """
    Get a tableau for a single PDB or ASTRAL pdb-style file
    (compressed files e.g. pdb1qlp.ent.gz) or uncompressed
    or the ASTRAL pdb-style hierarchy
    (uncompressed files e.g. d1qlpa_.ent).

    Parameters:
       pdb_filename - filename of PDB or ASTRAL pdb-style file, as above.
       secstruct_program - secondary structure definition program
                       ('stride' or 'dssp' or 'pdb') to use.
       domain_program - domain decomposition method ('ddomain','cath', etc.)
       include_310_helices - if True, include 3_10 helices in the graph
       include_pi_helices - if True, include pi helices in the graph
       sse_id_list - list of ints representing SSE sequential id numbers
                     to include in tableau. Default None.
                     When None, all SSEs are included.
       min_sse_len - min number of residues in SSE to be included.
                      Default None (no min length).
       use_numeric - if True build Numeric.array Omega matrix not PTTableau
       use_hk - If True build tableaux with HH and KK codes for strands in
                same sheet. default False.
       build_dist_matrix - If True, build SSE midpoint distance matrices
                   instead of tableaux.


    Return value:
        tuple (pdbid, tableaux_list, sse_string_list)
         from the pdb file, only one in lists unless
         domain decomposition is used and finds multidomains in input.
         tableaux_list is list of tableaux or omega matrices
         sse_string_list is SSE string description e.g. 'EEHHE' etc.
    """
    # Function-scope imports so this block does not depend on the module's
    # import list.
    import subprocess
    import sys
    import tempfile

    tableaux_list = []
    pdb_file_basename = os.path.basename(pdb_filename)
    (name, extension) = os.path.splitext(pdb_file_basename)

    TMPDIR = None  # set only when we uncompress into a temp directory
    try:
        # check for compressed files. We only support gzip (.gz)
        # Note we are not using the zlib or GzipFile python modules
        # since we are calling to external programs which require the
        # file uncompressed themselves anyway so we'll just run gzip
        # to uncompress the file to a temporary directory.
        if extension == '.gz':
            # mkdtemp() creates the directory atomically (os.tempnam()
            # followed by os.mkdir() was race-prone).
            TMPDIR = tempfile.mkdtemp(prefix="ptgz")
            our_pdb_filename = os.path.join(TMPDIR, name)
            # Run gzip with an argument list rather than a shell command
            # string so filenames containing spaces or shell
            # metacharacters are handled correctly.
            gzip_status = -1
            tmp_fh = open(our_pdb_filename, "wb")
            try:
                try:
                    gzip_status = subprocess.call(
                        ["gzip", "-d", "-c", pdb_filename], stdout=tmp_fh)
                except OSError:
                    pass  # gzip could not be started; reported just below
            finally:
                tmp_fh.close()
            if gzip_status != 0:
                # as before, a gzip failure is not fatal here; the parser
                # below gets whatever was written
                sys.stderr.write("WARNING: gzip failed to uncompress "
                                 + pdb_filename + "\n")
        else:
            our_pdb_filename = pdb_filename

        # names like pdb1qlp.ent give the id 1QLP; anything else is used
        # unchanged
        pdbid = name
        if len(pdbid) >= 6 and pdbid[:3].upper() == "PDB":
            pdbid = pdbid[3:7].upper()
        # parse PDB file
        pdb_parser = PDBParser()
        pdb_struct = pdb_parser.get_structure(pdbid, our_pdb_filename)
        # create the Tableaux and output them
        (tableaux_list, sse_string_list) = make_tableaux(
            our_pdb_filename, pdb_struct, secstruct_program, domain_program,
            include_310_helices, include_pi_helices, use_numeric, sse_id_list,
            use_hk, min_sse_len, build_dist_matrix)

    finally:
        if TMPDIR is not None:
            cleanup_tmpdir(TMPDIR)
    return (pdbid, tableaux_list, sse_string_list)
Пример #5
0
def read_tableau_from_tableaucreator(pdb_filename, ptnode_list,
                                     ssesinfo_filename):
    """
    Run Arun's TableauCreator program on the supplied pdb_filename
    using SSEsInfo file.

    Parameters:
       pdb_filename - PDB file to run TableauCreator on
       ptnode_list - list of PTNode objects (ie iterable of PTNode)
                         representing the SSEs (helices,strands) the
                         tableau is for.
       ssesinfo_filename - filename of the .SSEsInfo file that was written
                           to define SSEs for TableauCreator.
    Return value:
       PTTableau object built from TableauCreator output, or None if
       TableauCreator failed or its output could not be used.

    NB: TableauCreator is not yet published or available (October 2007)
    and I am using a private version which Arun sent me, which I modified
    to add the -s option to use STRIDE rather than DSSP
    and to have the -i option to parse .SSEsInfo files.
    """
    # Function-scope imports so this block does not depend on the module's
    # import list.
    import subprocess
    import tempfile

    # TableauCreator needs an output directory where it writes all its
    # intermediate/output files, only puts progress information/errors
    # to stdout/stderr.
    # mkdtemp() creates that directory atomically (os.tempnam() followed by
    # os.mkdir() was race-prone).
    tmpdir = tempfile.mkdtemp(prefix="pttab")
    try:
        # Argument list instead of a shell command string, so filenames
        # with spaces or shell metacharacters are passed through intact.
        argv = ["TableauCreator", "-i", ssesinfo_filename,
                pdb_filename, tmpdir]
        if verbose:
            sys.stderr.write("running '" + " ".join(argv)
                             + " >/dev/null'...")
        devnull = open(os.devnull, "w")
        try:
            try:
                subprocess.call(argv, stdout=devnull)
            except OSError:
                # program could not be started; the missing TABCREATE_OK
                # marker below reports the failure
                pass
        finally:
            devnull.close()
        if verbose:
            sys.stderr.write("done\n")

        # output files are:
        #   <pdbfilename>.angles
        #   <pdbfilename>.SSEsInfo
        #   <pdbfilename>.stride or <pdbfilename>.dssp
        #   <pdbfilename>.tableau
        outfile_prefix = os.path.join(tmpdir, os.path.basename(pdb_filename))
        if not os.path.isfile(os.path.join(tmpdir, "TABCREATE_OK")):
            sys.stderr.write("ERROR: TableauCreator failed\n")
            return None

        # Now the tricky thing is TableauCreator indexes its matrix just with
        # purely sequential numbers from 0 (as conventional)
        # assuming all SSEs in one domain and in fact one chain
        # (so we handle this by creating our own simple PDB file with only
        # ATOM records for our current domain, and only one TER record on
        # end so chains concatenated effectively).
        # And also (as in comments above functions) we have the dodginess of
        # doing the same thing in different ways in multiple places
        # (DSSP/STRIDE parsing, PDB parsing, etc.).
        # So let's check that the TableauCreator SSE info lines up with ours
        # (otherwise we can't use the tableau data).

        # parse the SSEsInfo file and check lines up with ptnodes,
        # returns list of ptnodes corresponding to Tableau entries (may be
        # shorter than our input node list; some removed as no equivalent
        # in tableau).
        nodelist = parse_tableaucreator_ssesinfo(outfile_prefix + '.SSEsInfo',
                                                 ptnode_list)
        if nodelist is None:
            sys.stderr.write('WARNING: problem with TableauCreator output;\n'
                             '         tableau information will not be used\n')
            return None

        tableau = parse_tableaucreator_output(outfile_prefix + ".tableau",
                                              nodelist)
        if tableau is None:
            sys.stderr.write(
                'WARNING: problem parsing TableauCreator output;\n'
                '         tableau information will not be used\n')
        elif verbose:
            sys.stderr.write(str(tableau))
        return tableau
    finally:
        # runs on every exit path, including exceptions from the parsers,
        # so the temp directory is no longer leaked on error
        cleanup_tmpdir(tmpdir)
Пример #6
0
def read_domains_from_ddomain(pdb_filename, pdb_model, chainid=None):
    """
    Use the DDOMAIN program to parse the structure from a PDB file into
    domains and return the corresponding list of PTDomain objects.

    DDOMAIN is described in

    Zhou, Xue, Zhou 2007 'DDOMAIN: Dividing structures into domains using a
    normalized domain-domain interaction profile' Protein Science 16:947-955.

    It is available as a 64-bit linux executable and FORTRAN-77 source code
    from http://sparks.informatics.iupui.edu/Resource_files/DDOMAIN.tar.gz

    Parameters:
       pdb_filename - filename of PDB file to run DDOMAIN on
       pdb_model  - Bio.PDB model struct for this PDB entry. Note that this
                    is only needed in the case that a DDomain domain has
                    different chain identifiers for start and end and
                    is then used just to find last residue number in chain.
       chainid - (default None). If not None, only the specified chain
                 is requested.

    Return value:
       List of PTDomain objects, one for each domain.

       NOTE: if there is only one domain, we will return a list with a single
       PTDomain with all data None, signifying a single domain protein
       with no further information.
       This is mainly because of when
       there are multiple chains, in which case the single domain is reported
       by DDOMAIN as having a different chain id for start and end. If there
       is a single domain we really don't want to do anything special, so
       it is better to just have it as a special case where no domain
       processing is done.

       The same single-PTDomain list is returned (with a warning on
       stderr, DDomain not run) when the filename is neither <id>.pdb
       nor pdb<id>.ent.

    """
    # Function-scope imports so this block does not depend on the module's
    # import list.
    import subprocess
    import tempfile

    # DDOMAIN needs the PDB file in its working directory, and it reads
    # the PDB code (e.g. 1QLP for PDB file 1QLP.pdb) from stdin
    # (optionally with chain suffix).
    # Note it requires this filename format, so for format like pdb1qlp.ent
    # we need to rename the file for DDOMAIN to 1QLP.pdb

    # This is nasty, but otherwise have to modify DDOMAIN FORTRAN-77 source
    # so that's even more hassle to have to have a custom version (like we
    # did with STRIDE).
    # So we'll work in a temp directory, make a symlink (TODO: only UNIX
    # allows this, maybe should actually copy file so works on other
    # platforms) and run DDOMAIN there.

    # Work out the identifier and link name first, so an unsupported
    # filename is rejected before any temp directory is created.
    pdb_file_basename = os.path.basename(pdb_filename)
    (name, extension) = os.path.splitext(pdb_file_basename)
    if extension.lower() == '.pdb':   # e.g. 1QLP.pdb
        pdb_identifier = name
        link_name = pdb_file_basename
    elif extension == '.ent' and name[:3].lower() == 'pdb':
        # e.g. pdb1qlp.ent, link it under the name 1QLP.pdb
        pdb_identifier = name[3:7].upper()
        link_name = pdb_identifier + '.pdb'
    else:
        sys.stderr.write('WARNING: unknown PDB filename format "'
                         + pdb_file_basename + '"\n')
        sys.stderr.write('  Not running DDomain\n')
        return [PTDomain(None, None)]  # one-domain protein, no further info

    oldcwd = os.getcwd()
    # mkdtemp() creates the directory atomically (os.tempnam() followed by
    # os.mkdir() was race-prone).
    TMPDIR = tempfile.mkdtemp(prefix="ptdd")
    symlink_path = os.path.join(TMPDIR, link_name)
    try:
        os.symlink(os.path.abspath(pdb_filename), symlink_path)

        os.chdir(TMPDIR)
        if verbose:
            sys.stderr.write("running DDomain...")
        # subprocess replacement for the deprecated os.popen2("DDomain");
        # shell=True keeps the old lookup/failure behaviour and is safe
        # here because the command is a constant string.
        # universal_newlines=True gives text-mode pipes.
        ddomain = subprocess.Popen("DDomain", shell=True,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   close_fds=True,
                                   universal_newlines=True)
        if chainid is not None:
            pdbchainid = pdb_identifier + chainid
        else:
            pdbchainid = pdb_identifier
        ddomain.stdin.write(pdbchainid + '\n')
        ddomain.stdin.close()
        domain_list = parse_ddomain_output(ddomain.stdout, pdb_model)
        ddomain.stdout.close()
        ddomain.wait()  # reap the child so it does not linger as a zombie
        if verbose:
            sys.stderr.write("done\n")
    finally:
        # only unlink if the link was actually created, so a failure in
        # os.symlink() is not masked by a second error here
        if os.path.islink(symlink_path):
            os.unlink(symlink_path)
        os.chdir(oldcwd)
        cleanup_tmpdir(TMPDIR)
    return domain_list
def get_tableaux(
    pdb_filename,
    secstruct_program="dssp",
    domain_program="none",
    include_310_helices=True,
    include_pi_helices=True,
    sse_id_list=None,
    min_sse_len=None,
    use_numeric=False,
    use_hk=False,
    build_dist_matrix=False,
):

    """
    Get a tableau for a single PDB or ASTRAL pdb-style file
    (compressed files e.g. pdb1qlp.ent.gz) or uncompressed
    or the ASTRAL pdb-style hierarchy
    (uncompressed files e.g. d1qlpa_.ent).

    Parameters:
       pdb_filename - filename of PDB or ASTRAL pdb-style file, as above.
       secstruct_program - secondary structure definition program
                       ('stride' or 'dssp' or 'pdb') to use.
       domain_program - domain decomposition method ('ddomain','cath', etc.)
       include_310_helices - if True, include 3_10 helices in the graph
       include_pi_helices - if True, include pi helices in the graph
       sse_id_list - list of ints representing SSE sequential id numbers
                     to include in tableau. Default None.
                     When None, all SSEs are included.
       min_sse_len - min number of residues in SSE to be included.
                      Default None (no min length).
       use_numeric - if True build Numeric.array Omega matrix not PTTableau
       use_hk - If True build tableaux with HH and KK codes for strands in
                same sheet. default False.
       build_dist_matrix - If True, build SSE midpoint distance matrices
                   instead of tableaux.


    Return value:
        tuple (pdbid, tableaux_list, sse_string_list)
         from the pdb file, only one in lists unless
         domain decomposition is used and finds multidomains in input.
         tableaux_list is list of tableaux or omega matrices
         sse_string_list is SSE string description e.g. 'EEHHE' etc.
    """
    # Function-scope imports so this block does not depend on the module's
    # import list.
    import subprocess
    import sys
    import tempfile

    tableaux_list = []
    pdb_file_basename = os.path.basename(pdb_filename)
    (name, extension) = os.path.splitext(pdb_file_basename)

    TMPDIR = None  # stays None unless a temp directory is really created
    try:
        # check for compressed files. We only support gzip (.gz)
        # Note we are not using the zlib or GzipFile python modules
        # since we are calling to external programs which require the
        # file uncompressed themselves anyway so we'll just run gzip
        # to uncompress the file to a temporary directory.
        if extension == ".gz":
            # Atomic creation; os.tempnam() + os.mkdir() was race-prone.
            TMPDIR = tempfile.mkdtemp(prefix="ptgz")
            our_pdb_filename = os.path.join(TMPDIR, name)
            # Pass gzip an argument list (no shell) so unusual characters
            # in the filename cannot break or alter the command.
            gzip_status = -1
            tmp_fh = open(our_pdb_filename, "wb")
            try:
                try:
                    gzip_status = subprocess.call(
                        ["gzip", "-d", "-c", pdb_filename], stdout=tmp_fh
                    )
                except OSError:
                    pass  # gzip could not be started; reported just below
            finally:
                tmp_fh.close()
            if gzip_status != 0:
                # not fatal, as before: the parser below gets whatever
                # gzip managed to write
                sys.stderr.write(
                    "WARNING: gzip failed to uncompress " + pdb_filename + "\n"
                )
        else:
            our_pdb_filename = pdb_filename

        # names like pdb1qlp.ent give the id 1QLP; anything else is used
        # unchanged
        pdbid = name
        if len(pdbid) >= 6 and pdbid[:3].upper() == "PDB":
            pdbid = pdbid[3:7].upper()
        # parse PDB file
        pdb_parser = PDBParser()
        pdb_struct = pdb_parser.get_structure(pdbid, our_pdb_filename)
        # create the Tableaux and output them
        (tableaux_list, sse_string_list) = make_tableaux(
            our_pdb_filename,
            pdb_struct,
            secstruct_program,
            domain_program,
            include_310_helices,
            include_pi_helices,
            use_numeric,
            sse_id_list,
            use_hk,
            min_sse_len,
            build_dist_matrix,
        )

    finally:
        if TMPDIR is not None:
            cleanup_tmpdir(TMPDIR)
    return (pdbid, tableaux_list, sse_string_list)
Пример #8
0
        try:
            exit_status = 0
            # parse PDB file - only needed for DDOMAIN when segment spans chains
            pdb_parser = PDBParser()
            pdb_struct = pdb_parser.get_structure(pdbid, our_pdb_filename)

            test_domains = get_domains(test_domain_method, pdbid,
                                       our_pdb_filename, pdb_struct)
            ref_domains = get_domains(ref_domain_method, pdbid,
                                      our_pdb_filename, pdb_struct)
            if print_domains:
                print test_domain_method
                write_domains(sys.stdout, test_domains)
                print ref_domain_method
                write_domains(sys.stdout, ref_domains)
            print domeval.domain_eval(test_domains, ref_domains)
        except NotInCATH_Exception,ex_pdbid:
            sys.stderr.write(str(ex_pdbid) + " not found in CATH CDF file\n")
            exit_status = 1
        finally:
            if used_tmp_file:
                cleanup_tmpdir(TMPDIR)

    sys.exit(exit_status)

            
# Script entry point: only runs when the file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    # Ignore RuntimeWarnings whose message starts with 'tempnam'
    # (presumably the security warning Python 2 issues for os.tempnam(),
    # which this script uses to name its temporary directory).
    warnings.filterwarnings('ignore', 'tempnam', RuntimeWarning) 
    main()
Пример #9
0
def main():
    """
    main for pytableaucreate.py

    Usage: pytableaucreate [-35knefuv] [-d|-b] [-t structprog] [-p domainprog]
                [-a domainid]
                [-s sse_num_list] [-c chainid] [-m min_sse_len]
                [-i identifier] [-o savefile] <PDBfile>


    -3 specifies to include 3_10 helices in the diagram. Default is only
       alpha helices.

    -5 specifies to include pi helices in the diagram. Default is only
       alpha helices.

    -k use the HH and KK codes for respectively antiparallel and parallel
       strands in the same sheet, rather than the O, P etc. codes.

    -n output a numeric omega matrix instead of tableau.

    -e output numeric tableau angles in degrees, in the original
       TableauCreator .angles file format, with number of entries on
       first line, SSE sequence description on second line (E/H), then
       (full) matrix with angles in degrees (rather than radians).
       For distance matrix, same format with distances between SSEs
       in Angstroms.

    -f output the matrix in 'FORTRAN style' lower triangle with
       header line suitable for input to TMATN.

    -d build SSE axis midpoint distance matrix rather than tableau.

    -b build both the tableau and distance matrix and output together,
       for use with tsrchd etc. for example. If -u is used to permute
       the matrices, they are permuted the same way so they are still
       consistent.

    -p specify the domain decomposition method.
       Valid values are 'none' (default), 'ddomain', 'cath:cdf_filename'.

    -a domainid : only output specified domain

    -t specifies the secondary structure assignment program to use.
       Currently supported is 'pdb' and 'dssp' and 'stride' or 'pmml'.
       Default 'pdb'.

    -s sse_num_list specifies a comma-separated
       list of SSE sequential ids to build the
       tableau for. SSE sequential id's start at 1 and go from N to C
       terminus. E.g. -s1,5,8 includes only the 1st, 5th and 8th SSEs.
       Numbers do not restart at chains (but do restart in each domain).
       These numbers are those assigned by 'ptgraph2 -b sequential' option.

       TODO: this currently does not make sense when multiple domains
       are being processed, this option applies to each domain.

    -c chainid : specify chain identifier; only build tableau for that chain

    -m min_sse_len : minimum number of residues in SSE for it to be included

    -i identifier : when using fortran format (-f), specify the identifier
       to use in the output rather than deriving it from the filename

    -o savefile : save tableau in packed format for use in other
       programs, such as tabsearchqpml.py
       WARNING: savefile is overwritten if it exists

       TODO: this currently does not make sense when multiple domains
       are being processed, this option only saves first domain.

    -u randomly permute the rows+cols (symmetric) of the tableau/distance
       matrix. Writes the permutation vector in form
       permutation = i,j,..,m
       e.g.
       permutation = 3,1,2,4
       as first line of output before identifier information and tableau

    -v specifies verbose mode: debugging output is written to stderr.

    NB: usage() is defined elsewhere in the file; this function relies
    on it not returning (presumably it prints a message and exits).
    """
    global verbose

    # Function-scope imports so this block does not depend on the module's
    # import list.
    import subprocess
    import tempfile

    try:
        opts, args = getopt.getopt(sys.argv[1:], "35bdfknep:a:t:s:c:m:i:o:uv?")
    except getopt.GetoptError:
        usage(os.path.basename(sys.argv[0]))

    valid_secstruct_programs = ["dssp", "stride", "pdb", "pmml"]
    # -p values are matched as regular expressions (e.g. 'cath:<file>')
    valid_domain_programs = getdomains.valid_domain_programs + [r"none"]
    valid_domain_programs_re = [ re.compile(re_str) for re_str in
                                 valid_domain_programs ]

    verbose = False # global (python globals are only 'global' to module though)
    secstruct_program = "pdb"
    include_310_helices = False
    include_pi_helices = False
    domain_program = "none"
    sse_id_list = None
    use_numeric = False
    use_hk = False
    savefilename = None
    min_sse_len = None
    fortran_format = False
    build_distance_matrix = False
    chainid = None
    fident = None
    do_shuffle = False
    build_both = False # both tableau and dist matrix
    use_old_format = False # size + SSE chain + degrees omega matrix
    domainid = None

    for opt,arg in opts:
        if opt == "-3":   # include 3_10 helices
            include_310_helices = True
        elif opt == "-5": # include pi helices
            include_pi_helices = True
        elif opt == "-d":  # build SSE midpoint distance matrix not tableau
            build_distance_matrix = True
        elif opt == "-b": # build both tableau and distance matrix
            build_both = True
        elif opt == "-k": # use HH and KK codes
            use_hk = True
        elif opt == "-n": # output numeric matrix not tableau
            use_numeric = True
        elif opt == "-e": # use TableauCreator .angles file format
            use_old_format = True
        elif opt == "-f":  # FORTRAN style format for TMATN
            fortran_format = True
        elif opt == "-p": # domain parsing program
            domain_program = None
            for valid_domarg_re in valid_domain_programs_re:
                if valid_domarg_re.match(arg):
                    domain_program = arg
                    break
            if domain_program is None:
                sys.stderr.write("valid values for -p are: " +
                                 str(valid_domain_programs) + "\n")
                usage(sys.argv[0])
        elif opt == "-a":  # only output tableau for specified domain id
            domainid = arg
        elif opt == "-t":
            if arg not in valid_secstruct_programs:
                sys.stderr.write("valid values for -t are: " +
                                 str(valid_secstruct_programs) + "\n")
                usage(sys.argv[0])
            secstruct_program = arg
        elif opt == "-s":
            sse_id_list = []
            for sse_id_str in arg.split(','):
                if not sse_id_str.isdigit():
                    sys.stderr.write("not a valid SSE sequential number '" +
                                     sse_id_str + "'\n")
                    usage(sys.argv[0])
                sse_id = int(sse_id_str)
                if sse_id in sse_id_list:
                    sys.stderr.write("duplicate SSE sequential number "  +
                                     sse_id_str + "\n")
                    usage(sys.argv[0])
                sse_id_list.append(sse_id)
            sse_id_list.sort() # ensure SSEs are in order
        elif opt == "-c": # chain identifier
            if len(arg) != 1:
                sys.stderr.write("invalid chain identifier for -c option\n")
                usage(sys.argv[0])
            chainid = arg.upper()
        elif opt == "-m": # min sse len
            try:
                min_sse_len = int(arg)
            except ValueError:
                # report bad input like the other options do, rather than
                # dying with a traceback
                sys.stderr.write("not a valid minimum SSE length '" +
                                 arg + "'\n")
                usage(sys.argv[0])
        elif opt == "-i": # identifier to use for fortran format
            fident = arg
        elif opt == "-o": # save tableau in packed format
            savefilename = arg
        elif opt == "-u": # randomly permute the tableau/matrix
            do_shuffle = True
        elif opt == "-v": # verbose
            verbose = True # this module only
            ptnode_set_verbose(True) # ptnode module
            ptsecstruct.ptsecstruct_set_verbose(True) # ptsecstruct module
            ptdomain_set_verbose(True) # ptdomain module
        else:
            usage(sys.argv[0])

    if use_numeric and use_hk:
        sys.stderr.write("-n (numeric) and -k (use HH and KK codes) are "
                         "mutually exlusive\n")
        usage(sys.argv[0])

    if build_distance_matrix and build_both:
        sys.stderr.write("WARNING: both -d (build dist matrix) and -b "
                         "(build both) specified, ignoring -d\n")
        build_distance_matrix = False

    if savefilename and do_shuffle:
        sys.stderr.write('WARNING: saved tableau will not be shuffled\n')

    if build_distance_matrix:
        if use_numeric:
            use_numeric = False
            sys.stderr.write("WARNING: -n (numeric) ignored for -d (distance matrix)\n")
        if use_hk:
            sys.stderr.write("-k (use HH and KK) invalid for -d (distance matrix)\n")
            usage(sys.argv[0])

    if (secstruct_program == "pmml" and
        (min_sse_len is None or min_sse_len < 3)):
        sys.stderr.write("WARNING: PMML can give SSEs of length 1 or 2 causing axis fitting to fail, setting minimum length to 3 as if -m3 were specfified\n")
        min_sse_len = 3

    if fident:
        if not fortran_format:
            sys.stderr.write("-i is only valid with -f\n")
            usage(sys.argv[0])
        elif len(fident) > 8:
            sys.stderr.write("identifier must be 8 chars or less\n")
            usage(sys.argv[0])

    if use_old_format and (build_both or
                           use_hk or use_numeric or fortran_format or
                           do_shuffle or savefilename):
        sys.stderr.write("-e (use old .angles format) is not compatible "
                         "with -b -k or -n or -f or -u or -o\n")
        usage(os.path.basename(sys.argv[0]))

    if len(args) != 1:
        usage(os.path.basename(sys.argv[0]))

    pdb_filename = args[0]

    pdb_file_basename = os.path.basename(pdb_filename)
    (name,extension) = os.path.splitext(pdb_file_basename)

    TMPDIR = None  # set only when we uncompress into a temp directory
    try:
        # check for compressed files. We only support gzip (.gz)
        # Note we are not using the zlib or GzipFile python modules
        # since we are calling to external programs which require the
        # file uncompressed themselves anyway so we'll just run gzip
        # to uncompress the file to a temporary directory.
        if extension == '.gz':
            # mkdtemp() creates the directory atomically (os.tempnam()
            # followed by os.mkdir() was race-prone).
            TMPDIR = tempfile.mkdtemp(prefix="ptgz")
            our_pdb_filename = os.path.join(TMPDIR, name)
            # Argument list instead of a shell command string, so
            # filenames with spaces or shell metacharacters work.
            gzip_status = -1
            tmp_fh = open(our_pdb_filename, "wb")
            try:
                try:
                    gzip_status = subprocess.call(
                        ["gzip", "-d", "-c", pdb_filename], stdout=tmp_fh)
                except OSError:
                    pass  # gzip could not be started; reported just below
            finally:
                tmp_fh.close()
            if gzip_status != 0:
                sys.stderr.write("WARNING: gzip failed to uncompress "
                                 + pdb_filename + "\n")
        else:
            our_pdb_filename = pdb_filename

        if fortran_format and fident:
            pdbid = fident
        else:
            # names like pdb1qlp.ent give 1QLP; a chain suffix is added
            # when -c was given
            pdbid = name.upper()
            if len(pdbid) >= 6 and pdbid[:3] == "PDB":
                pdbid = pdbid[3:7]
            if chainid:
                pdbid += '_' + chainid

        # parse PDB file
        pdb_parser = PDBParser()
        pdb_struct = pdb_parser.get_structure(pdbid, our_pdb_filename)
        # create the Tableaux and output them
        (tableaux_list, ssestr_list) = make_tableaux(our_pdb_filename,
                                      pdb_struct,
                                      secstruct_program,
                                      domain_program,
                                      include_310_helices,
                                      include_pi_helices,
                                      (use_numeric or use_old_format),
                                      sse_id_list,
                                      use_hk,
                                      min_sse_len,
                                      build_distance_matrix,
                                      chainid,
                                      domainid)
        if build_both:
            (distmatrix_list, ssestr_list) = make_tableaux(our_pdb_filename,
                                            pdb_struct,
                                            secstruct_program,
                                            domain_program,
                                            include_310_helices,
                                            include_pi_helices,
                                            use_numeric,
                                            sse_id_list,
                                            use_hk,
                                            min_sse_len,
                                            True, # build_distance_matrix
                                            chainid,
                                            domainid)

        for idx, tableau in enumerate(tableaux_list):
            domain_num = idx + 1
            n = len(tableau)
            # used to permute rows/cols: null permutation. Must be a real
            # list so random.shuffle() can reorder it in place.
            permutation = list(range(n))
            if do_shuffle:
                random.shuffle(permutation) # actually permute for shuffle mode
                if verbose:
                    sys.stderr.write('permutation is: ' + str(permutation)+'\n')
                sys.stdout.write('permutation = ' + ','.join([str(x+1) for x in permutation]) + '\n')
            if domain_num > 1:
                sys.stdout.write('\ndomain ' + str(domain_num) + ':\n')

            if fortran_format:
                sys.stdout.write("%7.7s %4d\n" % (pdbid.upper(), n))

            if use_old_format:
                if build_distance_matrix:
                    write_distmatrix_old_format(n, tableau, ssestr_list[idx])
                else:
                    write_tableau_old_format(n, tableau, ssestr_list[idx])
            else:
                write_tableau(n, tableau, permutation, use_numeric,
                              fortran_format, build_distance_matrix)

            if build_both:
                # same permutation as the tableau so the two stay consistent
                write_tableau(n, distmatrix_list[idx],
                              permutation, use_numeric,
                              fortran_format, True)
    finally:
        if TMPDIR is not None:
            cleanup_tmpdir(TMPDIR)


    if savefilename:
        if not tableaux_list:
            # nothing to save; leave any existing savefile untouched
            sys.stderr.write('WARNING: no tableau built, not writing ' +
                             savefilename + '\n')
            return
        if verbose:
            sys.stderr.write('writing tableau to ' + savefilename +'\n')
        if len(tableaux_list) > 1:
            sys.stderr.write('WARNING: only saving first tableau in list\n')
        fh = open(savefilename, "wb")  # pickle data: write in binary mode
        try:
            if build_distance_matrix or use_numeric:
                # With -d the list holds distance matrices and with -n it
                # holds numeric Omega matrices; either way the first entry
                # is what we save. (The code used to reference the names
                # 'distmatrix' and 'Omega', which are not defined in this
                # function.)
                # Numeric/numpy seems to have no 'packed' format for
                # symmetric matrices, so we just have to dump the whole
                # thing.
                pickle.dump(tableaux_list[0], fh)
            else:
                pickle.dump(PTTableauPacked(tableaux_list[0]), fh)
        finally:
            fh.close()
Пример #10
0
def read_tableau_from_tableaucreator(pdb_filename, ptnode_list,
                                     ssesinfo_filename):
    """
    Run Arun's TableauCreator program on the supplied pdb_filename
    using SSEsInfo file.

    Parameters:
       pdb_filename - PDB file to run TableauCreator on
       ptnode_list - list of PTNode objects (ie iterable of PTNode)
                         representing the SSEs (helices,strands) the
                         tableau is for.
       ssesinfo_filename - filename of the .SSEsInfo file that was written
                           to define SSEs for TableauCreator.
    Return value:
       PTTableau object built from TableauCreator output, or None if
       TableauCreator failed or its output could not be parsed / matched
       against ptnode_list (a message is written to stderr in those cases).

    Uses the module global 'verbose' to decide whether to write progress
    information to stderr.

    NB: TableauCreator is not yet published or available (October 2007)
    and I am using a private version which Arun sent me, which I modified
    to add the -s option to use STRIDE rather than DSSP
    and to have the -i option to parse .SSEsInfo files.
    """
    # Imported locally so this function does not rely on the module-level
    # import list already containing these standard library modules.
    import subprocess
    import tempfile

    # TableauCreator needs an output directory where it writes all its
    # intermediate/output files, only puts progress information/errors
    # to stdout/stderr.
    # mkdtemp() creates the directory securely in one step; os.tempnam()
    # followed by os.mkdir() leaves a window in which another process can
    # claim the name.
    tmpdir = tempfile.mkdtemp(prefix="pttab")
    try:
        # Pass the arguments as a list (no shell) so that filenames
        # containing spaces or shell metacharacters are handled literally.
        argv = ["TableauCreator", "-i", ssesinfo_filename,
                pdb_filename, tmpdir]

        # The equivalent shell command line, kept only for the verbose
        # progress message.
        command = "TableauCreator "
        command += "-i " + ssesinfo_filename + " "
        command += pdb_filename + " " + tmpdir
        command += " >/dev/null"
        if verbose:
            sys.stderr.write("running '" + command + "'...")

        devnull = open(os.devnull, "w")
        try:
            try:
                # stdout is discarded; stderr is left attached to ours.
                # The exit status is not used: success is detected via
                # the TABCREATE_OK marker file below.
                subprocess.call(argv, stdout=devnull)
            except OSError:
                # Program could not be started at all (e.g. not on PATH);
                # fall through to the TABCREATE_OK check which reports
                # the failure.
                sys.stderr.write("could not execute TableauCreator: " +
                                 str(sys.exc_info()[1]) + "\n")
        finally:
            devnull.close()
        if verbose:
            sys.stderr.write("done\n")

        # output files are:
        #   <pdbfilename>.angles
        #   <pdbfilename>.SSEsInfo
        #   <pdbfilename>.stride or <pdbfilename>.dssp
        #   <pdbfilename>.tableau
        outfile_prefix = os.path.join(tmpdir, os.path.basename(pdb_filename))
        if not os.path.isfile(os.path.join(tmpdir, "TABCREATE_OK")):
            sys.stderr.write("ERROR: TableauCreator failed\n")
            return None

        # Now the tricky thing is TableauCreator indexes its matrix just with
        # purely sequential numbers from 0 (as conventional)
        # assuming all SSEs in one domain and in fact one chain
        # (so we handle this by creating our own simple PDB file with only
        # ATOM records for our current domain, and only one TER record on
        # end so chains concatenated effectively).
        # And also (as in comments above functions) we have the dodginess of
        # doing the same thing in different ways in multiple places
        # (DSSP/STRIDE parsing, PDB parsing, etc.).
        # So let's check that the TableauCreator SSE info lines up with ours
        # (otherwise we can't use the tableau data).

        # parse the SSEsInfo file and check lines up with ptnodes,
        # returns list of ptnodes corresponding to Tableau entries (may be
        # shorter than our input node list; some removed as no equivalent
        # in tableau).
        nodelist = parse_tableaucreator_ssesinfo(outfile_prefix + '.SSEsInfo',
                                                 ptnode_list)
        if nodelist is None:
            sys.stderr.write('WARNING: problem with TableauCreator output;\n'
                             '         tableau information will not be used\n')
            return None

        tableau_filename = outfile_prefix + ".tableau"
        tableau = parse_tableaucreator_output(tableau_filename, nodelist)
        if tableau is None:
            sys.stderr.write('WARNING: problem parsing TableauCreator output;\n'
                         '         tableau information will not be used\n')
        elif verbose:
            sys.stderr.write(str(tableau))
        return tableau
    finally:
        # Always remove the temporary directory, including when one of the
        # parsers raises.
        cleanup_tmpdir(tmpdir)
Пример #11
0
def main():
    """
    main for pytableaucreate.py

    Usage: pytableaucreate [-35knefuv] [-d|-b] [-t structprog] [-p domainprog]
                [-a domainid]
                [-s sse_num_list] [-c chainid] [-m min_sse_len]
                [-i identifier] [-o savefile] <PDBfile>


    -3 specifies to include 3_10 helices in the diagram. Default is only
       alpha helices.

    -5 specifies to include pi helices in the diagram. Default is only
       alpha helices.

    -k use the HH and KK codes for respectively antiparallel and parallel
       strands in the same sheet, rather than the O, P etc. codes.

    -n output a numeric omega matrix instead of tableau.

    -e output numeric tableau angles in degrees, in the original
       TableauCreator .angles file format, with number of entries on
       first line, SSE sequence description on second line (E/H), then
       (full) matrix with angles in degrees (rather than radians).
       For distance matrix, same format with distances between SSEs
       in Angstroms.

    -f output the matrix in 'FORTRAN style' lower triangle with
       header line suitable for input to TMATN.

    -d build SSE axis midpoint distance matrix rather than tableau.

    -b build both the tableau and distance matrix and output together,
       for use with tsrchd etc. for example. If -u is used to permute
       the matrices, they are permuted the same way so they are still
       consistent.

    -p specify the domain decomposition method.
       Valid values are 'none' (default), 'ddomain', 'cath:cdf_filename'.

    -a domainid : only output specified domain

    -t specifies the secondary structure assignment program to use.
       Currently supported is 'pdb' and 'dssp' and 'stride' or 'pmml'.
       Default 'pdb'.

    -s sse_num_list specifies a comma-separated
       list of SSE sequential ids to build the
       tableau for. SSE sequential id's start at 1 and go from N to C
       terminus. E.g. -s1,5,8 includes only the 1st, 5th and 8th SSEs.
       Numbers do not restart at chains (but do restart in each domain).
       These numbers are those assigned by 'ptgraph2 -b sequential' option.

       TODO: this currently does not make sense when multiple domains
       are being processed, this option applies to each domain.

    -c chainid : specify chain identifier; only build tableau for that chain

    -m min_sse_len : minimum number of residues in SSE for it to be included

    -i identifier : when using fortran format (-f), specify the identifier
       to use in the output rather than deriving it from the filename

    -o savefile : save tableau in packed format for use in other
       programs, such as tabsearchqpml.py
       WARNING: savefile is overwritten if it exists

       TODO: this currently does not make sense when multiple domains
       are being processed, this option only saves first domain.

    -u randomly permute the rows+cols (symmetric) of the tableau/distance
       matrix.
       writes the permutation vector in form
       permutation = i,j,..,m
       e.g.
       permutation = 3,1,2,4
       as first line of output before identifier information and tableau

    -v specifies verbose mode: debugging output is written to stderr.
    """
    global verbose

    # Imported locally so this function does not rely on the module-level
    # import list already containing these standard library modules.
    import subprocess
    import tempfile

    try:
        opts, args = getopt.getopt(sys.argv[1:], "35bdfknep:a:t:s:c:m:i:o:uv?")
    except getopt.GetoptError:
        usage(os.path.basename(sys.argv[0]))

    valid_secstruct_programs = ["dssp", "stride", "pdb", "pmml"]
    valid_domain_programs = getdomains.valid_domain_programs + [r"none"]
    valid_domain_programs_re = [
        re.compile(re_str) for re_str in valid_domain_programs
    ]

    verbose = False  # global (python globals are only 'global' to module though)
    secstruct_program = "pdb"
    include_310_helices = False
    include_pi_helices = False
    domain_program = "none"
    sse_id_list = None
    use_numeric = False
    use_hk = False
    savefilename = None
    min_sse_len = None
    fortran_format = False
    build_distance_matrix = False
    chainid = None
    fident = None
    do_shuffle = False
    build_both = False  # both tableau and dist matrix
    use_old_format = False  # size + SSE chain + degrees omega matrix
    domainid = None

    for opt, arg in opts:
        if opt == "-3":  # include 3_10 helices
            include_310_helices = True
        elif opt == "-5":  # include pi helices
            include_pi_helices = True
        elif opt == "-d":  # build SSE midpoint distance matrix not tableau
            build_distance_matrix = True
        elif opt == "-b":  # build both tableau and distance matrix
            build_both = True
        elif opt == "-k":  # use HH and KK codes
            use_hk = True
        elif opt == "-n":  # output numeric matrix not tableau
            use_numeric = True
        elif opt == "-e":  # use TableauCreator .angles file format
            use_old_format = True
        elif opt == "-f":  # FORTRAN style format for TMATN
            fortran_format = True
        elif opt == "-p":  # domain parsing program
            # each valid value is a regular expression matched against arg
            domain_program = None
            for valid_domarg_re in valid_domain_programs_re:
                if valid_domarg_re.match(arg):
                    domain_program = arg
                    break
            if domain_program is None:
                sys.stderr.write("valid values for -p are: " +
                                 str(valid_domain_programs) + "\n")
                usage(sys.argv[0])
        elif opt == "-a":  # only output tableau for specified domain id
            domainid = arg
        elif opt == "-t":
            if arg not in valid_secstruct_programs:
                sys.stderr.write("valid values for -t are: " +
                                 str(valid_secstruct_programs) + "\n")
                usage(sys.argv[0])
            secstruct_program = arg
        elif opt == "-s":
            sse_id_list = []
            for sse_id_str in arg.split(','):
                if not sse_id_str.isdigit():
                    sys.stderr.write("not a valid SSE sequential number '" +
                                     sse_id_str + "'\n")
                    usage(sys.argv[0])
                sse_id = int(sse_id_str)
                # compare as integers so e.g. '1' and '01' are duplicates
                if sse_id in sse_id_list:
                    sys.stderr.write("duplicate SSE sequential number " +
                                     sse_id_str + "\n")
                    usage(sys.argv[0])
                sse_id_list.append(sse_id)
            sse_id_list.sort()  # ensure SSEs are in order
        elif opt == "-c":  # chain identifier
            if len(arg) != 1:
                sys.stderr.write("invalid chain identifier for -c option\n")
                usage(sys.argv[0])
            chainid = arg.upper()
        elif opt == "-m":  # min sse len
            try:
                min_sse_len = int(arg)
            except ValueError:
                # report bad input like the other options do, rather than
                # dying with a traceback
                sys.stderr.write("invalid minimum SSE length for -m option\n")
                usage(sys.argv[0])
        elif opt == "-i":  # identifier to use for fortran format
            fident = arg
        elif opt == "-o":  # save tableau in packed format
            savefilename = arg
        elif opt == "-u":  # randomly permute the tableau/matrix
            do_shuffle = True
        elif opt == "-v":  # verbose
            verbose = True  # this module only
            ptnode_set_verbose(True)  # ptnode module
            ptsecstruct.ptsecstruct_set_verbose(True)  # ptsecstruct module
            ptdomain_set_verbose(True)  # ptdomain module
        else:
            usage(sys.argv[0])

    if use_numeric and use_hk:
        sys.stderr.write("-n (numeric) and -k (use HH and KK codes) are "
                         "mutually exlusive\n")
        usage(sys.argv[0])

    if build_distance_matrix and build_both:
        sys.stderr.write("WARNING: both -d (build dist matrix) and -b "
                         "(build both) specified, ignoring -d\n")
        build_distance_matrix = False

    if savefilename and do_shuffle:
        sys.stderr.write('WARNING: saved tableau will not be shuffled\n')

    if build_distance_matrix:
        if use_numeric:
            use_numeric = False
            sys.stderr.write(
                "WARNING: -n (numeric) ignored for -d (distance matrix)\n")
        if use_hk:
            sys.stderr.write(
                "-k (use HH and KK) invalid for -d (distance matrix)\n")
            usage(sys.argv[0])

    if (secstruct_program == "pmml"
            and (min_sse_len is None or min_sse_len < 3)):
        sys.stderr.write(
            "WARNING: PMML can give SSEs of length 1 or 2 causing axis fitting to fail, setting minimum length to 3 as if -m3 were specfified\n"
        )
        min_sse_len = 3

    if fident:
        if not fortran_format:
            sys.stderr.write("-i is only valid with -f\n")
            usage(sys.argv[0])
        elif len(fident) > 8:
            # NOTE(review): the header line below is written with "%7.7s",
            # which keeps only 7 characters - confirm which limit is
            # intended.
            sys.stderr.write("identifier must be 8 chars or less\n")
            usage(sys.argv[0])

    if use_old_format and (build_both or use_hk or use_numeric
                           or fortran_format or do_shuffle or savefilename):
        sys.stderr.write("-e (use old .angles format) is not compatible "
                         "with -b -k or -n or -f or -u or -o\n")
        usage(os.path.basename(sys.argv[0]))

    if len(args) != 1:
        usage(os.path.basename(sys.argv[0]))

    pdb_filename = args[0]

    # check for compressed files. We only support gzip (.gz)
    # Note we are not using the zlib or GzipFile python modules
    # since we are calling to external programs which require the
    # file uncompressed themselves anyway so we'll just run gzip
    # to uncompress the file to a temporary directory.
    pdb_file_basename = os.path.basename(pdb_filename)
    (name, extension) = os.path.splitext(pdb_file_basename)
    used_tmp_file = (extension == '.gz')
    if used_tmp_file:
        # mkdtemp() creates the directory securely in one step, unlike
        # os.tempnam() followed by os.mkdir().
        TMPDIR = tempfile.mkdtemp(prefix="ptgz")
        our_pdb_filename = os.path.join(TMPDIR, name)
    else:
        our_pdb_filename = pdb_filename

    try:
        if used_tmp_file:
            # Uncompress inside the try block so the temporary directory is
            # removed even if this step fails. gzip is run with an argument
            # list (no shell) so that filenames containing spaces or shell
            # metacharacters are handled literally.
            gz_out = open(our_pdb_filename, "wb")
            try:
                try:
                    subprocess.call(["gzip", "-d", "-c", pdb_filename],
                                    stdout=gz_out)
                except OSError:
                    sys.stderr.write("ERROR: could not run gzip: " +
                                     str(sys.exc_info()[1]) + "\n")
                    sys.exit(1)
            finally:
                gz_out.close()

        if fortran_format and fident:
            pdbid = fident
        else:
            pdbid = name.upper()
            # strip the 'PDB' prefix of names like pdb1abc.ent
            if len(pdbid) >= 6 and pdbid[:3] == "PDB":
                pdbid = pdbid[3:7]
            if chainid:
                pdbid += '_' + chainid

        # parse PDB file
        pdb_parser = PDBParser()
        pdb_struct = pdb_parser.get_structure(pdbid, our_pdb_filename)
        # create the Tableaux and output them
        (tableaux_list,
         ssestr_list) = make_tableaux(our_pdb_filename, pdb_struct,
                                      secstruct_program, domain_program,
                                      include_310_helices, include_pi_helices,
                                      (use_numeric or use_old_format),
                                      sse_id_list, use_hk, min_sse_len,
                                      build_distance_matrix, chainid, domainid)
        if build_both:
            (distmatrix_list, ssestr_list) = make_tableaux(
                our_pdb_filename,
                pdb_struct,
                secstruct_program,
                domain_program,
                include_310_helices,
                include_pi_helices,
                use_numeric,
                sse_id_list,
                use_hk,
                min_sse_len,
                True,  # build_distance_matrix
                chainid,
                domainid)

        for domain_index, tableau in enumerate(tableaux_list):
            n = len(tableau)
            # used to permute rows/cols: null permutation.
            # Must be a real list so random.shuffle() can reorder it in place.
            permutation = list(range(n))
            if do_shuffle:
                random.shuffle(permutation)  # actually permute for shuffle mode
                if verbose:
                    sys.stderr.write('permutation is: ' + str(permutation) +
                                     '\n')
                # the permutation is reported 1-based
                sys.stdout.write('permutation = ' +
                                 ','.join([str(x + 1)
                                           for x in permutation]) + '\n')
            if domain_index > 0:
                sys.stdout.write('\ndomain ' + str(domain_index + 1) + ':\n')

            if fortran_format:
                sys.stdout.write("%7.7s %4d\n" % (pdbid.upper(), n))

            if use_old_format:
                if build_distance_matrix:
                    write_distmatrix_old_format(n, tableau,
                                                ssestr_list[domain_index])
                else:
                    write_tableau_old_format(n, tableau,
                                             ssestr_list[domain_index])
            else:
                write_tableau(n, tableau, permutation, use_numeric,
                              fortran_format, build_distance_matrix)

            if build_both:
                # same permutation as the tableau so the two stay consistent
                write_tableau(n, distmatrix_list[domain_index], permutation,
                              use_numeric, fortran_format, True)
    finally:
        if used_tmp_file:
            cleanup_tmpdir(TMPDIR)

    if savefilename:
        if not tableaux_list:
            # nothing was built: do not touch the save file
            sys.stderr.write('WARNING: no tableau to save, ' + savefilename +
                             ' not written\n')
        else:
            if verbose:
                sys.stderr.write('writing tableau to ' + savefilename + '\n')
            if len(tableaux_list) > 1:
                sys.stderr.write(
                    'WARNING: only saving first tableau in list\n')
            if build_distance_matrix or use_numeric:
                # Numeric/numpy seems to have no 'packed' format for symmetric
                # matrices, so we just have to dump the whole thing.
                # make_tableaux() was called with these same flags, so the
                # list entries are presumably the distance / numeric Omega
                # matrices themselves (they are what write_tableau() is
                # given above). Previously this referenced names that are
                # not defined in this function.
                object_to_save = tableaux_list[0]
            else:
                object_to_save = PTTableauPacked(tableaux_list[0])
            # binary mode is valid for every pickle protocol
            fh = open(savefilename, "wb")
            try:
                pickle.dump(object_to_save, fh)
            finally:
                fh.close()