Example #1
def xml(id):
    from hpf.hddb.db import Session, Family
    session = Session()
    family = session.query(Family).get(id)
    filename = "%i.xml" % family.id

    if runtime().opt(GZIP):
        import gzip
        filename = "%s.gz" % filename
        handle = gzip.open(filename,"w")
    else:
        handle = open(filename,"w")

    try:
        doc = FamilyFeatureBuilder(
            lambda: DefaultXMLGenerator(handle,pretty=True),
            lambda handler: StructureFeatureProvider(handler),
            lambda handler: ColumnFeatureProvider(handler),
            lambda handler: IeaFeatureProvider(handler),
            lambda handler: SelectionFeatureProvider(handler)
            )
        doc.buildDocument(family)
    finally:
        handle.close()
        session.close()
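Example #1's conditional-gzip branch is a recurring idiom; a small standalone helper capturing it (a sketch, not part of the project):

import gzip

def open_output(filename, compress=False):
    # Return (handle, final_filename); appends ".gz" and opens with gzip
    # when compress is set, mirroring the branch in xml() above.
    if compress:
        filename = "%s.gz" % filename
        return gzip.open(filename, "w"), filename
    return open(filename, "w"), filename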
Example #2
File: oid.py  Project: bsmithers/hpf
 def __getitem__(self, key):
     from hpf.hddb.db import Session, SequenceAc
     amnh = self._oid_amnh[key]
     session = Session()
     try:
         ac = session.query(SequenceAc).filter(SequenceAc.ac==amnh).first()
         return ac.protein_key if ac else None
     finally:
         session.close()
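The open-query-close pattern above recurs throughout these snippets; a sketch factoring it into a helper (the helper name is an assumption; Session is the hpf.hddb.db factory used above):

def with_session(fn):
    # Run fn(session) against a fresh session, closing it no matter what.
    from hpf.hddb.db import Session
    session = Session()
    try:
        return fn(session)
    finally:
        session.close()

# e.g. the lookup above becomes:
# ac = with_session(lambda s: s.query(SequenceAc).filter(SequenceAc.ac == amnh).first())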
Example #3
File: interpro.py  Project: bsmithers/hpf
 def _make_seqfile(self, ):
     from hpf.hddb.db import Session, Sequence
     session = Session()
     try:
         sequence = session.query(Sequence).get(self.sequence_id)
         if not sequence:
             raise Exception("Getting sequence object from database failed")
         with open(self.sequence_file, 'w') as outhandle:
             outhandle.write(">hpf_seqid|{0}\n".format(sequence.id))
             outhandle.write("{0}\n".format(sequence.sequence))
     finally:
         session.close()
Example #4
File: oid.py  Project: bsmithers/hpf
class OIDImporter(object):
    """
    Import a set of OID files into the database
    """

    def __init__(
        self,
        familyName,
        alignFile,
        alignColcullLog,
        alignSeqcullLog,
        treeFile,
        treeDiagCharsFile,
        codemlFile=None,
        alignFormat="fasta",
        oid_key=None,
    ):
        self.familyName = familyName
        self.treeFile = treeFile
        self.treeDiagCharsFile = treeDiagCharsFile
        self.alignFile = alignFile
        self.alignColcullLog = alignColcullLog
        self.alignSeqcullLog = alignSeqcullLog
        self.codemlFile = codemlFile
        self.alignFormat = alignFormat
        self.oid_key = oid_key

    def merge(self):
        from hpf.hddb.db import Session, Family

        self.session = Session()

        self.family = self.session.query(Family).filter(Family.name == self.familyName).first()
        if not self.family:
            runtime().debug("Creating family", self.familyName)
            self._family()
            self._alignment()
            self._tree()
        else:
            self.alignment = self.family.alignment
            self.tree = self.alignment.tree
            runtime().debug("Found family", self.family.id)

        if not self.family.alignments[0].tree.codeml:
            runtime().debug("Importing codeml")
            self._codeml()
        else:
            runtime().debug("Already found codeml", self.family.alignments[0].tree.codeml.id)

        # Commit the session, close, and finish
        self.session.commit()
        self.session.close()

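    # _index strips an OID taxon label such as "rice#1182215" (optionally
    # with an "N" prefix on the number) down to its numeric protein key.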
    def _index(self, name):
        n = name.split("#")[-1]
        if n.startswith("N"):
            n = n[1:]
        assert n.isdigit()
        return n

    def _tree(self):
        session = self.session

        # # Load the tree file and rename the taxa.
        # from Bio.Nexus.Nexus import Nexus
        # nex=Nexus(self.treeFile)
        # self.nexus = nex.trees[0]

        from Bio.Nexus.Trees import Tree as NewickTree

        with open(self.treeFile) as handle:
            tree_str = handle.read()
        self.nexus = NewickTree(tree_str)

        # Rename all the taxa.
        for id in self.nexus.get_terminals():
            node = self.nexus.node(id)
            node.data.taxon = self._index(node.data.taxon)

        # Create the DB object
        from hpf.hddb.db import Tree

        self.tree = Tree(
            alignment_key=self.alignment.id,
            text=self.nexus.to_string(plain=False, plain_newick=True),
            filename=self.treeFile,
        )
        session.add(self.tree)
        session.flush()

        # Now add in the node references
        self.nexus.name = self.tree.id
        assert self.tree.id != None
        runtime().debug("Added tree", self.tree)
        from hpf.hddb.db import TreeNodeFactory

        nodes = list(TreeNodeFactory().create(self.nexus))
        for node in nodes:
            node.ancestor_node = node.ancestor.id if node.ancestor else None
            # This should add the new object into the session
            self.tree.nodes.append(node)
            # session.add(node)
            session.flush()

        runtime().debug("Appended", len(nodes), "tree nodes")
        session.flush()

        # Now import the diagnostic characters and reference the nodes.
        from hpf.amnh.oid import DiagCharsParser
        from hpf.hddb.db import TreeFactory

        biotree = TreeFactory(name_func=lambda node: str(node.id)).create(self.tree.nodes, self.tree.id)
        parser = DiagCharsParser(biotree)
        runtime().debug(self.treeDiagCharsFile)
        with open(self.treeDiagCharsFile) as handle:
            diagchars = list(parser.parse(handle))
            runtime().debug("DiagChars", len(diagchars))
            for d in diagchars:
                session.add(d)
        session.flush()

    def _codeml(self):
        if not self.codemlFile:
            return
        assert self.family.id != None
        assert self.tree.id != None

        # We need to convert the columns to the original alignment indices
        mapper = CulledColumnMapper(self.alignment, self.alignment.culled_columns)
        parser = PositiveSelectionParser()
        models = list(parser.parse(self.codemlFile))
        runtime().debug("Found", len(models), "models")
        for i, model in enumerate(models):
            model.tree_key = self.tree.id
            self.session.add(model)
            self.session.flush()
            ps = list(model.ps)
            runtime().debug("Found", len(ps), "sites in model", model.model)
            for j, site in enumerate(ps):
                site.codeml_key = model.id
                # Indices in CodeML start at 1, convert to 0 and then map
                orig = site.column
                site.column = mapper[site.column - 1]
                runtime().debug("column", orig, "mapped to", site.column, site.probability)
                try:
                    self.session.add(site)
                except:
                    runtime().debug(i, ":", j, " failure on column", orig, "mapped to", site.column, site.probability)
                    raise
            runtime().debug("Finished with model")
            self.session.flush()

        #        with open(self.codemlFile) as handle:
        #            text = handle.read()
        #        from hpf.hddb.db import CodeML
        #        self.codeml = CodeML(tree_key=self.tree.id,
        #                             filename=self.codemlFile,
        #                             text=text)
        #        self.session.add(self.codeml)
        #        self.session.flush()
        #        parser = LRTParser(self.alignment, self.alignment.culled_columns,self.codeml)
        #        with open(self.codemlFile) as handle:
        #            for selection in parser.parse(handle):
        #                selection.codeml_key = self.codeml.id
        #                self.session.merge(selection)
        runtime().debug("finished import codeml")

    def _alignment(self):
        session = self.session

        # Read the alignment
        from Bio import AlignIO

        with open(self.alignFile) as handle:
            align = AlignIO.read(handle, self.alignFormat)
        # Rename 'id' with the correct protein key
        for record in align:
            record.id = self._index(record.id)
        # Write to a text buffer and create the DB object
        text = StringIO()
        AlignIO.write([align], text, self.alignFormat)
        from hpf.hddb.db import Alignment

        self.alignment = Alignment(
            family_key=self.family.id, format=self.alignFormat, filename=self.alignFile, text=text.getvalue()
        )
        # Add to session and flush
        session.add(self.alignment)
        session.flush()

        # Flip through the proteins in the alignment and add
        # the records.
        for record in align:
            protein_key = record.id
            assert protein_key != 0 and protein_key != None, protein_key
            runtime().debug("protein: ", protein_key)
            from hpf.hddb.db import AlignmentProtein

            s = AlignmentProtein(alignment_key=self.alignment.id, protein_key=protein_key, sequence=str(record.seq))
            session.add(s)
            session.flush()

            # There may exist multiple alignments, but the definition
            # of membership in the family is done here.
            from hpf.hddb.db import FamilyProtein

            fs = FamilyProtein(family_key=self.family.id, protein_key=protein_key, seed=True)
            session.merge(fs)

        # Now read the column culling log.  Indices start at 0 here.
        from hpf.hddb.db import AlignmentColcull, AlignmentSeqcull

        with open(self.alignColcullLog) as handle:
            for line in handle:
                column, gap, taxa, ratio = line.split()
                col = AlignmentColcull(alignment_key=self.alignment.id, column=column, gap_percentage=ratio)
                session.merge(col)
        with open(self.alignSeqcullLog) as handle:
            # rice#1182215    0.712765957446808
            for line in handle:
                parts = line.split()
                seq, score = parts
                seq = self._index(seq)
                # seq.split("#")[-1]
                if not seq.isdigit():
                    print parts, "SEQ:", seq
                    assert False
                cul = AlignmentSeqcull(alignment_key=self.alignment.id, protein_key=seq, score=score)
                session.merge(cul)
        session.flush()

    def _family(self):
        session = self.session
        from hpf.hddb.db import Family

        self.family = Family(name=self.familyName, experiment_key=0)
        session.add(self.family)
        session.flush()
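A usage sketch for the importer (file names here are hypothetical; the constructor signature is the one above):

# Hypothetical file names. merge() creates the family, alignment, and tree
# on first import, and only fills in codeml results on subsequent runs.
importer = OIDImporter("myFamily",
                       alignFile="myFamily.fasta",
                       alignColcullLog="myFamily.colcull.log",
                       alignSeqcullLog="myFamily.seqcull.log",
                       treeFile="myFamily.tree",
                       treeDiagCharsFile="myFamily.diagchars",
                       codemlFile="myFamily.codeml")
importer.merge()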
Example #5
File: __init__.py  Project: bsmithers/hpf
class McmDBExporter(object):
    """
    Writes an MCM data file, calculating SS values using DSSP.
    """
    
    # Order of columns for data file
    columns = ["structure_key",
               "sequence_key",
               "length", 
               "percent_alpha", 
               "percent_beta", 
               "sccs", 
               "astral_ac"]
    
    def __init__(self, 
                 mcmdb, 
                 mammoth_list="list.mammoth", 
                 info_file="data.mammothDb", 
                 dir=None):
        self.mcmdb = mcmdb
        self.mammoth_list = mammoth_list
        self.info_file = info_file
        self.dir = dir
        
    def __enter__(self):
        if not self.dir:
            self.dir = mkdtemp()
        from hpf.hddb.db import Session
        self.session = Session()
        self.list_handle = open(self.mammoth_list,"w")
        self.info_handle = open(self.info_file,"w")
        
        print >>self.info_handle, "\t".join(McmDBExporter.columns)
        print >>self.list_handle, "MAMMOTH List\n%s" % self.dir
        return self
        
    def __exit__(self, type, value, traceback):
        self.session.close()
        self.session = None
        self.list_handle.close()
        self.info_handle.close()

    def export(self):
        for id in self.mcmdb._ids:
            try:
                self._write(self._dict(id))
            except:
                print "error on",id
                continue

    def _write(self, d):
        data = [d[key] for key in McmDBExporter.columns]
        astral = d["astral"]
        print >>self.info_handle, "\t".join([str(value) for value in data])
        f = "%i.pdb" % astral.structure_key
        print >>self.list_handle, f
        full = os.path.join(self.dir,f)
        with open(full,"w") as handle:
            handle.write(astral.structure.text)
        
        
    def _dict(self, id):
        """
        @return: dict keyed by McmDBExporter.columns (plus the Astral
        object itself under "astral") for the given Astral id.
        """
        values = {}
        from hpf.hddb.db import Astral
        astral = self.session.query(Astral).get(id)
        dssp = self._dssp(astral.structure)
        
        ss = []
        first_model = list(astral.structure.pdb)[0]
        for i,res in enumerate(first_model.get_residues()):
            chain = res.get_parent().get_id() 
            hetero,seq,insertion = res.get_id()
            key = (chain,(' ',seq,insertion))
            try:
                aa, s, accessibility = dssp[key]
                ss.append(s)
            except:
                continue
        length = i+1
        assert float(abs(length-len(ss)))/float(length)<0.1
        alpha = [a for a in ss if a in ('H','G','I')]
        beta = [a for a in ss if a in ('E','B')]
        
        values['percent_alpha'] = float(len(alpha))/float(length)
        values['percent_beta'] = float(len(beta))/float(length)
        values["astral"] = astral
        values["length"] = length
        values["sccs"] = astral.sccs
        values["astral_ac"] = astral.stype+astral.pdbid+astral.part
        values["sequence_key"] = astral.sequence_key
        values["structure_key"] = astral.structure_key
        
        return values
            
    def _dssp(self, structure):
        structure_temp = NamedTemporaryFile(dir = self.dir)
        structure_file = structure_temp.name
        with open(structure_file,"w") as handle:
            handle.write(structure.text)
        
        dssp_temp = NamedTemporaryFile(dir = self.dir)
        cmd = "dssp %s > %s" % (structure_file, dssp_temp.name)
        import subprocess
        print cmd
        subprocess.check_call(cmd,shell=True)
        from Bio.PDB.DSSP import make_dssp_dict
        dssp_dict, keys = make_dssp_dict(dssp_temp.name)
        # This can be dangerous...
        #os.system("rm "+out_file)
        return dssp_dict
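McmDBExporter is a context manager; a sketch of driving it (mcmdb, an object exposing the _ids that export() iterates, is assumed to be built elsewhere in the project):

# Sketch: write one PDB file per Astral entry plus the tab-separated
# data file rows; mcmdb is an assumed, already-populated object.
with McmDBExporter(mcmdb, mammoth_list="list.mammoth",
                   info_file="data.mammothDb") as exporter:
    exporter.export()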
Example #6
def tasks():
    from hpf.hddb.db import Session, Family
    session = Session()
    ids = session.query(Family.id).filter(Family.manually_curated==0).all()
    session.close()
    return [i[0] for i in ids]
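tasks() pairs naturally with the xml() writer from Example #1; a minimal driver (sketch):

# Sketch: export an XML document for every non-curated family.
for family_id in tasks():
    xml(family_id)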
Example #7
class MCM(object):
    """A class for wrapping full MCM functionality. Completes all steps necessary for running MCM
    PARAMETERS:
    decoy_file  - Rosetta denovo results file (silent format), named by code. eg: na977.result
    code        - (Optional) Rosetta prediction code. Will be parsed from filename if not given.
                  NOTE: code given should match code in decoy filename
    work_dir    - (Optional) a base dir to create working directory in (work_dir/code)
    trim_score  - First filter for decoys from denovo results. Keep the best 'trim_score' scored decoys. DEFAULT 15,000
    trim_rg     - Second filter for denovo results decoys. Of the best 'trim_score' decoys, keep the best 'trim_rg' decoys. DEFAULT 10,000
                  NOTE: rg is Radius of Gyration. Lower is better.
    extract_log - A file for the pdb file extractor to output to. DEFAULT 'extract.log'
    rosetta_pathsfile - The location of the "paths.txt" file required for Rosetta (for extract functionality)
    mammoth_listfile  - The mammoth listfile containing all PDB filenames to compare against and their location
                          NOTE: Format is "MAMMOTH List\n<dir with pdb files>\n<pdbfile>\n<pdbfile>\n...<pdbfile>\n"
    mammoth_datafile  - The mammoth database file containing information on all astral scop structures
                          NOTE: Columns: "structure_key   sequence_key    length  percent_alpha   percent_beta    sccs    astral_ac"
    ginzu_version     - The ginzu_version of this MCM run. (The ginzu version of the results data that goes into this run)
                          NOTE: Used mostly for Psipred/Secondary structure prediction gathering
    nr_db       - The location of the blast 'nr' database. Used when Psipred SS preds do not exist for decoy sequences
                  in the HPF database.
    dbstore     - True stores cluster information (centers, convergence, and structures) and Mcm results in the database, False does not
    cleanup     - True removes working files, False leaves them in 'work_dir'/'code'
    debug       - True prints results information and debug information to screen

    INSTANCE VARIABLES:
    foldable_record - (dbo) ORM object of the foldable record corresponding to prediction code
    sequence        - (str) amino acid sequence OF THE FOLDABLE RECORD (note: may be different from domain sequence)
    sequence_key    - (int) sequence key OF THE FOLDABLE RECORD (see above), retrieved by Rosetta code (eg oa123)
    parent_sequence_key - (int) sequence key of the parent protein, from foldable record fetched by rosetta code
    """

    def __init__(self, decoy_file, 
                       code=None, 
                       work_dir='/tmp/mcm', 
                       trim_score=15000, 
                       trim_rg=10000, 
                       extract_log='extract.log',
                       rosetta_pathsfile='paths.txt', 
                       mammoth_listfile='list.mammoth', 
                       mammoth_datafile='data.mammothDb',
                       ginzu_version=4,
                       ignore_ginzu_version=False,
                       nr_db="nr",
                       dbstore=True, 
                       cleanup=True,
                       debug=False):
        #DEBUG
        print "Initializing total-MCM object...",
        
        self.decoy_file = os.path.abspath(os.path.expanduser(decoy_file))
        self.code = self._check_code(code) if code else self._parse_code(decoy_file)
        
        self.base_dir = os.path.abspath(os.path.expanduser(work_dir))
        self.work_dir = os.path.join(self.base_dir, self.code)

        self.trim_score = trim_score
        self.trim_rg = trim_rg
        self.extract_log = extract_log
        self.rosetta_pathsfile = rosetta_pathsfile

        self.mammoth_listfile = mammoth_listfile
        self.mammoth_datafile = mammoth_datafile
        self.ginzu_version = ginzu_version
        self.ignore_ginzu_version = ignore_ginzu_version 
        self.nr_db = nr_db

        self.dbstore = dbstore
        self.cleanup = cleanup
        self.debug   = debug
        
        # Define a session instance var for HPF DB use
        self.session = None
        
        # Retrieve foldable record, sequence, and sequence_key from HPF db via prediction code
        self.foldable_record = self._get_foldable_record()
        self.sequence = self.foldable_record.sequence.sequence
        self.sequence_key = self.foldable_record.sequence_key
        self.parent_sequence_key = self.foldable_record.parent_sequence_key

        #DEBUG
        print "Initializing total-MCM object complete"


    def run(self, ):
        """Instance variables set:
        filtered_decoy_file - filename of file that filtered set of decoys from original silent file is written to
        cluster_cmd - filename of the command file to be passed to clusterer
        cluster_out - filename of clusterer results file
        cluster_log - filename of log where clusterer STDOUT and STDERR are written
        ss_pred     - the Psipred secondary structure prediction string for this sequence
        """

        # Setup working environment (create dirs, etc)
        print "Setting up MCM working environment in {0}".format(self.work_dir)
        self._setup_env()
    
        # Parse decoy (denovo result) file and filter to X best scores, and Y of those best RGs
        print "Parsing and filtering de Novo results: {0} best score down to {1} best RG".format(self.trim_score, self.trim_rg)
        denovo_results = DenovoResultFile(filename=self.decoy_file, prediction_code=self.code)
        best_results = denovo_results.get_top_count(self.trim_score)
        best_results = sorted(best_results, key=lambda r: r.radius_gyration)[:self.trim_rg]

        # Write filtered results to file (to be passed to clusterer - the filtered decoy file will be used from here on)
        self.filtered_decoy_file = "{0}.score{1}.rg{2}".format(os.path.join(self.work_dir, self.code), self.trim_score, self.trim_rg)
        print "Writing filtered de Novo results to file {0}".format(self.filtered_decoy_file)
        denovo_results.write_to_file(outfile=self.filtered_decoy_file, results_list=best_results)

        # Parse filtered decoy file into a DenovoResultFile object and remove old denovo results object.
        # (Because clusterer runs on the filtered decoy file, need a DRF obj that corresponds to the indices output by the clusterer)
        filtered_denovo_results = DenovoResultFile(filename=self.filtered_decoy_file, prediction_code=self.code)
        del(denovo_results)

        # Run Rosetta clusterer on filtered set of decoys (not passing sequence - will be parsed from decoyfile)
        print "Running the robetta clusterer"
        self.cluster_cmd = self.code + ".cluster_cmd"
        self.cluster_out = self.code + ".cluster_out"
        self.cluster_log = self.code + ".cluster_log"
        clusterer = RobettaClusterer(command_file=self.cluster_cmd, 
                                     decoy_file=self.filtered_decoy_file, 
                                     outfile=self.cluster_out, 
                                     log_file=self.cluster_log, 
                                     path=self.work_dir)
        
        # Parse clusterer results into RobettaCluster object - contains RobettaConvergence obj (convergence) 
        #   and list+dict of RobettaClusterCenter objs (centers + centers_dict)
        print "Parsing clusterer results from file {0}".format(clusterer.outfile)
        cluster_results = RobettaClusterer.parse(clusterer.outfile)

        # Check returned cluster results: if no centers, problem in parsing
        if not cluster_results.centers:
            raise Exception("Error: no cluster centers parsed from outfile {0}".format(clusterer.outfile))

        #DEBUG: print cluster results
        if self.debug:
            print "+++Convergence:"
            print "\t{0}".format(cluster_results.convergence)
            print "+++Cluster Centers:"
            for c in cluster_results.centers:
                print "\t{0}".format(c)

        # Extract atom records from cluster centers and decoy file, output pdb files per center: decoy_<index>.pdb
        #   return is a dict, index => pdb_filename. Sets centers' pdb_file instance var
        print "Extracting cluster centers from de Novo silent result file into pdb atom record files"
        center_files_dict = self._extract_pdbs(filtered_denovo_results, cluster_results, self.extract_log, self.rosetta_pathsfile)

        # Store cluster info and centers in DB (rosetta_convergence (linked to fsO), rosetta_cluster, structure)
        if self.dbstore:
            print "Storing cluster convergence record and cluster centers to DB"
            convergence_record = self._store_convergence(cluster_results.convergence, self.foldable_record.id, self.filtered_decoy_file)
            center_record_dict = self._store_cluster_centers(cluster_results.centers, self.sequence_key, convergence_record.id)

        # Create hpf.mcm.McmDB object to contain mammoth list and data files, used in Mammoth and MCM functionality
        print "Creating Mammoth/MCM reference database object"
        with open(self.mammoth_datafile) as handle:
            mcmdb = McmDB.load(handle)
        mcmdb.list_file = self.mammoth_listfile
        mcmdb.scop = SCOP_VERSION

        # Mammoth cluster centers (via mcm.MammothRun) and parse mammoth scores (many per cluster center)
        print "Running Mammoth on all cluster centers, and parsing mammoth scores"
        mammoth = MammothRun(decoys=center_files_dict.values(), 
                             experiment_listfile=mcmdb.list_file, 
                             prediction_listfile="prediction.mammoth", 
                             directory=self.work_dir, 
                             outfile="mammoth.results", 
                             autorun=True,
                             debug=self.debug)
        mammoth_scores = mammoth.parse_scores()
        
        # Get foldable sequence's SS pred (get parent protein's SS, snip foldable's SS out of it) 
        #print "Getting Psipred SS predictions"
        fold_start, fold_stop = self._get_foldable_range(self.code, self.parent_sequence_key, self.sequence)
        self.ss_pred = self._get_ss_region(self.parent_sequence_key, self.ginzu_version, fold_start, fold_stop, ignore_ginzu_version=self.ignore_ginzu_version)
        if self.debug:
            print "Foldable range: {0} - {1}".format(fold_start, fold_stop)
            print "Foldable Psipred SS: {0}".format(self.ss_pred)
        
        # Calculate required MCM values (% alpha/beta, seq length) using the snipped-out SS prediction.
        sequence_length, percent_alpha, percent_beta = percent_alpha_beta(self.sequence, self.ss_pred)
        if self.debug: print "Percent alpha: {0}, Percent beta: {1}".format(percent_alpha, percent_beta)

        # MCM all mammoth scores (via McmFactory fctly). mcm_scores is list of McmData DBOs
        print "Creating McmFactory and running MCM on all mammoth scores"
        mcm_factory = McmNoTaskFactory(mcmdb, percent_alpha, percent_beta, cluster_results.convergence.radius1, debug=False)
        mcm_scores = map(mcm_factory.create, mammoth_scores)

        #DEBUG: print top MCM scores (number defined by KEEP_MCM)
        if self.debug:
            print "+++Top Kept MCM Scores (x2 to see top spread):"
            for m in sorted(mcm_scores, reverse=True)[:KEEP_MCM*2]:
                print "\t{0}".format(m.str_long())

        # Store mcm scores in DB (mcm/mcmdata table)
        if self.dbstore:
            print "Storing MCM records in HPF DB"
            # Sort in reverse: highest (aka best scores) first
            mcm_scores.sort(reverse=True)
            self._store_mcm_data(mcm_scores[:KEEP_MCM], self.sequence_key, self.foldable_record.id, convergence_record.id, center_record_dict)

        # Clean up working files
        if self.cleanup: 
            print "Removing working directory '{0}' and all contents".format(self.work_dir)
            self._cleanup()

        print "MCM run on {0}, sequence {1} complete".format(self.code, self.sequence_key)
        print "decoy file: {0}, filtered decoy file: {1}".format(self.decoy_file, self.filtered_decoy_file)
        print "working files in '{0}'".format(self.work_dir)


    def _parse_code(self, filename):
        """Parse Rosetta prediction code from given filename. If a code is not found, raise exception"""
        found = re.search(CODE_PATTERN, filename)
        if not found: 
            raise Exception("Could not parse prediction code from filename '{0}'".format(filename))
        return found.group('code')        

    def _check_code(self, code):
        """Checks code against static CODE PATTERN. If correct, returns code. Otherwise, exception"""
        if not re.match(CODE_PATTERN, code):
            raise Exception("Given code '{0}' does not match Rosetta prediction code form (eg: aa111)".format(code))
        return code

    def _setup_env(self, ):
        """Creates and checks directories and files. Changes working directory"""
        if not os.path.isdir(self.base_dir):
            os.mkdir(self.base_dir)
        if not os.path.isdir(self.work_dir):
            os.mkdir(self.work_dir)
        os.chdir(self.work_dir)

        # Check decoy file's existence
        if not os.path.isfile(self.decoy_file):
            raise IOError("Given decoy file '{0}' does not exist (is not a file)".format(self.decoy_file))
        # Check for Mammoth files (always required)
        if not os.path.isfile(self.mammoth_listfile):
            raise IOError("Given mammoth list file '{0}' does not exist or is not accessible".format(self.mammoth_listfile))
        if not os.path.isfile(self.mammoth_datafile):
            raise IOError("Given mammoth data file '{0}' does not exist or is not accessible".format(self.mammoth_datafile))
        # Check for rosetta_pathsfile, required for extracting pdb atom records from silent files
        if not os.path.isfile(self.rosetta_pathsfile):
            raise IOError("Given rosetta paths file '{0}' does not exist or is not accessible".format(self.rosetta_pathsfile))
        # kdrew: removing nr db check
        # Do a warning check for files required if Psipred will be run
        #if not os.path.isfile(self.nr_db):
        #    raise Warning("Given nr_db file '{0}' does not exist or is not accessible".format(self.nr_db))

    def _check_session(self, ):
        """Opens session if self.session is None. Closes the session (keeps it from expiring, auto-opened when used)"""
        if not self.session:
            self.session = Session()
        self.session.close()

    def _extract_pdbs(self, denovo_results, cluster_results, log=None, pathfile=None):
        """For each cluster center in cluster results, make an individual silent file and then extracts the PDB
        record from that silent file (outputs PDB record to file)
          denovo_results  - DenovoResultFile object (holds silent records)
          cluster_results - RobettaCluster object (holds cluster centers)
        Returns a dict of form {index => pdb_filename} for all cluster centers
        NOTE: It looks odd to create an individ. silent file for each decoy, but is the only way to guarantee
        extractor is pulling the right decoy for given S_id (which are not unique - why, I'll never know)
        """
        from hpf.mcm.extract import extract
        pdb_files = dict()
        
        for center in cluster_results:
            # Create filenames for holding silent file and target to move output PDB file to
            silent_file = "decoy_{0}.silent".format(center.index)
            pdb_file = "decoy_{0}.pdb".format(center.index)
            
            # Get and write silent record
            silent_record = denovo_results[center.index]
            denovo_results.write_to_file(outfile=silent_file, results_list=[silent_record])
            
            # Run extract and move extracted file to named pdb file
            extract_file = extract(silent_file, center.rosetta_id, log_file=log, paths=pathfile, debug=self.debug)
            shutil.move(extract_file, pdb_file)
            if not os.path.isfile(pdb_file):
                raise OSError("Extract functionality in MCM failed to create pdb file '{0}'".format(pdb_file))
           
            # Set center's pdb_file attribute to created file and add to dictionary
            center.pdb_file = pdb_file
            pdb_files[center.index] = pdb_file
            
            if self.cleanup: 
                os.remove(silent_file)
        return pdb_files
    
    def _get_foldable_record(self, ):
        """Gets an hpf.filesystemOutfile (ORM: FilesystemOutfile) record from the DB based on prediction code
        Should be considered a record of the foldable domain sequence (filesystemOutfile name makes no sense)
        """
        from hpf.hddb.db import FilesystemOutfile
        self._check_session()
        foldable = self.session.query(FilesystemOutfile).filter_by(prediction_code=self.code).first()
        if not foldable:
            raise Exception("Failed to find foldable record with code '{0}' in DB".format(self.code))
        return foldable
    
    def _get_foldable_range(self, prediction_code, parent_seq_key, foldable_sequence):
        """Returns the start and stop RANGE numbers (beginning at 1, not 0) of the foldable record's sequence
        (the position at which the foldable sequence starts and stops in the sequence of its parent protein). 
        Returns tuple of long ints if found
        """
        from hpf.hddb.db import Domain
        self._check_session()
        domain = self.session.query(Domain).filter_by(ibm_prediction_code=prediction_code, parent_sequence_key=parent_seq_key).first()
        if not domain:
            raise Exception("No domain found in DB with code {0} and parent seq key {1}".format(prediction_code, parent_seq_key))
        if not domain.region:
            raise Exception("Domain ID:{0} has no region. Database error".format(domain.id))
        
        f = re.search(foldable_sequence, domain.sequence.sequence)
        if not f:
            raise Exception("Foldable sequence '{0}' not found within Domain ID: {1} sequence".format(foldable_sequence, domain.id))
        fold_start_index = f.start()
        fold_stop_index = f.end() - 1
        return domain.region.start + fold_start_index, domain.region.start + fold_stop_index 
        
    def _get_ss_region(self, parent_sequence_key, ginzu_version, start, stop, ignore_ginzu_version=False):
        """Returns a region of the parent sequence's Psipred SS string. First checks
        the HPF DB for a pre-existing parent SS prediction. If found, return the appropriate 
        region. Otherwise, run Psipred on the parent sequence and snip the range out of
        the resulting SS string.
        NOTE: start and stop are "range" numbers. IE, 1-indexed (first residue at 1, second at 2...
        EG  : start 1, stop 10 will give substring starting at index 0, last char at index 9
        """
        from hpf.hddb.db import Psipred
        self._check_session()
        if ignore_ginzu_version:
            parent_psipred = self.session.query(Psipred).filter_by(sequence_key=parent_sequence_key).first()
        else:
            parent_psipred = self.session.query(Psipred).filter_by(sequence_key=parent_sequence_key, ginzu_version=ginzu_version).first()

        if parent_psipred:
            parent_ss = parent_psipred.prediction
        else:
            parent_ss = HPFPsipredWrap(sequence_key=parent_sequence_key,
                                          nr_db=self.nr_db,
                                          ginzu_version=self.ginzu_version,
                                          autorun=True,
                                          dbstore=self.dbstore
                                          ).get_prediction_string()
        #DEBUG
        if self.debug:
            print "Parent protein Psipred SS: {0}".format(parent_ss)

        return parent_ss[start-1:stop]


    def _store_convergence(self, convergence, foldable_key, decoy_file):
        """Stores convergence values in hpf.rosetta_convergence (ORM: RosettaConvergence)
        Links to hpf.filesystemOutfile via foldable_key (id)
        Parameters:
          convergence   - hpf.mcm.cluster.RobettaConvergence object containing cluster convergence info
          foldable_key  - the ID of the hpf.filesystemOutfile (ORM: FilesytemOutfile) entry for this code
          decoy_file    - the filename of decoy file given to the clusterer to cluster
        Returns the successfully added RosettaConvergence ORM object
        """
        from hpf.hddb.db import RosettaConvergence
        self._check_session()
        cv = RosettaConvergence(outfile_key=foldable_key, target=decoy_file,
                radius1=convergence.radius1, size1=convergence.size1,
                radius2=convergence.radius2, size2=convergence.size2,
                total_decoys=convergence.total_decoys)
        cv = push_to_db(self.session, cv, exception_str="Failed to add RosettaConvergence (outfile_key: {0}) to DB".format(cv.outfile_key))
        return cv

    def _store_cluster_centers(self, cluster_centers, sequence_key, convergence_key):
        """Stores cluster centers in hpf.rosetta_cluster (ORM: RosettaCluster) and centers' structures in hpf.structure (ORM: Structure)
        Parameters:
          cluster_centers - list of hpf.mcm.cluster.RobettaClusterCenter objs
          sequence_key    - the sequence key of the domain sequence decoys were created from
          convergence_key - ID of the corresponding RosettaConvergence ORM object to link ClusterCenters to
        NOTE: RobettaCluster objects MUST have their pdb_file parameters set
        Returns dict of added RosettaCluster DBOs (linked to Structure objs by key), {index => RosettaCluster obj}
        """
        from hpf.hddb.db import Structure, RosettaCluster
        self._check_session()
        centers_dict = dict()
        for center in cluster_centers:
            atom_record = center.get_atom_record()
            if atom_record == None or atom_record == "":
                raise Exception("Failed to get atom record for center {0}".format(center))
            
            # Create and push Structure ORM object
            struct = Structure(sequence_key=sequence_key, structure_type="decoy", comment=center.pdb_file, text=atom_record)
            struct = push_to_db(self.session, struct, exception_str="Failed to add Structure for center {0} to DB".format(center))
            
            # Create and push RosettaCluster
            cc = RosettaCluster(index=center.index, size=center.size, rank=center.rank, convergence_key=convergence_key, structure_key=struct.id)
            push_to_db(self.session, cc, exception_str="Failed to add RosettaCluster for center {0} to DB".format(center))

            # Add to centers dict
            centers_dict[center.index] = cc
        return centers_dict

    def _store_mcm_data(self, mcm_scores, sequence_key, outfile_key, convergence_key, centers_dict):
        """Creates and populates Mammoth and corresponding McmData ORM objs with linking IDs (sequence, outfile,
        convergence, and structure), then pushes to the DB. Generally store top 5 MCM scores
        Parameters:
          mcm_scores    - list of hpf.hddb.db.McmData objects to store in DB
          sequence_key  - ID of sequence from which MCM scores come
          outfile_key   - ID of the foldable record (FilesystemOutfile) corresponding to the seq and MCM scores
          convergence_key - ID of the convergence info for this sequence's cluster info (RosettaConvergence)
          centers_dict  - dict of form {CC index => OBJ} where object is anything w/ instance variable 'structure_key'
                          corresponding to the CC index's structure (EG: RosettaCluster DBO)
        This function will also fetch the structure key of the MCM score's mammothed cluster center and the
        structure key of the MCM score's mammoth astral structure (must do per mcm score)
        """
        from hpf.hddb.db import MammothFactory
        factory = MammothFactory()
        
        self._check_session()

        for score in mcm_scores:
            cc_index = int(score.mammoth.prediction.split(".")[0][6:])
            structure_key = centers_dict[cc_index].structure_key
            astral_structure_key = int(score.mammoth.experiment.split(".")[0])
            
            mammoth_dbo = factory.create(score.mammoth)
            mammoth_dbo.p_structure_key = structure_key
            mammoth_dbo.e_structure_key = astral_structure_key
            push_to_db(self.session, mammoth_dbo, 
                       exception_str="Failed to add {0} to DB for sequence {1}, index {2}".format(mammoth_dbo, sequence_key, cc_index))
            
            score.sequence_key = sequence_key
            score.outfile_key  = outfile_key
            score.convergence_key = convergence_key
            score.structure_key = structure_key
            score.astral_structure_key = astral_structure_key
            push_to_db(self.session, score,
                       exception_str="Failed to add {0} to DB for sequence {1}, index {2}".format(score, sequence_key, cc_index))
    
    def _cleanup(self, ):
        """Force removal of created working directory"""
        from subprocess import check_call
        if re.search(CODE_PATTERN+r"/?$", self.work_dir):
            ret = check_call(["rm", "-r", "-f", self.work_dir])
        else:
            raise Exception("Working directory '{0}' not valid for removal (must be a code directory, eg oa123".format(self.work_dir))
        return ret
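Finally, a usage sketch for the full pipeline (the decoy file name follows the docstring's example; all paths are hypothetical):

# Hypothetical paths; dbstore=False skips the cluster/MCM DB writes,
# though __init__ still reads the foldable record from the HPF DB.
mcm = MCM("na977.result",
          work_dir="/tmp/mcm",
          rosetta_pathsfile="paths.txt",
          mammoth_listfile="list.mammoth",
          mammoth_datafile="data.mammothDb",
          dbstore=False,
          debug=True)
mcm.run()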