Exemplo n.º 1
0
    def process(self, db, queries, nucleotide, min_hitidentity, min_hitlength, max_evalue):
        """Run blastx for every batch of `queries` against `db`.

        The search settings are stored on the instance, a BLAST database is
        built from `db`, one BlastJob per batch is pushed onto a fresh
        WorkQueue, and the call blocks until that queue has been joined.

        Returns self.gene_assignments.
        """
        # remember the search settings on the instance
        self.nucleotide = nucleotide
        self.min_hitidentity = min_hitidentity
        self.min_hitlength = min_hitlength
        self.max_evalue = max_evalue

        # the index files are shared by every blastx job, so they are
        # registered for cleanup here rather than by the individual jobs
        index_suffixes = (".phr", ".pin", ".psq")
        self.cleanup_files += [db + suffix for suffix in index_suffixes]

        # creates db + {phr,pin,psq} in same dir as db
        self.log.info("creating blast db...")
        # XXX the db is always protein, because we want to run blastx
        Blast.makedb(db)

        self.log.info("starting local alignments...")
        self.q = WorkQueue()

        # progress bookkeeping; complete_jobs starts one batch below zero
        # (presumably _progress() adds a batch back each call - confirm)
        self.total_jobs = len(queries)
        self.complete_jobs = -self.batch_size
        self._progress()

        # one job per batch of queries
        for batch in self._batch(queries):
            job = BlastJob(self.job_callback, db, batch, "blastx")
            self.q.enqueue(job)

        self.log.debug("waiting for job queue to drain...")
        self.q.join()

        rm_f(self.cleanup_files)

        return self.gene_assignments
Exemplo n.º 2
0
    def stop(self) :
        """Stop the search and the job queue, then clean up and flush state.

        The intermediate results of the search are merged into self.info
        before the registered cleanup files are removed and both self.info
        and self.param are flushed.
        """
        self.search.stop()

        # keep whatever the search had produced up to this point
        partial_results = self.search.get_intermediate_results()
        self.info.update_query_gene_mapping(partial_results)

        # self.q may be unset/falsy if no queue was ever created
        queue = self.q
        if queue :
            queue.stop()

        rm_f(self.cleanup_files)

        self.info.flush()
        self.param.flush()
Exemplo n.º 3
0
Arquivo: pagan.py Projeto: ajm/glutton
    def run(self, queries_fname, out_fname, alignment_fname, tree_fname=None, min_identity=0.5, min_overlap=0.1):
        """Build the argument list and hand it to self._execute().

        Parameters:
            queries_fname   -- passed as --queryfile
            out_fname       -- passed as --outfile; also the prefix of the
                               ".fas" and ".codon.fas" result filenames
                               recorded on the instance
            alignment_fname -- passed as --ref-seqfile
            tree_fname      -- optional, passed as --ref-treefile when truthy
            min_identity    -- passed as --min-query-identity
            min_overlap     -- passed as --min-query-overlap

        A fresh temporary directory is created for --temp-folder. Its
        q*.fas / t*.fas files and the directory itself are removed
        afterwards, even if self._execute() raises. A directory that
        cannot be removed is logged as an error, not raised.
        """
        tmpdir = tempfile.mkdtemp()

        parameters = [
            "--ref-seqfile",
            alignment_fname,
            "--queryfile",
            queries_fname,
            "--outfile",
            out_fname,
            "--temp-folder",
            tmpdir,
            "--fast",
            "--terminal-nodes",
            "--min-query-overlap",
            str(min_overlap),
            "--min-query-identity",
            str(min_identity),
            "--translate",
            "--threads",
            "1",
        ]

        if tree_fname:
            parameters.append("--ref-treefile")
            parameters.append(tree_fname)

        self.protein_alignment_fname = out_fname + ".fas"
        self.nucleotide_alignment_fname = out_fname + ".codon.fas"

        try:
            returncode, output = self._execute(parameters, self.output_filenames(out_fname))

        finally:
            # always tidy the temporary directory, otherwise a failing
            # _execute() would leak one directory per call
            rm_f(glob(join(tmpdir, "q*.fas")) + glob(join(tmpdir, "t*.fas")))

            try:
                # rmdir only succeeds on an empty directory, so unexpected
                # leftovers are reported rather than silently deleted
                os.rmdir(tmpdir)

            except OSError as ose:
                self.log.error("could not delete '%s': %s" % (tmpdir, str(ose)))
Exemplo n.º 4
0
Arquivo: job.py Projeto: ajm/glutton
 def cleanup(self) :
     """Delete each name from _get_filenames() for which isfile() is true."""
     for fname in self._get_filenames() :
         # skip empty entries and anything that is not an existing file
         if not fname or not isfile(fname) :
             continue

         self.log.debug("deleting %s" % fname)
         rm_f(fname)
Exemplo n.º 5
0
    def align(self) :
        """Assign pending contigs to genes, then queue one alignment job per gene family.

        Step 1: contigs listed by self.info.pending_queries() are searched
        against a database extracted from self.db and the resulting mapping
        is stored in self.info.
        Step 2: the mapping is regrouped by gene family and a PaganJob is
        enqueued on a new WorkQueue for every family that has not been run
        before.

        Returns None early when every gene family already has a result.
        """
        self.log.info("starting alignment procedure")

        # convert the names of the contigs to something no program can complain about
        # + filter out the ones that could never have a long enough alignment
        contigs = self._read_contigs()

        # only the contigs that self.info still reports as pending are searched
        pending_contigs = [ contigs[i] for i in self.info.pending_queries() ]

        self.log.info("%d contigs have not been assigned to genes..." % len(pending_contigs))

        # depending on when the program was terminated this step may be complete or partially
        # complete 
        if pending_contigs :
            db_fname = self.db.extract_all()
            # registered so that stop() can remove it if the search is interrupted
            self.cleanup_files.append(db_fname)

            # do an all vs all search of contigs vs database of transcripts
            # return a dict of tmp ids with gene ids
            self.info.update_query_gene_mapping(
                self.search.process(
                    db_fname, 
                    pending_contigs,
                    self.db.nucleotide,
                    self.min_hitidentity,
                    self.min_hitlength,
                    self.max_evalue)
                )

            # the search finished normally, so the extracted db is no longer needed
            rm_f(db_fname)

        # save intermediate results
        self.info.flush()

        # use the database to convert the mapping from tmp id -> gene
        # to gene family -> list of (tmp id, strands)
        genefamily_contig_map = self.info.build_genefamily2contigs()
        
        self.log.info("%d contigs assigned to %d gene families" % 
                (sum([ len(i) for i in genefamily_contig_map.values() ]), len(genefamily_contig_map)))
        self.log.info("(%d have already been run)" % self.info.len_genefamily2filename())

        # nothing left to do when every family already has a recorded result
        if self.info.len_genefamily2filename() == len(genefamily_contig_map) :
            self.log.info("alignment already done, exiting early...")
            return
        else :
            self.log.info("starting alignments...")


        # queue all the alignments up using a work queue and pagan
        self.q = WorkQueue()

        # only the families without a recorded result count towards progress
        self.total_jobs = len(genefamily_contig_map) - self.info.len_genefamily2filename()
        # starts at -1 (presumably _progress() increments it to 0 - confirm)
        self.complete_jobs = -1
        self._progress()

        for famid in self.sort_keys_by_complexity(genefamily_contig_map) :
            # ignore the jobs that have already been run
            if self.info.in_genefamily2filename(famid) :
                continue

            try :
                # get the alignment and tree from the database
                alignment = self.db.get_alignment(famid)
                tree = alignment.get_tree()

                # get contigs
                job_contigs = [ self._correct_strand(contigs[contigid], strand) for contigid,strand in genefamily_contig_map[famid] ]

                # queue the job
                self.q.enqueue(
                    PaganJob(
                        self.job_callback,
                        job_contigs,
                        famid,
                        alignment,
                        tree,
                        self.min_alignidentity,
                        self.min_alignoverlap)
                    )

                # avoid the split code later in the loop...
                continue

            except GluttonDBError, gde :
                # this means we have never heard of this gene family
                self.log.warn(str(gde))
                continue

            # NOTE(review): if GluttonDBFileError subclasses GluttonDBError the
            # handler above catches it first and this one never runs - confirm
            except GluttonDBFileError, gdfe :
                # this means we have heard of the gene family, but the
                # alignment files were missing...
                self.log.warn(str(gdfe))
Exemplo n.º 6
0
    def stop(self):
        """Stop the job queue, if there is one, and remove the registered cleanup files."""
        queue = self.q

        # self.q may be unset/falsy if no queue was ever created
        if queue:
            queue.stop()

        rm_f(self.cleanup_files)