def process(self, db, queries, nucleotide, min_hitidentity, min_hitlength, max_evalue):
    self.nucleotide = nucleotide
    self.min_hitidentity = min_hitidentity
    self.min_hitlength = min_hitlength
    self.max_evalue = max_evalue

    # we need to deal with the index files here because
    # all of the blastx jobs need them
    self.cleanup_files += [db + i for i in [".phr", ".pin", ".psq"]]

    # creates db + {phr,pin,psq} in same dir as db
    self.log.info("creating blast db...")
    Blast.makedb(db)  # XXX THIS IS ALWAYS PROTEIN, BECAUSE WE WANT TO RUN BLASTX

    # queue up the jobs
    self.log.info("starting local alignments...")

    self.q = WorkQueue()

    self.total_jobs = len(queries)
    self.complete_jobs = -self.batch_size
    self._progress()

    for query in self._batch(queries):
        self.q.enqueue(BlastJob(self.job_callback, db, query, "blastx"))

    self.log.debug("waiting for job queue to drain...")
    self.q.join()

    rm_f(self.cleanup_files)

    return self.gene_assignments
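# NOTE: the _batch() helper used above is not shown in this section. The
# following is only a hedged sketch, assuming it yields fixed-size slices of
# the query list (which would also explain why complete_jobs starts at
# -self.batch_size); the name `batch` and the slicing behaviour are inferred,
# not taken from the source.
def batch(items, batch_size):
    # yield successive slices of `items`, each at most `batch_size` long,
    # so BLAST jobs can be enqueued one batch at a time
    for i in xrange(0, len(items), batch_size):
        yield items[i:i + batch_size]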
def stop(self):
    self.search.stop()
    self.info.update_query_gene_mapping(self.search.get_intermediate_results())

    if self.q:
        self.q.stop()

    rm_f(self.cleanup_files)

    self.info.flush()
    self.param.flush()
def run(self, queries_fname, out_fname, alignment_fname, tree_fname=None, min_identity=0.5, min_overlap=0.1):
    tmpdir = tempfile.mkdtemp()

    parameters = [
        "--ref-seqfile", alignment_fname,
        "--queryfile", queries_fname,
        "--outfile", out_fname,
        "--temp-folder", tmpdir,
        "--fast",
        "--terminal-nodes",
        "--min-query-overlap", str(min_overlap),
        "--min-query-identity", str(min_identity),
        "--translate",
        "--threads", "1",
    ]

    if tree_fname:
        parameters.append("--ref-treefile")
        parameters.append(tree_fname)

    self.protein_alignment_fname = out_fname + ".fas"
    self.nucleotide_alignment_fname = out_fname + ".codon.fas"

    returncode, output = self._execute(parameters, self.output_filenames(out_fname))

    rm_f(glob(join(tmpdir, "q*.fas")) + glob(join(tmpdir, "t*.fas")))

    try:
        os.rmdir(tmpdir)
    except OSError, ose:
        self.log.error("could not delete '%s': %s" % (tmpdir, str(ose)))
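# NOTE: self._execute() is not shown in this section. As a hedged sketch only,
# assuming it simply prepends the pagan binary to the parameter list, runs it
# with subprocess, and returns the exit code plus combined stdout/stderr (the
# real method presumably also checks the expected output files), it could look
# roughly like this:
import subprocess

def run_pagan(parameters, binary="pagan"):
    # e.g. ["pagan", "--ref-seqfile", ref, ..., "--threads", "1"]
    p = subprocess.Popen([binary] + parameters,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
    output = p.communicate()[0]
    return p.returncode, output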
def cleanup(self):
    for f in self._get_filenames():
        if f and isfile(f):
            self.log.debug("deleting %s" % f)
            rm_f(f)
def align(self):
    self.log.info("starting alignment procedure")

    # convert the names of the contigs to something no program can complain about
    # + filter out the ones that could never have a long enough alignment
    contigs = self._read_contigs()

    pending_contigs = [contigs[i] for i in self.info.pending_queries()]

    self.log.info("%d contigs have not been assigned to genes..." % len(pending_contigs))

    # depending on when the program was terminated this step may be
    # complete or partially complete
    if pending_contigs:
        db_fname = self.db.extract_all()
        self.cleanup_files.append(db_fname)

        # do an all vs all search of contigs vs database of transcripts
        # return a dict of tmp ids with gene ids
        self.info.update_query_gene_mapping(
            self.search.process(
                db_fname,
                pending_contigs,
                self.db.nucleotide,
                self.min_hitidentity,
                self.min_hitlength,
                self.max_evalue)
            )

        rm_f(db_fname)

    # save intermediate results
    self.info.flush()

    # use the database to convert the mapping from tmp id -> gene
    # to gene family -> list of (tmp id, strands)
    genefamily_contig_map = self.info.build_genefamily2contigs()

    self.log.info("%d contigs assigned to %d gene families" %
                  (sum([len(i) for i in genefamily_contig_map.values()]), len(genefamily_contig_map)))
    self.log.info("(%d have already been run)" % self.info.len_genefamily2filename())

    if self.info.len_genefamily2filename() == len(genefamily_contig_map):
        self.log.info("alignment already done, exiting early...")
        return
    else:
        self.log.info("starting alignments...")

    # queue all the alignments up using a work queue and pagan
    self.q = WorkQueue()

    self.total_jobs = len(genefamily_contig_map) - self.info.len_genefamily2filename()
    self.complete_jobs = -1
    self._progress()

    for famid in self.sort_keys_by_complexity(genefamily_contig_map):
        # ignore the jobs that have already been run
        if self.info.in_genefamily2filename(famid):
            continue

        try:
            # get the alignment and tree from the database
            alignment = self.db.get_alignment(famid)
            tree = alignment.get_tree()

            # get contigs
            job_contigs = [self._correct_strand(contigs[contigid], strand)
                           for contigid, strand in genefamily_contig_map[famid]]

            # queue the job
            self.q.enqueue(
                PaganJob(
                    self.job_callback,
                    job_contigs,
                    famid,
                    alignment,
                    tree,
                    self.min_alignidentity,
                    self.min_alignoverlap)
                )

            # avoid the split code later in the loop...
            continue

        except GluttonDBError, gde:
            # this means we have never heard of this gene family
            self.log.warn(str(gde))
            continue

        except GluttonDBFileError, gdfe:
            # this means we have heard of the gene family, but the
            # alignment files were missing...
            self.log.warn(str(gdfe))
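# NOTE: sort_keys_by_complexity() is not shown in this section. A hedged sketch
# only: assuming "complexity" is approximated by the number of contigs assigned
# to a gene family, so that the largest PAGAN jobs are enqueued first (this
# ordering heuristic is an assumption, not taken from the source), it could be:
def sort_keys_by_complexity(genefamily_contig_map):
    # gene family ids ordered by how many contigs were assigned to them,
    # biggest families first
    return sorted(genefamily_contig_map,
                  key=lambda famid: len(genefamily_contig_map[famid]),
                  reverse=True)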
def stop(self):
    if self.q:
        self.q.stop()

    rm_f(self.cleanup_files)