class All_vs_all_search(object):
    """Run batched blastx jobs for a set of queries and record the first
    acceptable hit for each query.

    Results accumulate in ``self.gene_assignments``, which maps a query id to
    a ``(sseqid, strand)`` tuple, or to ``None`` when a successfully processed
    query had no hit that passed the filters.
    """

    def __init__(self, batch_size=100):
        """Initialise an idle search.

        batch_size -- number of queries bundled into a single BlastJob.
        """
        # Hit filters; overwritten by process() before any job runs.
        self.nucleotide = False
        self.min_hitidentity = None
        self.min_hitlength = None
        self.max_evalue = None

        self.batch_size = batch_size
        self.log = get_log()
        # Files removed by process() on completion and by stop().
        self.cleanup_files = []
        self.gene_assignments = {}
        # Guards the progress counter and gene_assignments in job_callback().
        self.lock = threading.Lock()
        self.q = None
        self.total_jobs = 0
        self.complete_jobs = 0

    def _batch(self, x):
        """Yield successive lists of at most ``self.batch_size`` items of x.

        Nothing is yielded for an empty iterable, and no empty trailing
        batch is produced when the item count is an exact multiple of the
        batch size.
        """
        chunk = []
        for item in x:
            chunk.append(item)
            if len(chunk) == self.batch_size:
                yield chunk
                chunk = []
        # A plain return ends the generator. Raising StopIteration inside a
        # generator body is turned into RuntimeError by PEP 479.
        if not chunk:
            return
        yield chunk

    def process(self, db, queries, nucleotide, min_hitidentity,
                min_hitlength, max_evalue):
        """Build the blast database, run every query batch and return the
        assignments.

        db              -- path passed to Blast.makedb and to every BlastJob.
        queries         -- sized iterable of query records (len() is taken).
        nucleotide      -- stored on the instance; not otherwise used here.
        min_hitidentity -- hits with a lower ``pident`` are rejected.
        min_hitlength   -- hits with a smaller ``length`` are rejected.
        max_evalue      -- hits with a larger ``evalue`` are rejected.

        Blocks until the work queue has drained, removes the registered
        cleanup files and returns ``self.gene_assignments``.
        """
        self.nucleotide = nucleotide
        self.min_hitidentity = min_hitidentity
        self.min_hitlength = min_hitlength
        self.max_evalue = max_evalue

        # we need to deal with the index files here because
        # all of the blastx jobs need them
        self.cleanup_files += [db + ext for ext in (".phr", ".pin", ".psq")]

        # creates db + {phr,pin,psq} in same dir as db
        self.log.info("creating blast db...")
        # XXX THIS IS ALWAYS PROTEIN, BECAUSE WE WANT TO RUN BLASTX
        Blast.makedb(db)

        # queue up the jobs
        self.log.info("starting local alignments...")
        self.q = WorkQueue()

        self.total_jobs = len(queries)
        # Start one batch "behind" so that the initial _progress() call
        # prints a count of zero.
        self.complete_jobs = -self.batch_size
        self._progress()

        for query in self._batch(queries):
            self.q.enqueue(BlastJob(self.job_callback, db, query, "blastx"))

        self.log.debug("waiting for job queue to drain...")
        self.q.join()

        rm_f(self.cleanup_files)

        return self.gene_assignments

    def stop(self):
        """Stop the work queue (if one was created) and remove the
        registered cleanup files."""
        if self.q:
            self.q.stop()

        rm_f(self.cleanup_files)

    def get_intermediate_results(self):
        """Return the assignments dictionary as collected so far."""
        return self.gene_assignments

    def _progress(self):
        """Advance the completed counter by one batch and redraw the
        progress line on stderr.

        The counter is clamped to ``total_jobs``; a newline is written once
        the counter equals the total.
        """
        self.complete_jobs += self.batch_size

        if self.complete_jobs > self.total_jobs:
            self.complete_jobs = self.total_jobs

        sys.stderr.write("\rProgress: %d / %d blastx alignments "
                         % (self.complete_jobs, self.total_jobs))

        if self.complete_jobs == self.total_jobs:
            sys.stderr.write("\n")

        sys.stderr.flush()

    def job_callback(self, job):
        """Record the results of one finished BlastJob.

        For a successful job, the first hit per query id that passes the
        e-value, identity and length filters is stored as
        ``(sseqid, strand)``; strand is "+" when ``qstart < qend`` and "-"
        otherwise. Queries of the job that end up without an assignment are
        stored as ``None``. Progress is advanced whether or not the job
        succeeded.
        """
        self.log.debug("%d blast results returned" % len(job.results))

        # The context manager releases the lock even if processing raises,
        # so one failing callback cannot block all later ones.
        with self.lock:
            self._progress()

            if not job.success():
                return

            for br in job.results:
                if (
                    (br.qseqid in self.gene_assignments)
                    or (self.max_evalue < br.evalue)
                    or (self.min_hitidentity > br.pident)
                    or (self.min_hitlength > br.length)
                ):
                    continue

                strand = "+" if br.qstart < br.qend else "-"
                self.gene_assignments[br.qseqid] = (br.sseqid, strand)

            for q in job.input:
                if q.id not in self.gene_assignments:
                    self.gene_assignments[q.id] = None