def _main(self):
    """Exercise SeqView sub-views, pickling and parallel record access.

    Loads ``self.large_seqdb`` into a SeqView, prints the keys and one
    record of a five-key sub-view before and after a pickle round trip,
    and then, for growing slices of the view, times a sequential pass
    against a ``parallelize_work`` pass and asserts that both produce
    the same list of record lengths.
    """
    from BioUtils.SeqUtils import SeqView
    from BioUtils.Tools.Multiprocessing import parallelize_work
    import cPickle as pickle

    with simple_timeit('load'):
        full_view = SeqView()
        full_view.load([self.large_seqdb])

    # a small sub-view built from the first five keys
    head_view = full_view.subview(full_view.keys()[:5])
    print(head_view.keys())
    print(head_view[3])
    print('')

    # the sub-view must survive a pickle round trip
    blob = pickle.dumps(head_view, protocol=-1)
    restored_view = pickle.loads(blob)
    print(restored_view.keys())
    print(restored_view[3])
    print('')

    def worker(seq_id, db):
        return len(db[seq_id])

    for count in xrange(1000, len(full_view), 1000):
        chunk = full_view[0:count]
        with simple_timeit('sequential %d' % count):
            sequential = []
            for key in chunk.keys():
                sequential.append(len(chunk[key]))
        with simple_timeit('parallel %d' % count):
            # init_args hands every call site its own clone of the view
            parallel = parallelize_work(self.abort_event, 1, 1,
                                        worker, chunk.keys(), chunk,
                                        init_args=lambda db: (db.clone(),))
        assert sequential == parallel
        print('-' * 80)
    print('Done')
def _find_products_in_db(self, counter, seq_ids, PCR_Sim, P_Finder):
    """Search the templates named by *seq_ids* and feed results to PCR_Sim.

    Templates are looked up in ``self._seq_db`` and split by
    ``mp_better(len(template))``: those for which it is true are searched
    one by one, the rest are searched together as a single sub-view.
    Ids that resolve to ``None`` are reported and skipped.

    :param counter: work counter; split into sub-counters here
    :param seq_ids: ids of the templates to look up in ``self._seq_db``
    :param PCR_Sim: receives every result through ``add_mixture``
    :param P_Finder: passed through to ``_find_products_in_templates``
    :return: ``False`` if the run was aborted or the batch search gave
             no results; otherwise ``PCR_Sim.not_empty()``
    """
    lengths = {}
    batch_ids = []  # searched together in one call
    solo_ids = []   # searched one template per call
    with simple_timeit('PCR Simulation: sorting templates'):
        for seq_id in seq_ids:
            record = self._seq_db[seq_id]
            if record is None:
                print('Sequence %s not found. Something is not right with the sequence database.' % seq_id)
                continue
            size = len(record)
            lengths[seq_id] = size
            bucket = solo_ids if mp_better(size) else batch_ids
            bucket.append(seq_id)
            if self.aborted():
                return False
        # a batch of one is pointless: treat it as a solo template
        if len(batch_ids) == 1:
            solo_ids += batch_ids
            batch_ids = []

    # setup work counters
    with simple_timeit('PCR Simulation: counting work to be done'):
        if batch_ids and solo_ids:
            batch_total = sum(lengths[seq_id] for seq_id in batch_ids)
            solo_total = sum(lengths[seq_id] for seq_id in solo_ids)
            counter.set_subwork(2, (batch_total, solo_total))
            batch_counter, solo_counter = counter[0], counter[1]
        else:
            batch_counter = solo_counter = counter
        if solo_ids:
            solo_weights = [lengths[seq_id] for seq_id in solo_ids]
            solo_counter.set_subwork(len(solo_ids), solo_weights)
        if batch_ids:
            batch_counter.set_subwork(len(batch_ids))

    print('\nPCR Simulation: searching for annealing sites in provided sequences...')

    # batch search
    if batch_ids:
        batch_view = self._seq_db.subview(batch_ids)
        found = self._find_products_in_templates(batch_counter, batch_view, P_Finder)
        # NOTE(review): an empty batch result ends the whole search here,
        # while a None result for a solo template below is only skipped --
        # confirm this asymmetry is intended.
        if not found or self.aborted():
            return False
        for seq_id, mixture in found.iteritems():
            PCR_Sim.add_mixture(seq_id, mixture)

    # solo search, one template at a time
    for index, seq_id in enumerate(solo_ids):
        sub_counter = solo_counter[index]
        single = [self._seq_db[seq_id]]
        found = self._find_products_in_templates(sub_counter, single, P_Finder)
        if found is None:
            if self.aborted():
                return False
            continue
        PCR_Sim.add_mixture(*found.items()[0])
    return PCR_Sim.not_empty()
def _find_products_in_db(self, counter, seq_ids, PCR_Sim, P_Finder):
    """Search the templates named by *seq_ids* and feed results to PCR_Sim.

    Templates are looked up in ``self._seq_db`` and split by
    ``mp_better(len(template))``: those for which it is true are searched
    one by one, the rest are searched together as a single sub-view.
    Ids that resolve to ``None`` are reported and skipped.

    :param counter: work counter; split into sub-counters here
    :param seq_ids: ids of the templates to look up in ``self._seq_db``
    :param PCR_Sim: receives every result through ``add_mixture``
    :param P_Finder: passed through to ``_find_products_in_templates``
    :return: ``False`` if the run was aborted or the batch search gave
             no results; otherwise ``PCR_Sim.not_empty()``
    """
    lengths = {}
    batch_ids = []  # searched together in one call
    solo_ids = []   # searched one template per call
    with simple_timeit('PCR Simulation: sorting templates'):
        for seq_id in seq_ids:
            record = self._seq_db[seq_id]
            if record is None:
                print('Sequence %s not found. Something is not right with the sequence database.' % seq_id)
                continue
            size = len(record)
            lengths[seq_id] = size
            bucket = solo_ids if mp_better(size) else batch_ids
            bucket.append(seq_id)
            if self.aborted():
                return False
        # a batch of one is pointless: treat it as a solo template
        if len(batch_ids) == 1:
            solo_ids += batch_ids
            batch_ids = []

    # setup work counters
    with simple_timeit('PCR Simulation: counting work to be done'):
        if batch_ids and solo_ids:
            batch_total = sum(lengths[seq_id] for seq_id in batch_ids)
            solo_total = sum(lengths[seq_id] for seq_id in solo_ids)
            counter.set_subwork(2, (batch_total, solo_total))
            batch_counter, solo_counter = counter[0], counter[1]
        else:
            batch_counter = solo_counter = counter
        if solo_ids:
            solo_weights = [lengths[seq_id] for seq_id in solo_ids]
            solo_counter.set_subwork(len(solo_ids), solo_weights)
        if batch_ids:
            batch_counter.set_subwork(len(batch_ids))

    print('\nPCR Simulation: searching for annealing sites in provided sequences...')

    # batch search
    if batch_ids:
        batch_view = self._seq_db.subview(batch_ids)
        found = self._find_products_in_templates(batch_counter, batch_view, P_Finder)
        # NOTE(review): an empty batch result ends the whole search here,
        # while a None result for a solo template below is only skipped --
        # confirm this asymmetry is intended.
        if not found or self.aborted():
            return False
        for seq_id, mixture in found.iteritems():
            PCR_Sim.add_mixture(seq_id, mixture)

    # solo search, one template at a time
    for index, seq_id in enumerate(solo_ids):
        sub_counter = solo_counter[index]
        single = [self._seq_db[seq_id]]
        found = self._find_products_in_templates(sub_counter, single, P_Finder)
        if found is None:
            if self.aborted():
                return False
            continue
        PCR_Sim.add_mixture(*found.items()[0])
    return PCR_Sim.not_empty()
def _find_products(self, counter, PCR_Sim, P_Finder, seq_files, seq_ids):
    """Load templates from *seq_files* and search them for products.

    :param counter: work counter handed on to ``_find_products_in_db``
    :param PCR_Sim: handed on to ``_find_products_in_db``
    :param P_Finder: handed on to ``_find_products_in_db``
    :param seq_files: files to load through ``self._load_db``
    :param seq_ids: ids to process; when empty, every key of the loaded
                    database is used, otherwise each id is converted
                    with ``str``
    :return: ``False`` if nothing was loaded or the run was aborted;
             otherwise the result of ``_find_products_in_db``
    """
    with simple_timeit('PCR Simulation: loading templates'):
        loaded = seq_files and self._load_db(seq_files)
        if not loaded:
            print('No templates were loaded from: %s' % str(seq_files))
            return False
    if self.aborted():
        return False

    if seq_ids:
        wanted_ids = [str(sid) for sid in seq_ids]
    else:
        wanted_ids = self._seq_db.keys()
    print('Number of templates to process: %d' % len(wanted_ids))
    return self._find_products_in_db(counter, wanted_ids, PCR_Sim, P_Finder)
def _main(self):
    """Exercise SeqView sub-views, pickling and parallel record access.

    Loads ``self.large_seqdb`` into a SeqView, prints the keys and one
    record of a five-key sub-view before and after a pickle round trip,
    and then, for growing slices of the view, times a sequential pass
    against a ``parallelize_work`` pass and asserts that both produce
    the same list of record lengths.
    """
    from BioUtils.SeqUtils import SeqView
    from BioUtils.Tools.Multiprocessing import parallelize_work
    import cPickle as pickle

    with simple_timeit('load'):
        full_view = SeqView()
        full_view.load([self.large_seqdb])

    # a small sub-view built from the first five keys
    head_view = full_view.subview(full_view.keys()[:5])
    print(head_view.keys())
    print(head_view[3])
    print('')

    # the sub-view must survive a pickle round trip
    blob = pickle.dumps(head_view, protocol=-1)
    restored_view = pickle.loads(blob)
    print(restored_view.keys())
    print(restored_view[3])
    print('')

    def worker(seq_id, db):
        return len(db[seq_id])

    for count in xrange(1000, len(full_view), 1000):
        chunk = full_view[0:count]
        with simple_timeit('sequential %d' % count):
            sequential = []
            for key in chunk.keys():
                sequential.append(len(chunk[key]))
        with simple_timeit('parallel %d' % count):
            # init_args hands every call site its own clone of the view
            parallel = parallelize_work(self.abort_event, 1, 1,
                                        worker, chunk.keys(), chunk,
                                        init_args=lambda db: (db.clone(),))
        assert sequential == parallel
        print('-' * 80)
    print('Done')