def main(qbed, sbed, pairs_file, qpad, spad, unmasked_fasta, pair_fmt, blast_path, mask='F', ncpu=8):
    """main runner for finding cnss

    For each (query, subject) gene pair read from `pairs_file`, runs a
    padded bl2seq blastn comparison in a worker pool and writes the parsed
    CNS hits as CSV to stdout.

    qbed/sbed     : bed-like objects (have .filename and are passed to the
                    get_masked_fastas / get_pair / parse_blast helpers)
    pairs_file    : file of gene pairs, parsed according to `pair_fmt`
    qpad/spad     : bp of padding around each feature (used by get_cmd and
                    parse_blast, both defined elsewhere in this module)
    unmasked_fasta: forwarded to parse_blast
    blast_path    : path to the bl2seq executable
    mask          : value for blast's -F (dust/seg filtering) flag
    ncpu          : worker-pool size and per-batch pair count
    """
    pool = Pool(ncpu)
    # Shell command template; the %(...)s / %(...)d slots are filled per pair.
    # grep strips comment ('#') and diagnostic lines from the -D 1 tabular output.
    bl2seq = "%s " % blast_path + \
            "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
            " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
            -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
            | grep -v 'WARNING' | grep -v 'ERROR' "
    fcnss = sys.stdout
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send,bitscore...]"

    qfastas = get_masked_fastas(qbed)
    # Reuse the query fastas when query and subject are the same file.
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    pairs = [True]
    _get_pair_gen = get_pair(pairs_file, pair_fmt, qbed, sbed)

    # need this for parallization stuff.
    # Wraps the pair generator so exhaustion yields None instead of raising,
    # letting us build fixed-size batches below.
    def get_pair_gen():
        try:
            return _get_pair_gen.next()
        except StopIteration:
            return None

    # Loop until a whole batch comes back as Nones (generator exhausted).
    while any(pairs):
        # this helps in parallelizing: take up to ncpu pairs per batch.
        pairs = [get_pair_gen() for i in range(ncpu)]
        # Broadcast the per-call constants so they can be zipped with the
        # pairs in the map(get_cmd, ...) call below.
        spad_map = [spad] * len(pairs)
        qpad_map = [qpad] * len(pairs)
        sfastas_map = [sfastas] * len(pairs)
        qfastas_map = [qfastas] * len(pairs)
        bl2seq_map = [bl2seq] * len(pairs)
        ####################################
        # get_cmd (defined elsewhere in this module) returns
        # (cmd, qfeat, sfeat) or a falsy value that is filtered out.
        cmds = [c for c in map(get_cmd, [l for l in pairs if l], bl2seq_map,
                               qfastas_map, sfastas_map, qpad_map, spad_map) if c]
        # Run the shell pipelines in parallel; results align with cmds.
        results = (r for r in pool.map(commands.getoutput, [c[0] for c in cmds]))

        for res, (cmd, qfeat, sfeat) in zip(results, cmds):
            if not res.strip():
                continue
            # Trailing comma: the "(%i)" count below finishes this stderr line.
            print >>sys.stderr, "%s %s" % (qfeat["accn"], sfeat['accn']),
            # +1 when strands match, -1 otherwise.
            orient = qfeat['strand'] == sfeat['strand'] and 1 or -1
            cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad, spad, unmasked_fasta)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            qname, sname = qfeat['accn'], sfeat['accn']
            # One CSV row per pair; each CNS is flattened and comma-joined.
            print >> fcnss, "%s,%s,%s,%s,%s" % (qfeat['seqid'], qname, sfeat['seqid'], sname,
                             ",".join(map(lambda l: ",".join(map(str, l)), cnss)))
    return None
def run(): import random from processing import Pool p=Pool(2) print "ahora vemos si escala" numero = 5000000 a = [random.randint(0, 100) for a in xrange(0, numero)] print "Ya tenemos los numeros" lista = [a[:numero/2] , a[numero/2:]] print "Lista bisectada" result=p.mapAsync(my_sort, lista) print "threads lanzados" lista1, lista2 = result.get() print "Uniendo listas" b = my_merge(lista1, lista2) # b = my_sort(a) print "largo", len(b), "llamadas a my_sort"
def main(cns_file, qdups_path, sdups_path, pair_file, fmt, qbed, sbed, qpad, spad, blast_path, unmasked_fasta, mask='F', ncpu=8):
    """Re-score local-duplicate gene pairs by CNS count and rewrite the
    pair/cns/localdup files so each duplicate group keeps its best
    representative.

    NOTE(review): depends on many module-level helpers (parse_dups,
    get_pairs, skip_pair, get_cmd, write_new_dups, best_repeats, ...) and
    the final pairs_to_qa call reads a global `options` object -- confirm
    `options` is initialized before this runs.
    """
    pool = Pool(ncpu)
    # Per-pair bl2seq blastn command template; grep strips comment and
    # diagnostic lines from the -D 1 tabular output.
    bl2seq = "%s " % blast_path + \
            "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
            " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
            -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
            | grep -v 'WARNING' | grep -v 'ERROR' "
    qfastas = get_masked_fastas(qbed)
    # Reuse query fastas when query and subject beds are the same file.
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas
    ################# file paths #####################
    # Derived sibling-file paths; the *nolocaldups* two appear unused below
    # -- presumably kept for documentation of the naming scheme (TODO confirm).
    qnolocaldups_path = qbed.path.split(".")[0] + ".nolocaldups.bed"
    snolocaldups_path = sbed.path.split(".")[0] + ".nolocaldups.bed"
    qlocaldups_path = qbed.path.split(".")[0] + ".localdups"
    slocaldups_path = sbed.path.split(".")[0] + ".localdups"
    # Work on copies so the originals survive until the rewrite succeeds.
    npair_file, nqlocaldups, nslocaldups, ncns_file = map(
        make_copy_of_file, [pair_file, qlocaldups_path, slocaldups_path, cns_file])
    ##########################################
    qdups = parse_dups(qdups_path)
    sdups = parse_dups(sdups_path)
    dups, rdups = get_pairs(pair_file, fmt, qdups, sdups)
    print len(dups), len(rdups)
    ldups = get_large_dups(dups, qdups, sdups)
    # parent accn -> {(qparent, sparent): cnss_size} for repeated parents;
    # resolved after the main loop via best_repeats().
    rdups_dic = defaultdict(dict)
    rdups_both = [(qparent, sparent) for qparent, sparent in dups
                  if qparent in rdups and sparent in rdups]
    for (qparent, sparent) in dups:
        if skip_pair(qparent, sparent, rdups, rdups_both, ldups):
            continue
        # Accumulates (-cns_count, qstart, sstart, qaccn, saccn, cnss_fmt)
        # tuples; negated count makes sort() put the largest CNS count first.
        cnss_size = []
        qfeat_dups = get_all_dups(qdups, qparent)
        sfeat_dups = get_all_dups(sdups, sparent)
        pairs = [True]
        _get_dups_gen = get_dups(qfeat_dups, sfeat_dups, qbed, sbed)

        # Wraps the generator so exhaustion yields None instead of raising.
        def get_dups_gen():
            try:
                return _get_dups_gen.next()
            except StopIteration:
                return None

        while any(pairs):
            cnss_dups = []  # NOTE(review): assigned but never used below
            pairs = [get_dups_gen() for i in range(ncpu)]
            ###this is for parellization#########
            # Broadcast per-call constants to zip with the pairs in map().
            spad_map = [spad] * len(pairs)
            qpad_map = [qpad] * len(pairs)
            sfastas_map = [sfastas] * len(pairs)
            qfastas_map = [qfastas] * len(pairs)
            bl2seq_map = [bl2seq] * len(pairs)
            ###################################
            cmds = [c for c in map(get_cmd, [l for l in pairs if l], bl2seq_map,
                                   qfastas_map, sfastas_map, qpad_map, spad_map) if c]
            # Run the blast pipelines in parallel; results align with cmds.
            results = (r for r in pool.map(commands.getoutput, [c[0] for c in cmds]))
            for res, (cmd, qfeat, sfeat) in zip(results, cmds):
                # +1 when strands match, -1 otherwise.
                orient = qfeat['strand'] == sfeat['strand'] and 1 or -1
                if not res.strip():
                    cnss = []
                else:
                    cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed,
                                       qpad, spad, unmasked_fasta)
                print >> sys.stderr, "(%i)" % len(cnss)
                cnss_fmt = ",".join(map(lambda l: ",".join(map(str, l)), cnss))
                cnss_size.append((len(cnss) * -1, qfeat["start"], sfeat["start"],
                                  qfeat["accn"], sfeat["accn"], cnss_fmt))
            # Keep only the last slot: when the generator is exhausted it is
            # None, which ends the while loop on the next any() check.
            pairs = [pairs[-1]]
        ######################################################################
        if qparent in rdups:
            # Repeated query parent: defer resolution; log re-seen pairs.
            if (qparent, sparent) in rdups_dic[qparent].keys():
                logging.info((qparent, sparent))
            rdups_dic[qparent].update({(qparent, sparent): cnss_size})
        elif sparent in rdups:
            # Repeated subject parent: same deferral keyed by sparent.
            if (qparent, sparent) in rdups_dic[sparent].keys():
                logging.info((qparent, sparent))
            rdups_dic[sparent].update({(qparent, sparent): cnss_size})
        else:
            # Unique parents: pick the dup pair with the most CNSs
            # (smallest tuple, since counts were negated) and rewrite now.
            cnss_size.sort()
            cns_number, qfeat_start, sfeat_start, qaccn, saccn, largest_cnss = cnss_size[0]
            qfeat = qbed.accn(qaccn)
            sfeat = sbed.accn(saccn)
            print >> sys.stderr, "FINAL: {0},{1},{2}".format(qaccn, saccn, cns_number)
            write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                           cnss_size, qparent, sparent, qfeat, sfeat, qdups, sdups)
    # Resolve the deferred repeated-parent groups.
    best_reps = best_repeats(rdups_dic)
    for dparents in best_reps.keys():
        qparent, sparent = dparents
        ### one or list? cnss[0]?
        cns_number, qfeat_start, sfeat_start, qaccn, saccn, largest_cnss = best_reps[dparents]
        qfeat = qbed.accn(qaccn)
        sfeat = sbed.accn(saccn)
        write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                       [best_reps[dparents]], qparent, sparent, qfeat, sfeat, qdups, sdups)
    # Regenerate the .nolocaldups views and re-run the pair QA step.
    write_nolocaldups(qbed.path, nqlocaldups,
                      "{0}.nolocaldups.bed.local".format(qbed.path.split(".")[0]))
    write_nolocaldups(sbed.path, nslocaldups,
                      "{0}.nolocaldups.bed.local".format(sbed.path.split(".")[0]))
    pairs_to_qa(npair_file, 'pair',
                "{0}.nolocaldups.bed.local".format(qbed.path.split(".")[0]),
                "{0}.nolocaldups.bed.local".format(sbed.path.split(".")[0]),
                "{0}.raw.filtered.local".format(options.pairs.split(".")[0]))
def test():
    """Exercise the processing.Pool API end to end: apply/map/imap variants,
    benchmarks vs builtin map, error propagation, timeouts, callbacks, and
    the close()/terminate()/garbage-collection shutdown paths.

    NOTE(review): relies on module-level task functions (mul, plus, f, pow3,
    noop, calculate, calculatestar) and pokes Pool internals (_cache, _pool).
    """
    print 'cpuCount() = %d\n' % cpuCount()

    #
    # Create pool
    #
    PROCESSES = 4
    print 'Creating pool with %d processes\n' % PROCESSES
    pool = Pool(PROCESSES)

    #
    # Tests
    #
    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]

    results = [pool.applyAsync(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imapUnordered(calculatestar, TASKS)

    print 'Ordered results using pool.applyAsync():'
    for r in results:
        print '\t', r.get()
    print

    print 'Ordered results using pool.imap():'
    for x in imap_it:
        print '\t', x
    print

    print 'Unordered results using pool.imapUnordered():'
    for x in imap_unordered_it:
        print '\t', x
    print

    print 'Ordered results using pool.map() --- will block till complete:'
    for x in pool.map(calculatestar, TASKS):
        print '\t', x
    print

    #
    # Simple benchmarks
    #
    N = 100000
    print 'def pow3(x): return x**3'

    t = time.time()
    A = map(pow3, xrange(N))
    print '\tmap(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t)

    t = time.time()
    B = pool.map(pow3, xrange(N))
    print '\tpool.map(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t)

    t = time.time()
    C = list(pool.imap(pow3, xrange(N), chunksize=N // 8))
    print '\tlist(pool.imap(pow3, xrange(%d), chunksize=%d)):\n\t\t%s' \
          ' seconds' % (N, N // 8, time.time() - t)

    # All three strategies must agree on the result.
    assert A == B == C, (len(A), len(B), len(C))
    print

    L = [None] * 1000000
    print 'def noop(x): pass'
    print 'L = [None] * 1000000'

    t = time.time()
    A = map(noop, L)
    print '\tmap(noop, L):\n\t\t%s seconds' % \
          (time.time() - t)

    t = time.time()
    B = pool.map(noop, L)
    print '\tpool.map(noop, L):\n\t\t%s seconds' % \
          (time.time() - t)

    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L) // 8))
    print '\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' % \
          (len(L) // 8, time.time() - t)

    assert A == B == C, (len(A), len(B), len(C))
    print
    del A, B, C, L

    #
    # Test error handling
    #
    print 'Testing error handling:'

    # f raises ZeroDivisionError (presumably at input 5 -- see the imap
    # iteration below); it must propagate through each call style.
    try:
        print pool.apply(f, (5, ))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from pool.apply()'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    try:
        print pool.map(f, range(10))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from pool.map()'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    try:
        print list(pool.imap(f, range(10)))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from list(pool.imap())'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    # With the iterator form, only the failing item (i == 5) raises; the
    # other items still come through.
    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            if i == 5:
                raise AssertionError, 'expected ZeroDivisionError'
    assert i == 9
    print '\tGot ZeroDivisionError as expected from IMapIterator.next()'
    print

    #
    # Testing timeouts
    #
    print 'Testing ApplyResult.get() with timeout:',
    res = pool.applyAsync(calculate, TASKS[0])
    while 1:
        sys.stdout.flush()
        try:
            # Poll with a 20 ms timeout; print a dot per timeout.
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            sys.stdout.write('.')
    print
    print

    print 'Testing IMapIterator.next() with timeout:',
    it = pool.imap(calculatestar, TASKS)
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print
    print

    #
    # Testing callback
    #
    print 'Testing callback:'
    A = []
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

    # mul(7, 8) -> 56 via append, then pow3 over 0..9 via extend.
    r = pool.applyAsync(mul, (7, 8), callback=A.append)
    r.wait()

    r = pool.mapAsync(pow3, range(10), callback=A.extend)
    r.wait()

    if A == B:
        print '\tcallbacks succeeded\n'
    else:
        print '\t*** callbacks failed\n\t\t%s != %s\n' % (A, B)

    #
    # Check there are no outstanding tasks
    #
    assert not pool._cache, 'cache = %r' % pool._cache

    #
    # Check close() methods
    #
    print 'Testing close():'

    for worker in pool._pool:
        assert worker.isAlive()

    # close() lets the queued sleep finish before workers exit.
    result = pool.applyAsync(time.sleep, [0.5])
    pool.close()
    pool.join()

    assert result.get() is None

    for worker in pool._pool:
        assert not worker.isAlive()

    print '\tclose() succeeded\n'

    #
    # Check terminate() method
    #
    print 'Testing terminate():'

    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    # Queue long sleeps, then terminate() must kill them immediately.
    results = [pool.applyAsync(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()

    for worker in pool._pool:
        assert not worker.isAlive()

    print '\tterminate() succeeded\n'

    #
    # Check garbage collection
    #
    print 'Testing garbage collection:'

    pool = Pool(2)
    processes = pool._pool
    ignore = pool.apply(pow3, [2])
    results = [pool.applyAsync(time.sleep, [10]) for i in range(10)]
    # Dropping the last references should shut the pool down.
    del results, pool

    time.sleep(0.2)

    for worker in processes:
        assert not worker.isAlive()

    print '\tgarbage collection succeeded\n'
import os, shutil, subprocess, zipfile, random from processing import Pool from zipfile import ZipFile from config import * def unzip(params): base, ext = params zf = zipfile.ZipFile(rawDir + '/' + base + '.' + ext, 'r') for name in zf.namelist(): if name != 'PIC/': print name zf.extract(name, extractDir) files = [] for f in os.listdir(rawDir): if (f.endswith('exe')): base, ext = f.split('.') if (len(base) == 3): files.append((base, ext)) p = Pool() p.map(unzip, files)
def main(qbed, sbed, pairs_file, pair_fmt, mask='F', ncpu=8):
    """main runner for finding cnss

    Reads (subject, query) feature pairs from `pairs_file`, blasts the
    query's flanking region against the unpadded subject region with
    bl2seq in a worker pool, and prints parsed CNS rows as CSV to stdout.

    qbed/sbed : bed-like objects (have .filename; consumed by
                get_masked_fastas / get_pair / parse_blast helpers)
    pair_fmt  : format name forwarded to get_pair
    mask      : value for blast's -F (dust/seg filtering) flag
    ncpu      : worker-pool size and per-batch pair count
    """
    # BUG FIX: was Pool(options.ncpu), which ignored the ncpu parameter and
    # depended on a global `options` object being in scope.
    pool = Pool(ncpu)
    # Shell command template; %(...)s / %(...)d slots filled per pair, grep
    # strips comment/diagnostic lines from the -D 1 tabular output.
    bl2seq = "~/src/blast-2.2.25/bin/bl2seq " \
            "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
            " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
            -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
            | grep -v 'WARNING' | grep -v 'ERROR' "
    fcnss = sys.stdout
    print >> fcnss, "#qaccn,qseqid,saccn,[sleft_gene,sright_gene],sseqid,res"#"qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send...]"

    qfastas = get_masked_fastas(qbed)
    # Reuse the query fastas when query and subject are the same file.
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    pairs = [True]
    # NOTE(review): beds are passed (sbed, qbed) and each pair is unpacked
    # below as (sfeat, qfeat) -- subject first, unlike the sibling runners.
    _get_pair_gen = get_pair(pairs_file, pair_fmt, sbed, qbed)

    # need this for parallization stuff.
    # Wraps the pair generator so exhaustion yields None instead of raising.
    def get_pair_gen():
        try:
            return _get_pair_gen.next()
        except StopIteration:
            return None

    # Loop until a whole batch comes back as Nones (generator exhausted).
    while any(pairs):
        # this helps in parallelizing: take up to ncpu pairs per batch.
        pairs = [get_pair_gen() for i in range(ncpu)]

        def get_cmd(pair):
            """Build the (cmd, qfeat, sfeat) tuple for one pair, or None."""
            if pair is None:
                return None
            sfeat, qfeat = pair
            #if qfeat['accn'] != "Bradi4g01820": return None
            #print >>sys.stderr, qfeat, sfeat
            qfasta = qfastas[qfeat['seqid']]
            sfasta = sfastas[sfeat['seqid']]

            sstart, sstop = sfeat['start'], sfeat['end'] #region gets no padding
            qstart, qstop = grab_flanking_region(qfeat, sfeat) # sfeat here is the final table with sfeat info from qfeat dict

            m = sstop - sstart
            n = qstop - qstart
            # if (m*n) >= 812045000: # if the database and query is large keep e_value at 2.11 else change it to something smaller
            #     e_value = 2.11
            # else:
            e_value = m*n*(2**(-28.51974)) # bit score above 15/15 noise
            assert e_value > 0

            cmd = bl2seq % dict(qfasta=qfasta, sfasta=sfasta, qstart=qstart,
                                sstart=sstart, qstop=qstop, sstop=sstop,
                                e_value=e_value)
            return cmd, qfeat, sfeat

        cmds = [c for c in map(get_cmd, [l for l in pairs if l]) if c]
        # Run the blast pipelines in parallel; results align with cmds.
        results = (r for r in pool.map(commands.getoutput, [c[0] for c in cmds]))
        #results = (r for r in map(commands.getoutput, [c[0] for c in cmds]))

        for res, (cmd, qfeat, sfeat) in zip(results, cmds):
            if not res.strip():
                continue
            # Trailing comma: "(%i)" below completes this stderr line.
            print >>sys.stderr, "%s %s" % (qfeat["accn"], sfeat["accn"]),
            # +1 when strands match, -1 otherwise.
            orient = qfeat['strand'] == sfeat['strand'] and 1 or -1
            cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            qname, sname = qfeat['accn'], sfeat['accn']
            #urls = url_params(cnss, qfeat['seqid'], sfeat['seqid'], qfeat['ORG2_qfeat'])
            print >> fcnss, "%s,%s,%s,[%s,%s],%s,%s" % (qname, qfeat['seqid'], sname,
                             sfeat['qleft_gene'], sfeat['qright_gene'], sfeat['seqid'],
                             ",".join(map(lambda l: ",".join(map(str,l)), cnss)))
    return None
def main(qbed, sbed, cns_bed, pairs_file, qpad, spad, pair_fmt, blast_path, mask='F', ncpu=8):
    """main runner for finding cnss

    Variant that pairs features from `cns_bed` against `sbed` with a fixed
    e-value cutoff of 30 (the per-pair e-value computation is commented
    out below). Writes parsed CNS rows as CSV to stdout.

    qbed/sbed/cns_bed : bed-like objects (have .filename; consumed by the
                        get_masked_fastas / get_pair / parse_blast helpers)
    qpad/spad         : bp of padding around each feature
    blast_path        : path to the bl2seq executable
    mask              : value for blast's -F (dust/seg filtering) flag
    ncpu              : worker-pool size and per-batch pair count
    """
    pool = Pool(ncpu)
    # Shell command template; %(...)s / %(...)d slots filled per pair, grep
    # strips comment/diagnostic lines from the -D 1 tabular output.
    bl2seq = "%s " % blast_path + \
            "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
            " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
            -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
            | grep -v 'WARNING' | grep -v 'ERROR' "
    fcnss = sys.stdout
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send,evalue...]"

    qfastas = get_masked_fastas(qbed)
    # Reuse the query fastas when query and subject are the same file.
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    pairs = [True]
    # NOTE(review): pairs are generated from cns_bed (not qbed) vs sbed here.
    _get_pair_gen = get_pair(pairs_file, pair_fmt, cns_bed, sbed)

    # need this for parallization stuff.
    # Wraps the pair generator so exhaustion yields None instead of raising.
    def get_pair_gen():
        try:
            return _get_pair_gen.next()
        except StopIteration:
            return None

    # Loop until a whole batch comes back as Nones (generator exhausted).
    while any(pairs):
        # this helps in parallelizing: take up to ncpu pairs per batch.
        pairs = [get_pair_gen() for i in range(ncpu)]

        def get_cmd(pair):
            # Build the (cmd, qfeat, sfeat) tuple for one pair, or None.
            if pair is None:
                return None
            qfeat, sfeat = pair
            #if qfeat['accn'] != "Bradi4g01820": return None
            #print >>sys.stderr, qfeat, sfeat
            qfasta = qfastas[qfeat['seqid']]
            sfasta = sfastas[sfeat['seqid']]

            # Pad each feature, clamped at position 1.
            qstart, qstop = max(qfeat['start'] - qpad, 1), qfeat['end'] + qpad
            sstart, sstop = max(sfeat['start'] - spad, 1), sfeat['end'] + spad

            # Sanity-check the padded span (unless it was clamped at 1).
            assert qstop - qstart > 2 * qpad or qstart == 1, (qstop, qstart)
            assert sstop - sstart > 2 * spad or sstart == 1, (sstop, sstart)

            #m = qstop - qstart
            #n = sstop - sstart
            #e_value = m*n*(2**(-28.51974)) # bit score above 15/15 noise
            #assert e_value > 0

            # Fixed e-value cutoff of 30 (dynamic computation commented out).
            cmd = bl2seq % dict(qfasta=qfasta, sfasta=sfasta, qstart=qstart,
                                sstart=sstart, qstop=qstop, sstop=sstop,
                                e_value=30)
            return cmd, qfeat, sfeat

        cmds = [c for c in map(get_cmd, [l for l in pairs if l]) if c]
        # Run the blast pipelines in parallel; results align with cmds.
        results = (r for r in pool.map(commands.getoutput, [c[0] for c in cmds]))
        #results = (r for r in map(commands.getoutput, [c[0] for c in cmds]))

        for res, (cmd, qfeat, sfeat) in zip(results, cmds):
            if not res.strip():
                continue
            # Trailing comma: "(%i)" below completes this stderr line.
            print >>sys.stderr, "%s %s" % (qfeat["accn"], sfeat['accn']),
            # +1 when strands match, -1 otherwise.
            orient = qfeat['strand'] == sfeat['strand'] and 1 or -1
            cnss = parse_blast(res, orient, qfeat, sfeat, cns_bed, sbed, qpad, spad)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            qname, sname = qfeat['accn'], sfeat['accn']
            # One CSV row per pair; each CNS is flattened and comma-joined.
            print >> fcnss, "%s,%s,%s,%s,%s" % (qfeat['seqid'], qname, sfeat['seqid'], sname,
                             ",".join(map(lambda l: ",".join(map(str,l)), cnss)))
    return None