Exemplo n.º 1
0
def main(qbed, sbed, pairs_file, qpad, spad, unmasked_fasta, pair_fmt, blast_path, mask='F', ncpu=8):
    """Run bl2seq on every gene pair and print the CNSs found as CSV on stdout.

    Pairs are pulled from ``get_pair`` in batches of ``ncpu``.  One bl2seq
    command line is built per pair with ``get_cmd`` and each batch is executed
    in parallel on a worker pool.  Progress ("qaccn saccn (count)") is written
    to stderr; one CSV line per pair with at least one CNS goes to stdout.

    Parameters:
        qbed, sbed: query / subject bed objects; ``.filename`` is compared to
            decide whether the masked fastas can be shared.
        pairs_file, pair_fmt: forwarded to ``get_pair``.
        qpad, spad: padding values forwarded to ``get_cmd`` and ``parse_blast``.
        unmasked_fasta: forwarded to ``parse_blast``.
        blast_path: path of the bl2seq executable.
        mask: value for bl2seq's ``-F`` option.
        ncpu: number of pool workers and number of pairs per batch.

    Returns:
        None.
    """
    pool = Pool(ncpu)

    # The %(name)s placeholders are left in place here and filled in per pair
    # by get_cmd.
    bl2seq = "%s " % blast_path + \
            "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
            " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
            -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
            | grep -v 'WARNING' | grep -v 'ERROR' "

    fcnss = sys.stdout
    # The header has to be an argument of the print statement itself; when it
    # sat on its own line it was a no-op expression and never reached stdout.
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send,bitscore...]"

    qfastas = get_masked_fastas(qbed)
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    pair_gen = get_pair(pairs_file, pair_fmt, qbed, sbed)

    def next_pair():
        # None marks an exhausted generator, so the final batch is simply
        # padded with None instead of raising StopIteration.
        try:
            return pair_gen.next()
        except StopIteration:
            return None

    try:
        pairs = [True]
        # Stop once a whole batch comes back empty (generator exhausted).
        while any(pairs):
            pairs = [next_pair() for _ in range(ncpu)]

            # Build commands only for real pairs.  The previous multi-list
            # map() depended on Python 2 padding the shorter list with None.
            cmds = [get_cmd(pair, bl2seq, qfastas, sfastas, qpad, spad)
                    for pair in pairs if pair]
            cmds = [c for c in cmds if c]
            results = pool.map(commands.getoutput, [c[0] for c in cmds])

            for res, (cmd, qfeat, sfeat) in zip(results, cmds):
                if not res.strip():
                    continue
                print >>sys.stderr, "%s %s" % (qfeat["accn"], sfeat['accn']),
                orient = 1 if qfeat['strand'] == sfeat['strand'] else -1
                cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad, spad, unmasked_fasta)
                print >>sys.stderr, "(%i)" % len(cnss)
                if len(cnss) == 0:
                    continue

                qname, sname = qfeat['accn'], sfeat['accn']
                print >> fcnss, "%s,%s,%s,%s,%s" % (qfeat['seqid'], qname, sfeat['seqid'], sname,
                                 ",".join(map(lambda l: ",".join(map(str, l)), cnss)))
    finally:
        # Release the worker processes even when a batch fails.
        pool.close()
        pool.join()

    return None
Exemplo n.º 2
0
def run():    
    import random
    from processing import Pool

    p=Pool(2)

    print "ahora vemos si escala"
 
    numero = 5000000
    a = [random.randint(0, 100) for a in xrange(0, numero)]
    
    print "Ya tenemos los numeros"
    lista = [a[:numero/2] , a[numero/2:]]
    print "Lista bisectada"
    result=p.mapAsync(my_sort, lista)
    print "threads lanzados"
    lista1, lista2 = result.get()
    print "Uniendo listas"
    b = my_merge(lista1, lista2)
   # b = my_sort(a)
    print "largo", len(b), "llamadas a my_sort"
Exemplo n.º 3
0
def main(cns_file,
         qdups_path,
         sdups_path,
         pair_file,
         fmt,
         qbed,
         sbed,
         qpad,
         spad,
         blast_path,
         unmasked_fasta,
         mask='F',
         ncpu=8):
    """Choose a representative for each pair of local-duplicate groups by CNS count.

    For every (qparent, sparent) pair returned by ``get_pairs`` the feature
    combinations produced by ``get_dups`` are blasted with bl2seq in batches
    of ``ncpu``.  Each combination is recorded as a tuple
    ``(-number_of_cnss, qstart, sstart, qaccn, saccn, formatted_cnss)``.

    Pairs with a parent in ``rdups`` are collected in ``rdups_dic`` and
    resolved afterwards through ``best_repeats``; all other pairs are sorted
    and written straight away with ``write_new_dups``.  Finally
    ``write_nolocaldups`` and ``pairs_to_qa`` are called to produce the
    ``*.nolocaldups.bed.local`` and ``*.raw.filtered.local`` outputs.

    Parameters:
        cns_file, pair_file: files that are copied with ``make_copy_of_file``;
            ``pair_file`` is also read by ``get_pairs``.
        qdups_path, sdups_path: inputs for ``parse_dups``.
        fmt: pair format forwarded to ``get_pairs``.
        qbed, sbed: query / subject bed objects (``.path``, ``.filename`` and
            ``.accn()`` are used here).
        qpad, spad: padding forwarded to ``get_cmd`` and ``parse_blast``.
        blast_path: path of the bl2seq executable.
        unmasked_fasta: forwarded to ``parse_blast``.
        mask: value for bl2seq's ``-F`` option.
        ncpu: number of pool workers and batch size.

    Returns:
        None (there is no return statement).
    """
    pool = Pool(ncpu)
    # Command template; the %(name)s placeholders stay unformatted here and
    # are expected to be filled in per pair by get_cmd.
    bl2seq = "%s " % blast_path + \
            "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
            " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
             -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
             | grep -v 'WARNING' | grep -v 'ERROR' "

    # Reuse the query fastas when query and subject are the same bed file.
    qfastas = get_masked_fastas(qbed)
    sfastas = get_masked_fastas(
        sbed) if qbed.filename != sbed.filename else qfastas

    ################# file paths #####################
    # Paths are derived from the text before the first "." of each bed path.
    # NOTE(review): qnolocaldups_path and snolocaldups_path are not used again
    # in this function.
    qnolocaldups_path = qbed.path.split(".")[0] + ".nolocaldups.bed"
    snolocaldups_path = sbed.path.split(".")[0] + ".nolocaldups.bed"
    qlocaldups_path = qbed.path.split(".")[0] + ".localdups"
    slocaldups_path = sbed.path.split(".")[0] + ".localdups"
    # The n* names are whatever make_copy_of_file returns for each input;
    # they are what gets passed to the writers below.
    npair_file, nqlocaldups, nslocaldups, ncns_file = map(
        make_copy_of_file,
        [pair_file, qlocaldups_path, slocaldups_path, cns_file])
    ##########################################

    qdups = parse_dups(qdups_path)
    sdups = parse_dups(sdups_path)
    dups, rdups = get_pairs(pair_file, fmt, qdups, sdups)
    # Diagnostic counts; note this goes to stdout, not stderr.
    print len(dups), len(rdups)
    ldups = get_large_dups(dups, qdups, sdups)

    # rdups_dic maps a parent found in rdups to {(qparent, sparent): cnss_size}.
    rdups_dic = defaultdict(dict)
    # Pairs in which both parents appear in rdups.
    rdups_both = [(qparent, sparent) for qparent, sparent in dups
                  if qparent in rdups and sparent in rdups]
    for (qparent, sparent) in dups:
        if skip_pair(qparent, sparent, rdups, rdups_both, ldups): continue
        # One tuple per blasted combination of this parent pair.
        cnss_size = []
        qfeat_dups = get_all_dups(qdups, qparent)
        sfeat_dups = get_all_dups(sdups, sparent)
        # Sentinel so the while loop below runs at least once.
        pairs = [True]
        _get_dups_gen = get_dups(qfeat_dups, sfeat_dups, qbed, sbed)

        def get_dups_gen():
            # Return the next combination, or None once the generator is
            # exhausted, so a batch can be padded with None.
            try:
                return _get_dups_gen.next()
            except StopIteration:
                return None

        while any(pairs):
            # NOTE(review): cnss_dups is assigned but never read.
            cnss_dups = []
            pairs = [get_dups_gen() for i in range(ncpu)]
            # Constant arguments repeated once per batch slot so they can be
            # handed to the multi-list map() call below.
            spad_map = [spad] * len(pairs)
            qpad_map = [qpad] * len(pairs)
            sfastas_map = [sfastas] * len(pairs)
            qfastas_map = [qfastas] * len(pairs)
            bl2seq_map = [bl2seq] * len(pairs)
            ###################################
            # The filtered pair list can be shorter than the *_map lists;
            # Python 2's map() pads the shorter list with None.  Falsy
            # get_cmd results are dropped by the trailing "if c".
            cmds = [
                c for c in map(get_cmd, [l for l in pairs if l], bl2seq_map,
                               qfastas_map, sfastas_map, qpad_map, spad_map)
                if c
            ]
            # pool.map runs the whole batch before the generator is consumed.
            results = (
                r for r in pool.map(commands.getoutput, [c[0] for c in cmds]))
            for res, (cmd, qfeat, sfeat) in zip(results, cmds):
                # 1 when both features are on the same strand, otherwise -1.
                orient = qfeat['strand'] == sfeat['strand'] and 1 or -1
                # Empty blast output is recorded as zero CNSs, not skipped.
                if not res.strip(): cnss = []
                else:
                    cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed,
                                       qpad, spad, unmasked_fasta)
                print >> sys.stderr, "(%i)" % len(cnss)
                cnss_fmt = ",".join(map(lambda l: ",".join(map(str, l)), cnss))
                # The count is negated so an ascending sort puts the
                # combination with the most CNSs first.
                cnss_size.append(
                    (len(cnss) * -1, qfeat["start"], sfeat["start"],
                     qfeat["accn"], sfeat["accn"], cnss_fmt))
            # Keep only the last slot: the loop continues only when that slot
            # was filled, i.e. when the batch was completely full.
            pairs = [pairs[-1]]
        ######################################################################
        if qparent in rdups:
            # An already-seen key is logged, then overwritten by update().
            if (qparent, sparent) in rdups_dic[qparent].keys():
                logging.info((qparent, sparent))
            rdups_dic[qparent].update({(qparent, sparent): cnss_size})
        elif sparent in rdups:
            if (qparent, sparent) in rdups_dic[sparent].keys():
                logging.info((qparent, sparent))
            rdups_dic[sparent].update({(qparent, sparent): cnss_size})
        else:
            # Ascending tuple sort: index 0 is the combination with the most
            # CNSs, ties broken by the remaining tuple fields.
            cnss_size.sort()
            # NOTE(review): raises IndexError when no combination was blasted
            # for this pair (cnss_size empty) - confirm that cannot happen.
            cns_number, qfeat_start, sfeat_start, qaccn, saccn, largest_cnss = cnss_size[
                0]
            qfeat = qbed.accn(qaccn)
            sfeat = sbed.accn(saccn)
            print >> sys.stderr, "FINAL: {0},{1},{2}".format(
                qaccn, saccn, cns_number)
            write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                           cnss_size, qparent, sparent, qfeat, sfeat, qdups,
                           sdups)

    # Resolve the deferred pairs; each best_reps value is unpacked below as a
    # single 6-field tuple of the same layout as the cnss_size entries.
    best_reps = best_repeats(rdups_dic)
    for dparents in best_reps.keys():
        qparent, sparent = dparents
        ### one or list? cnss[0]?
        cns_number, qfeat_start, sfeat_start, qaccn, saccn, largest_cnss = best_reps[
            dparents]
        qfeat = qbed.accn(qaccn)
        sfeat = sbed.accn(saccn)
        # The single best tuple is wrapped in a list here, whereas the loop
        # above passes the whole cnss_size list in the same position.
        write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                       [best_reps[dparents]], qparent, sparent, qfeat, sfeat,
                       qdups, sdups)

    write_nolocaldups(
        qbed.path, nqlocaldups,
        "{0}.nolocaldups.bed.local".format(qbed.path.split(".")[0]))
    write_nolocaldups(
        sbed.path, nslocaldups,
        "{0}.nolocaldups.bed.local".format(sbed.path.split(".")[0]))
    # NOTE(review): options is not a parameter or local of this function; this
    # line depends on a module-level options object. Possibly pair_file was
    # intended - confirm.
    pairs_to_qa(npair_file, 'pair',
                "{0}.nolocaldups.bed.local".format(qbed.path.split(".")[0]),
                "{0}.nolocaldups.bed.local".format(sbed.path.split(".")[0]),
                "{0}.raw.filtered.local".format(options.pairs.split(".")[0]))
Exemplo n.º 4
0
def test():
    """Exercise a worker Pool end to end and print the outcome of each stage.

    Stages, in order: ordered and unordered mapping, simple timing
    comparisons against the builtin map, propagation of ZeroDivisionError
    from workers, get()/next() with timeouts, result callbacks, close(),
    terminate(), and clean-up of workers when the pool is garbage collected.
    Progress is reported with print; a failed check raises AssertionError.
    """
    print 'cpuCount() = %d\n' % cpuCount()

    #
    # Create pool
    #

    PROCESSES = 4
    print 'Creating pool with %d processes\n' % PROCESSES
    pool = Pool(PROCESSES)

    #
    # Tests
    #

    # Each task is a (function, argument-tuple) pair.
    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]

    # All three submissions are made before any result is read.
    results = [pool.applyAsync(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imapUnordered(calculatestar, TASKS)

    print 'Ordered results using pool.applyAsync():'
    for r in results:
        print '\t', r.get()
    print

    print 'Ordered results using pool.imap():'
    for x in imap_it:
        print '\t', x
    print

    print 'Unordered results using pool.imapUnordered():'
    for x in imap_unordered_it:
        print '\t', x
    print

    print 'Ordered results using pool.map() --- will block till complete:'
    for x in pool.map(calculatestar, TASKS):
        print '\t', x
    print

    #
    # Simple benchmarks
    #

    N = 100000
    print 'def pow3(x): return x**3'

    # Same workload three ways: builtin map, pool.map, pool.imap with an
    # explicit chunksize.
    t = time.time()
    A = map(pow3, xrange(N))
    print '\tmap(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t)

    t = time.time()
    B = pool.map(pow3, xrange(N))
    print '\tpool.map(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t)

    t = time.time()
    C = list(pool.imap(pow3, xrange(N), chunksize=N // 8))
    print '\tlist(pool.imap(pow3, xrange(%d), chunksize=%d)):\n\t\t%s' \
          ' seconds' % (N, N//8, time.time() - t)

    # All three approaches must give identical results; the lengths are the
    # assertion message.
    assert A == B == C, (len(A), len(B), len(C))
    print

    L = [None] * 1000000
    print 'def noop(x): pass'
    print 'L = [None] * 1000000'

    t = time.time()
    A = map(noop, L)
    print '\tmap(noop, L):\n\t\t%s seconds' % \
          (time.time() - t)

    t = time.time()
    B = pool.map(noop, L)
    print '\tpool.map(noop, L):\n\t\t%s seconds' % \
          (time.time() - t)

    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L) // 8))
    print '\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' % \
          (len(L)//8, time.time() - t)

    assert A == B == C, (len(A), len(B), len(C))
    print

    # Drop the large lists before continuing.
    del A, B, C, L

    #
    # Test error handling
    #

    print 'Testing error handling:'

    # f is defined elsewhere; these checks expect it to raise
    # ZeroDivisionError for the argument 5.
    try:
        print pool.apply(f, (5, ))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from pool.apply()'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    try:
        print pool.map(f, range(10))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from pool.map()'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    try:
        print list(pool.imap(f, range(10)))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from list(pool.imap())'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    # Step through the iterator by hand. A ZeroDivisionError is tolerated at
    # any index; only its absence at i == 5 counts as a failure.
    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            if i == 5:
                raise AssertionError, 'expected ZeroDivisionError'

    # The loop must have run through all ten items without an early
    # StopIteration.
    assert i == 9
    print '\tGot ZeroDivisionError as expected from IMapIterator.next()'
    print

    #
    # Testing timeouts
    #

    print 'Testing ApplyResult.get() with timeout:',
    res = pool.applyAsync(calculate, TASKS[0])
    # Poll with a 0.02 s timeout, printing a dot for every timeout until the
    # result arrives.
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            sys.stdout.write('.')
    print
    print

    print 'Testing IMapIterator.next() with timeout:',
    it = pool.imap(calculatestar, TASKS)
    # Same polling scheme, repeated until the iterator is exhausted.
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print
    print

    #
    # Testing callback
    #

    print 'Testing callback:'

    A = []
    # Expected contents of A after both callbacks: 56 followed by the cubes
    # of 0..9 (presumably mul(7, 8) and pow3 over range(10)).
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

    r = pool.applyAsync(mul, (7, 8), callback=A.append)
    r.wait()

    r = pool.mapAsync(pow3, range(10), callback=A.extend)
    r.wait()

    if A == B:
        print '\tcallbacks succeeded\n'
    else:
        print '\t*** callbacks failed\n\t\t%s != %s\n' % (A, B)

    #
    # Check there are no outstanding tasks
    #

    # Relies on the pool's private _cache attribute.
    assert not pool._cache, 'cache = %r' % pool._cache

    #
    # Check close() methods
    #

    print 'Testing close():'

    for worker in pool._pool:
        assert worker.isAlive()

    # Submit one task before closing; the assertion on result.get() below
    # checks that it still completes.
    result = pool.applyAsync(time.sleep, [0.5])
    pool.close()
    pool.join()

    assert result.get() is None

    for worker in pool._pool:
        assert not worker.isAlive()

    print '\tclose() succeeded\n'

    #
    # Check terminate() method
    #

    print 'Testing terminate():'

    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    # Queue ten 10-second sleeps on two workers, then terminate at once.
    results = [pool.applyAsync(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()

    for worker in pool._pool:
        assert not worker.isAlive()

    print '\tterminate() succeeded\n'

    #
    # Check garbage collection
    #

    print 'Testing garbage collection:'

    pool = Pool(2)
    # Keep a reference to the workers so they can still be inspected after
    # the pool object itself has been deleted.
    processes = pool._pool

    ignore = pool.apply(pow3, [2])
    results = [pool.applyAsync(time.sleep, [10]) for i in range(10)]

    del results, pool

    # Give the workers a moment to shut down.
    time.sleep(0.2)

    for worker in processes:
        assert not worker.isAlive()

    print '\tgarbage collection succeeded\n'
Exemplo n.º 5
0
import os, shutil, subprocess, zipfile, random
from processing import Pool
from zipfile import ZipFile
from config import *


def unzip(params):
    base, ext = params
    zf = zipfile.ZipFile(rawDir + '/' + base + '.' + ext, 'r')
    for name in zf.namelist():
        if name != 'PIC/':
            print name
            zf.extract(name, extractDir)


# Collect (base, ext) for every file in rawDir whose name ends in 'exe' and
# whose base name is exactly three characters, then unzip them in parallel.
files = []
for f in os.listdir(rawDir):
    if not f.endswith('exe'):
        continue
    parts = f.split('.')
    # Names without exactly one dot ('a.b.exe', 'exe') used to crash the
    # tuple unpacking with ValueError; they are skipped instead.
    if len(parts) != 2:
        continue
    base, ext = parts
    if len(base) == 3:
        files.append((base, ext))
p = Pool()
try:
    p.map(unzip, files)
finally:
    # Release the worker processes once the batch is finished (or failed).
    p.close()
    p.join()
Exemplo n.º 6
0
def main(qbed, sbed, pairs_file, pair_fmt, mask='F', ncpu=8):
    """Run bl2seq on every gene pair and print the CNSs found as CSV on stdout.

    Pairs are pulled from ``get_pair`` in batches of ``ncpu``; one bl2seq
    command is built per pair and each batch is executed in parallel on a
    worker pool.  Progress ("qaccn saccn (count)") is written to stderr and
    one CSV line per pair with at least one CNS is written to stdout.

    Parameters:
        qbed, sbed: query / subject bed objects; ``.filename`` is compared to
            decide whether the masked fastas can be shared.
        pairs_file, pair_fmt: forwarded to ``get_pair``.
        mask: value for bl2seq's ``-F`` option.
        ncpu: number of pool workers and number of pairs per batch.

    Returns:
        None.
    """
    # Size the pool from the ncpu argument.  It used to read the global
    # options.ncpu, which ignored the parameter that already sets the batch
    # size and failed with NameError when no global options object existed.
    pool = Pool(ncpu)

    # The %(name)s placeholders are filled in per pair by get_cmd below.
    bl2seq = "~/src/blast-2.2.25/bin/bl2seq " \
           "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
           " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
             -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
            | grep -v 'WARNING' | grep -v 'ERROR' "

    fcnss = sys.stdout
    print >> fcnss, "#qaccn,qseqid,saccn,[sleft_gene,sright_gene],sseqid,res"

    qfastas = get_masked_fastas(qbed)
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    # Note the argument order (sbed, qbed): the pairs come back as
    # (sfeat, qfeat) and are unpacked that way in get_cmd.
    pair_gen = get_pair(pairs_file, pair_fmt, sbed, qbed)

    def next_pair():
        # None marks an exhausted generator, so the final batch is padded
        # with None instead of raising StopIteration.
        try:
            return pair_gen.next()
        except StopIteration:
            return None

    # Defined once here rather than on every loop iteration; it only closes
    # over names that are fixed before the loop starts.
    def get_cmd(pair):
        """Return (command, qfeat, sfeat) for one pair, or None for None."""
        if pair is None:
            return None
        sfeat, qfeat = pair

        qfasta = qfastas[qfeat['seqid']]
        sfasta = sfastas[sfeat['seqid']]

        # The subject region gets no padding.
        sstart, sstop = sfeat['start'], sfeat['end']
        # Original note: sfeat here is the final table with sfeat info from
        # the qfeat dict.
        qstart, qstop = grab_flanking_region(qfeat, sfeat)

        m = sstop - sstart
        n = qstop - qstart
        # Scale the e-value cut-off with the search space (bit score above
        # 15/15 noise).
        e_value = m*n*(2**(-28.51974))
        assert e_value > 0

        cmd = bl2seq % dict(qfasta=qfasta, sfasta=sfasta, qstart=qstart,
                            sstart=sstart, qstop=qstop, sstop=sstop, e_value=e_value)
        return cmd, qfeat, sfeat

    try:
        pairs = [True]
        # Stop once a whole batch comes back empty (generator exhausted).
        while any(pairs):
            pairs = [next_pair() for _ in range(ncpu)]

            cmds = [c for c in map(get_cmd, [l for l in pairs if l]) if c]
            results = pool.map(commands.getoutput, [c[0] for c in cmds])

            for res, (cmd, qfeat, sfeat) in zip(results, cmds):
                if not res.strip():
                    continue
                print >>sys.stderr, "%s %s" % (qfeat["accn"], sfeat["accn"]),
                orient = 1 if qfeat['strand'] == sfeat['strand'] else -1

                cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed)
                print >>sys.stderr, "(%i)" % len(cnss)
                if len(cnss) == 0:
                    continue

                qname, sname = qfeat['accn'], sfeat['accn']

                print >> fcnss, "%s,%s,%s,[%s,%s],%s,%s" % (qname, qfeat['seqid'], sname, sfeat['qleft_gene'], sfeat['qright_gene'], sfeat['seqid'],
                                 ",".join(map(lambda l: ",".join(map(str, l)), cnss)))
    finally:
        # Release the worker processes even when a batch fails.
        pool.close()
        pool.join()

    return None
Exemplo n.º 7
0
def main(qbed, sbed,cns_bed, pairs_file, qpad, spad, pair_fmt, blast_path, mask='F', ncpu=8):
    """Blast each pair from get_pair with bl2seq and print its CNSs as CSV.

    Pairs are consumed in batches of ``ncpu`` and each batch is run in
    parallel on a worker pool.  Progress goes to stderr; one CSV line per
    pair with at least one CNS goes to stdout.  Returns None.
    """
    workers = Pool(ncpu)

    blast_prefix = "%s " % blast_path
    blast_options = "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask
    # The %(name)s placeholders are filled in per pair by build_cmd.
    bl2seq = blast_prefix + blast_options + \
           " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
              -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
            | grep -v 'WARNING' | grep -v 'ERROR' "

    fcnss = sys.stdout
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send,evalue...]"

    qfastas = get_masked_fastas(qbed)
    if qbed.filename != sbed.filename:
        sfastas = get_masked_fastas(sbed)
    else:
        sfastas = qfastas

    pair_source = get_pair(pairs_file, pair_fmt, cns_bed, sbed)

    def next_pair():
        # None stands for "generator exhausted" so a batch can be padded.
        try:
            return pair_source.next()
        except StopIteration:
            return None

    def build_cmd(pair):
        # Returns (command, qfeat, sfeat) for a pair, or None for None.
        if pair is None:
            return None
        qfeat, sfeat = pair

        query_fasta = qfastas[qfeat['seqid']]
        subject_fasta = sfastas[sfeat['seqid']]

        # Pad both regions, clamping the start at position 1.
        qstart = max(qfeat['start'] - qpad, 1)
        qstop = qfeat['end'] + qpad
        sstart = max(sfeat['start'] - spad, 1)
        sstop = sfeat['end'] + spad

        assert qstop - qstart > 2 * qpad or qstart == 1, (qstop, qstart)
        assert sstop - sstart > 2 * spad or sstart == 1, (sstop, sstart)

        values = dict(qfasta=query_fasta, sfasta=subject_fasta,
                      qstart=qstart, sstart=sstart,
                      qstop=qstop, sstop=sstop,
                      e_value=30)
        return bl2seq % values, qfeat, sfeat

    batch = [True]
    # Keep going until a whole batch comes back empty.
    while any(batch):
        batch = [next_pair() for _ in range(ncpu)]

        jobs = []
        for pair in batch:
            if not pair:
                continue
            job = build_cmd(pair)
            if job:
                jobs.append(job)

        outputs = workers.map(commands.getoutput, [job[0] for job in jobs])

        for output, (cmd, qfeat, sfeat) in zip(outputs, jobs):
            if not output.strip():
                continue
            print >>sys.stderr, "%s %s" % (qfeat["accn"], sfeat['accn']),
            if qfeat['strand'] == sfeat['strand']:
                orient = 1
            else:
                orient = -1

            cnss = parse_blast(output, orient, qfeat, sfeat, cns_bed, sbed, qpad, spad)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            qname, sname = qfeat['accn'], sfeat['accn']
            cns_fields = ",".join(",".join(str(v) for v in cns) for cns in cnss)
            print >> fcnss, "%s,%s,%s,%s,%s" % (qfeat['seqid'], qname, sfeat['seqid'], sname,
                             cns_fields)

    return None