Exemplo n.º 1
0
def worker(scorer, fut):
    """ Score items handed over through a chain of coordinator futures.

        Every value read from the current future is either None, which ends
        the loop, or an (item, reply_fut) pair. The item is passed to scorer
        and (result, next_fut) is delivered on reply_fut, where next_fut is
        a freshly created future from which the next request is read.
    """
    inbox = fut
    while True:
        request = legion.coordinator().get_future(inbox)
        if request is None:
            return

        candidate, reply_fut = request
        score = scorer(candidate)
        # The reply carries the future on which the next request is expected.
        inbox = legion.coordinator().new_future()
        legion.coordinator().deliver_future(reply_fut, (score, inbox))
Exemplo n.º 2
0
def worker(scorer, fut):
    """ Serve scoring requests until a None request is received.

        A request is an (item, reply_fut) pair obtained from a coordinator
        future, starting with fut. For each request, scorer(item) is
        computed, a new future is created, and (result, new future) is
        delivered on reply_fut. The following request is then read from
        that new future.
    """
    message = legion.coordinator().get_future(fut)
    while message is not None:
        item, reply_fut = message
        result = scorer(item)
        next_fut = legion.coordinator().new_future()
        legion.coordinator().deliver_future(reply_fut, (result, next_fut))
        message = legion.coordinator().get_future(next_fut)
Exemplo n.º 3
0
def execute(args, stdin=None, stdout=None, stderr=None, cores=1, **kwargs):
    """ Run a program and wait for it to finish.

        args, stdin, stdout, stderr and any further keyword arguments are
        passed on to run().

        When cores > 1, legion.coordinator().trade_cores(1, cores) is called
        before the program is started, and trade_cores(cores, 1) once it has
        finished, even if starting or running it failed.

        Raise an AssertionError if it has an exit code other than 0.
    """
    from nesoni import legion

    if cores > 1:
        legion.coordinator().trade_cores(1, cores)
    try:
        p = run(args, stdin=stdin, stdout=stdout, stderr=stderr, **kwargs)
        # Wait outside of an assert statement: "python -O" strips asserts,
        # which would drop the wait itself along with the exit code check.
        exit_code = p.wait()
        if exit_code != 0:
            raise AssertionError(
                'Failed to execute "%s"' % _describe_args(args, kwargs))
    finally:
        if cores > 1:
            legion.coordinator().trade_cores(cores, 1)
Exemplo n.º 4
0
def execute(args, stdin=None, stdout=None, stderr=None, cores=1, **kwargs):
    """ Run a program to completion.

        The program is started with run(args, stdin=..., stdout=...,
        stderr=..., **kwargs). If cores > 1, the legion coordinator is asked
        to trade_cores(1, cores) beforehand and trade_cores(cores, 1)
        afterwards; the trade back happens whether or not the run succeeded.

        Raise an AssertionError if it has an exit code other than 0.
    """
    from nesoni import legion

    uses_extra_cores = cores > 1
    if uses_extra_cores:
        legion.coordinator().trade_cores(1, cores)
    try:
        p = run(args, stdin=stdin, stdout=stdout, stderr=stderr, **kwargs)
        # The wait must not live inside an assert: with "python -O" the
        # whole assert statement, including the wait, would be removed.
        exit_code = p.wait()
        if exit_code != 0:
            raise AssertionError(
                'Failed to execute "%s"' % _describe_args(args, kwargs))
    finally:
        if uses_extra_cores:
            legion.coordinator().trade_cores(cores, 1)
Exemplo n.º 5
0
    def run_job():
        """ Run one job described by the first command line argument.

            sys.argv[1] is base64 text that decodes to a Python expression
            giving (current_dir, python_path, main_file, address, authkey,
            mail_number). The working directory and sys.path are set from
            it, the legion manager is connected, and the (func, args,
            kwargs) fetched with get_mail(mail_number) is called. The
            process then exits with status 0.
        """
        import sys
        import os
        import imp
        import base64

        # Decode the job description.
        # NOTE(review): eval() of a command line argument executes arbitrary
        # code; this is only acceptable if the argument always comes from a
        # trusted parent process -- confirm.
        encoded = sys.argv[1]
        (current_dir, python_path, main_file,
         address, authkey, mail_number) = eval(base64.b64decode(encoded))

        # Try to recreate execution environment
        os.chdir(current_dir)
        sys.path = python_path

        # Imported only now, after sys.path has been replaced.
        from nesoni import legion

        # Connect to coordinator
        legion.manager(address, authkey, connect=True)

        if main_file is not None:
            # Re-run the main script as both __job__ and __main__, so that
            # unpickling functions defined in __main__ works.
            job_module = imp.new_module('__job__')
            job_module.__file__ = main_file
            for alias in ('__job__', '__main__'):
                sys.modules[alias] = job_module
            execfile(main_file, job_module.__dict__)

        # Retrieve function and execute
        mail = legion.coordinator().get_mail(mail_number)
        func, args, kwargs = mail
        func(*args, **kwargs)
        sys.exit(0)
 def run_job():
     """ Entry point of a job process.

         The job description is read from sys.argv[1] (base64 encoded text
         of a Python tuple expression), the working directory and sys.path
         are set from it, the legion manager is connected, and the function
         retrieved with get_mail() is called with its arguments. Finally
         the process exits with status 0.
     """
     import sys, os, imp, base64

     # NOTE(review): eval() of a command line argument runs arbitrary code;
     # safe only if the argument always comes from a trusted parent -- confirm.
     description = eval(base64.b64decode(sys.argv[1]))
     current_dir, python_path, main_file, address, authkey, mail_number = description

     # Try to recreate execution environment
     os.chdir(current_dir)
     sys.path = python_path

     # This import has to follow the sys.path assignment above.
     from nesoni import legion

     # Connect to coordinator
     legion.manager(address, authkey, connect=True)

     have_main = main_file is not None
     if have_main:
         # So unpickling functions in __main__ works, the main script is
         # executed in a module registered as __job__ and as __main__.
         module = imp.new_module('__job__')
         module.__file__ = main_file
         sys.modules['__job__'] = sys.modules['__main__'] = module
         execfile(main_file, module.__dict__)

     # Retrieve function and execute
     func, args, kwargs = legion.coordinator().get_mail(mail_number)
     func(*args, **kwargs)
     sys.exit(0)
Exemplo n.º 7
0
def sort_bam(in_filename, out_prefix, by_name=False, cores=8, total_megs=800):
    """ Sort a BAM file by running "samtools sort".

        in_filename - BAM file to sort
        out_prefix  - output prefix given to samtools
        by_name     - if true, pass -n to samtools
        cores       - maximum number of threads; capped at
                      legion.coordinator().get_cores() and never below 1
        total_megs  - memory budget in megabytes that is divided between
                      the threads for the samtools -m option; each thread
                      is given at least 10M
    """
    # Never let the thread count reach zero: it is used as a divisor below.
    cores = max(1, min(cores, legion.coordinator().get_cores()))
    megs = max(10, total_megs // cores)

    command = [ 'samtools', 'sort', '-@', '%d' % cores, '-m', '%dM' % megs ]
    if by_name:
        command.append('-n')
    command.extend([ in_filename, out_prefix ])

    io.execute(command, cores=cores)
Exemplo n.º 8
0
def sort_bam(in_filename, out_prefix, by_name=False, cores=8, total_megs=800):
    """ Run "samtools sort" on in_filename, writing to out_prefix.

        by_name    - add the -n option to the samtools command
        cores      - upper limit on threads (-@); the value used is the
                     smaller of this and legion.coordinator().get_cores(),
                     but at least 1
        total_megs - megabytes to share between the threads; the per-thread
                     value given to -m is total_megs // threads, with a
                     floor of 10
    """
    available = legion.coordinator().get_cores()
    # A thread count of zero would cause a division by zero below.
    cores = max(1, min(cores, available))
    megs = max(10, total_megs // cores)

    name_option = [ '-n' ] if by_name else [ ]

    io.execute(
        [ 'samtools', 'sort', '-@', '%d' % cores, '-m', '%dM' % megs ] +
        name_option +
        [ in_filename, out_prefix ], cores=cores)
Exemplo n.º 9
0
def pipe_from(args, stdin=None, stderr=None, cores=1, **kwargs):
    """ Context to pipe from a process, eg
    
        with io.pipe_from(['ls']) as f:
            print f.read().rstrip('\n').split('\n')
    
        The process is started with run(), with its stdout as a pipe, and
        that pipe is yielded. On leaving the context the pipe is closed and
        the process is waited for.

        When cores > 1, trade_cores(1, cores) is called on the legion
        coordinator first, and trade_cores(cores, 1) on the way out, also
        if the process could not be started.

        Raise an AssertionError if the exit code is other than 0.
    """
    if cores > 1:
        legion.coordinator().trade_cores(1, cores)
    try:
        # Start the process inside the try block, so that the cores are
        # handed back even when it fails to launch.
        process = run(args, stdin=stdin, stdout=PIPE, stderr=stderr, **kwargs)
        try:
            yield process.stdout
        finally:
            process.stdout.close()
            exit_code = process.wait()
    finally:
        if cores > 1:
            legion.coordinator().trade_cores(cores, 1)
    # Raised explicitly so the check is not stripped by "python -O".
    if exit_code != 0:
        raise AssertionError('Failed: "%s"' % _describe_args(args,kwargs))
Exemplo n.º 10
0
def pipe_to(args, stdout=None, stderr=None, cores=1, **kwargs):
    """ Context to pipe to a process, eg
    
        with io.pipe_to(['less']) as f:
            print >> f, 'Hello, world.'
    
        The process is started with run(), with its stdin as a pipe, and
        that pipe is yielded. On leaving the context the pipe is closed and
        the process is waited for.

        When cores > 1, trade_cores(1, cores) is called on the legion
        coordinator first, and trade_cores(cores, 1) on the way out, also
        if the process could not be started.

        Raise an AssertionError if the exit code is other than 0.
    """
    if cores > 1:
        legion.coordinator().trade_cores(1, cores)
    try:
        # Start the process inside the try block, so that the cores are
        # handed back even when it fails to launch.
        process = run(args, stdin=PIPE, stdout=stdout, stderr=stderr, **kwargs)
        try:
            yield process.stdin
        finally:
            process.stdin.close()
            exit_code = process.wait()
    finally:
        if cores > 1:
            legion.coordinator().trade_cores(cores, 1)
    # Raised explicitly so the check is not stripped by "python -O".
    if exit_code != 0:
        raise AssertionError('Failed: "%s"' % _describe_args(args,kwargs))
Exemplo n.º 11
0
def pipe_from(args, stdin=None, stderr=None, cores=1, **kwargs):
    """ Context to pipe from a process, eg
    
        with io.pipe_from(['ls']) as f:
            print f.read().rstrip('\n').split('\n')
    
        run() starts the process with stdout=PIPE and the pipe is yielded.
        Once the context is left, the pipe is closed and the process is
        waited for.

        For cores > 1 the call is bracketed by trade_cores(1, cores) and
        trade_cores(cores, 1) on the legion coordinator. The cores are
        traded back even if run() itself raises.

        Raise an AssertionError if the exit code is other than 0.
    """
    traded = cores > 1
    if traded:
        legion.coordinator().trade_cores(1, cores)
    try:
        # run() is called under the outer try, so a failed launch cannot
        # leave the extra cores allocated.
        process = run(args, stdin=stdin, stdout=PIPE, stderr=stderr, **kwargs)
        try:
            yield process.stdout
        finally:
            process.stdout.close()
            exit_code = process.wait()
    finally:
        if traded:
            legion.coordinator().trade_cores(cores, 1)
    # An explicit raise still runs under "python -O", unlike an assert.
    if exit_code != 0:
        raise AssertionError('Failed: "%s"' % _describe_args(args, kwargs))
Exemplo n.º 12
0
def pipe_to(args, stdout=None, stderr=None, cores=1, **kwargs):
    """ Context to pipe to a process, eg
    
        with io.pipe_to(['less']) as f:
            print >> f, 'Hello, world.'
    
        run() starts the process with stdin=PIPE and the pipe is yielded.
        Once the context is left, the pipe is closed and the process is
        waited for.

        For cores > 1 the call is bracketed by trade_cores(1, cores) and
        trade_cores(cores, 1) on the legion coordinator. The cores are
        traded back even if run() itself raises.

        Raise an AssertionError if the exit code is other than 0.
    """
    traded = cores > 1
    if traded:
        legion.coordinator().trade_cores(1, cores)
    try:
        # run() is called under the outer try, so a failed launch cannot
        # leave the extra cores allocated.
        process = run(args, stdin=PIPE, stdout=stdout, stderr=stderr, **kwargs)
        try:
            yield process.stdin
        finally:
            process.stdin.close()
            exit_code = process.wait()
    finally:
        if traded:
            legion.coordinator().trade_cores(cores, 1)
    # An explicit raise still runs under "python -O", unlike an assert.
    if exit_code != 0:
        raise AssertionError('Failed: "%s"' % _describe_args(args, kwargs))
Exemplo n.º 13
0
def status(string):
    """ Display a status string.

        The string is passed, together with this process's identity, to the
        legion coordinator's set_status(), whose result is returned.
    """
    from nesoni import legion
    coordinator = legion.coordinator()
    return coordinator.set_status(legion.process_identity(), string)
Exemplo n.º 14
0
    def run(self):
        """ Align reads to the reference using bowtie2.

            Input files that do not match the FASTQ selection used in
            convert() are rewritten as temporary FASTQ files, and each
            interleaved file is split into a left and a right temporary
            file. bowtie2 is then run once for the paired files (if any)
            and once for the single files (if any). The output of these
            runs is piped through "samtools view" into a temporary BAM
            file, keeping '@' header lines from the first run only, and
            sam.sort_bam() sorts that file by name to working/'alignments'.

            stderr of bowtie2 and samtools view goes to self.log_filename().
            The log file and the temporary BAM file handle are closed even
            if one of the steps fails.
        """
        assert self.reads or self.pairs or self.interleaved, 'No reads given'

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        working = self.get_workspace()
        working.setup_reference(self.references, bowtie=True)
        working.update_param(snp_cost=2.0)
        reference = working.get_reference()

        # The log file is opened first and therefore closed last, after the
        # temporary space has been cleaned up.
        with open(self.log_filename(), 'wb') as log_file, \
             workspace.tempspace(dir=working.working_dir) as temp:
            # Counter in a list so the nested function can update it.
            n = [0]

            def tempname():
                """ Return a new, numbered .fq name in the temp space. """
                n[0] += 1
                return temp / ('%d.fq' % n[0])

            def convert(filename):
                """ Return filename, or a temporary FASTQ conversion of it. """
                info = io.get_file_info(filename)
                ok = selection.matches(
                    'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                    info)
                if ok:
                    return filename
                result_name = tempname()
                with open(result_name, 'wb') as f:
                    for name, seq, qual in io.read_sequences(
                            filename, qualities='required'):
                        io.write_fastq(f, name, seq, qual)
                return result_name

            ones = []
            twos = []
            singles = []

            for pair in self.pairs:
                assert len(
                    pair) == 2, 'Need two files in each "pair:" section.'
                ones.append(convert(pair[0]))
                twos.append(convert(pair[1]))

            for item in self.interleaved:
                left_name = tempname()
                right_name = tempname()
                ones.append(left_name)
                twos.append(right_name)
                with open(left_name,'wb') as left, \
                     open(right_name,'wb') as right:
                    reader = io.read_sequences(item, qualities='required')
                    # Sequences alternate: one for the left file, then one
                    # for the right file.
                    while True:
                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            break
                        io.write_fastq(left, name, seq, qual)

                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            raise grace.Error(
                                'Interleaved file contains odd number of sequences'
                            )
                        io.write_fastq(right, name, seq, qual)

            for item in self.reads:
                singles.append(convert(item))

            cores = min(self.cores, legion.coordinator().get_cores())

            command = ([
                'bowtie2',
                '--threads',
                str(cores),
                '--rg-id',
                '1',
                '--rg',
                'SM:' + working.name,
            ] + self.bowtie_options +
                       ['-x', reference.get_bowtie_index_prefix()])
            commands = []
            if ones:
                commands.append(command +
                                ['-1', ','.join(ones), '-2', ','.join(twos)])
            if singles:
                commands.append(command + ['-U', ','.join(singles)])

            temp_bam_name = temp / 'temp.bam'

            # The BAM file handle is listed first so that it is closed only
            # after pipe_to() has finished waiting for samtools.
            with open(temp_bam_name, 'wb') as bam_file, \
                 io.pipe_to(['samtools', 'view', '-S', '-b', '-'],
                            stdout=bam_file,
                            stderr=log_file) as f:
                header_sent = False
                for command in commands:
                    self.log.log('Running:\n' + ' '.join(command) + '\n')
                    with io.pipe_from(command, stderr=log_file,
                                      cores=cores) as f_out:
                        for line in f_out:
                            # Header lines ('@') are only passed on from
                            # the first command.
                            if not header_sent or not line.startswith('@'):
                                f.write(line)
                    header_sent = True

            sam.sort_bam(temp_bam_name,
                         working / 'alignments',
                         by_name=True,
                         cores=self.cores)
Exemplo n.º 15
0
def improve(comment, constrainer, scorer, start_x, ftol=1e-4, xtol=1e-6, initial_accuracy=0.001, monitor = lambda x,y: None):
    pool_size = legion.coordinator().get_cores()
    
    worker_futs = [ legion.coordinator().new_future() for i in xrange(pool_size) ]
    reply_futs = [ ]
    
    workers = [
        legion.future(worker,scorer,fut)
        for fut in worker_futs 
        ]
    
    last_t = 0.0
    try:
        best = start_x
        c_score = constrainer(best)
        if c_score:
            best_score = (c_score, 0.0)
        else:
            best_score = (0.0, scorer(best))
        
        n_good = 0
        n_real = 0
        i = 0
        jobs = [ ]
        
        pool_size = int(len(best)*5)
        print len(best),'parameters, pool size', pool_size

        currents = [ (best, best_score) ]
        
        done = False
        while not done or reply_futs:
            t = time.time()
            if t > last_t+20.0:
                def rep(x): 
                    if x[0]: return 'C%.6f' % x[0]
                    return '%.6f' % x[1]
                grace.status('%s %s %d %d %d %d %s'%(rep(best_score), rep(max(item[1] for item in currents)), len(currents), n_good, n_real, i, comment))
                if best_score[0] == 0:
                    monitor(best, [ item[0] for item in currents ])
                last_t = time.time()
            
            have_score = False
            
            if not done and worker_futs:
                new = make_update([item[0] for item in currents], initial_accuracy, len(currents) < pool_size)
                
                c_score = constrainer(new)
                if c_score:
                    have_score = True
                    new_score = (c_score, 0.0)
                else:
                    reply_fut = legion.coordinator().new_future()
                    worker_fut = worker_futs.pop(0)                    
                    legion.coordinator().deliver_future(worker_fut, (new, reply_fut))
                    reply_futs.append( (new, reply_fut) )
            
            if not have_score:
                if not reply_futs or (not done and worker_futs):
                    continue
                new, reply_fut = reply_futs.pop(0)
                new_score, worker_fut = legion.coordinator().get_future(reply_fut)
                new_score = (0.0, new_score)
                worker_futs.append(worker_fut)
            
            if new_score[0] == 0.0:
                n_real += 1

            l = sorted( item[1][1] for item in currents )
            if pool_size < len(l):
                c = l[pool_size]
            else:
                c = 1e30
            cutoff = (best_score[0], c)
            
            if new_score <= cutoff:
                currents = [ item for item in currents if item[1] <= cutoff ]
                currents.append((new,new_score))
                
                n_good += 1
            
                if new_score < best_score:
                    best_score = new_score
                    best = new
            
            if len(currents) >= pool_size and best_score[0] == 0.0:
                xspan = 0.0
                for i in xrange(len(start_x)):
                    xspan = max(xspan,
                        max(item[0][i] for item in currents) -
                          min(item[0][i] for item in currents)
                        )
                
                fspan = (max(item[1] for item in currents)[1]-best_score[1]) 
                
                if xspan < xtol or (n_good >= 5000 and fspan < ftol):
                    done = True
            i += 1
        
        grace.status('')
        print '%s %.5f\n' % (comment, best_score[1])
        
    finally:
        #pool.terminate()
        pass
    
    while worker_futs:
        fut = worker_futs.pop(0)
        legion.coordinator().deliver_future(fut, None)
    
    for item in workers:
        item()
    
    return best
        
Exemplo n.º 16
0
def status(string):
    """ Display a status string.

        The string is handed to the legion coordinator's set_status() along
        with this process's identity. Nothing is returned.
    """
    from nesoni import legion
    coordinator = legion.coordinator()
    coordinator.set_status(legion.process_identity(), string)
Exemplo n.º 17
0
    def run(self):
        """ Align reads with SHRiMP and write name-sorted alignments.

            Every read set (a single file, a pair of files, or an
            interleaved file) is aligned by its own run of the command built
            by reference.shrimp_command(). The output of all runs is written
            to one temporary BAM file, keeping the '@' header lines of the
            first run only. That file is then sorted by name with
            sam.sort_bam() using the prefix 'alignments' in self.output_dir,
            and deleted.

            stderr of the aligner is written to 'shrimp_log.txt' in
            self.output_dir.
        """
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, 'No reference sequences given'
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
        for pair in self.pairs:
            assert len(pair) == 2, 'Two files required in each pair: section'

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        # Each entry is (list of filenames, is_paired).
        read_sets = []
        for item in self.reads:
            read_sets.append(([item], False))
        for item in self.pairs:
            read_sets.append((item, True))
        for item in self.interleaved:
            read_sets.append(([item], True))

        # Create working directory

        workspace = self.get_workspace()
        workspace.setup_reference(self.references)
        workspace.update_param(snp_cost=25)
        reference = workspace.get_reference()
        # NOTE(review): reference_filename is not used again in this method.
        reference_filename = reference.reference_fasta_filename()

        cores = min(self.cores, legion.coordinator().get_cores())

        # Options appended to the command line unless the user already gave
        # the flag in self.shrimp_options. A value of None means the flag
        # takes no argument.
        default_options = {
            '-E': None,
            '-T': None,
            '-N': str(cores),
            '-n': '2',
            '-w': '200%',
            '-p': 'opp-in',
            '-I': '0,500',
            '-X': None,
        }

        if self.sam_unaligned:
            default_options['--sam-unaligned'] = None

        if self.half_paired:
            default_options['--half-paired'] = None
        else:
            default_options['--no-half-paired'] = None

        # NOTE(review): cutoff is computed but not used again in this method.
        cutoff = '55%'  # Default changed in SHRiMP 2.0.2
        if '-h' in self.shrimp_options:
            cutoff = self.shrimp_options[self.shrimp_options.index('-h') + 1]

        # Run shrimp

        # NOTE(review): bam_filename and bam_sorted_prefix are not used again
        # in this method.
        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')
        bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')

        temp_filename = io.abspath(self.output_dir, 'temp.bam')

        log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
        # NOTE(review): log_file and sam_eater are closed only on the success
        # path below; an exception while aligning leaves both open.
        log_file = open(log_filename, 'wb')

        sam_eater = sam.Bam_writer(temp_filename)

        # Single-element lists, so that eat() can update them from its
        # nested scope.
        sam_header_sent = [False]
        n_seen = [0]

        def eat(f):
            """ Copy SAM lines from f to sam_eater.

                Lines starting with '@' are skipped once a previous call has
                completed. Other lines are counted, with a status update
                every 100000 lines.
            """
            for line in f:
                if line.startswith('@'):
                    if sam_header_sent[0]: continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status('%s alignments produced' %
                                     grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)
            sam_header_sent[0] = True

        def remove_pair_options(options):
            """ Return options without -p, -I (and the item following each)
                and without --half-paired.
            """
            for flag in ['-p', '-I']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 2:]
            for flag in ['--half-paired']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 1:]
            return options

        for i, (filenames, is_paired) in enumerate(read_sets):
            # Work on a copy, the user's option list is reused for each set.
            options = self.shrimp_options[:]

            # The first record of each file is read; a record of length 3
            # is taken to mean qualities are present (presumably name,
            # sequence, quality -- confirm against io.read_sequences).
            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) ==
                3  # A little ugly
                for filename in filenames)
            if has_qualities:
                options.append('--fastq')

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

            if '--qv-offset' not in self.shrimp_options:
                # Previous approach, guessing per file and requiring the
                # guesses to agree:
                #guesses = [ ]
                #for filename in filenames:
                #    guesses.append(io.guess_quality_offset(filename))
                #assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
                #default_options['--qv-offset'] = str(guesses[0])
                default_options['--qv-offset'] = str(
                    io.guess_quality_offset(*filenames))

            # Commas in the workspace name are replaced, as the option value
            # itself is comma separated.
            default_options['--read-group'] = '%s,%s' % (
                workspace.name.replace(',',
                                       '_'), workspace.name.replace(',', '_'))
            # Note that default_options is shared between iterations: the
            # entries set above stay in place for later read sets unless
            # they are overwritten.
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(self.cs,
                                                  options + reads_parameters)

            print >> sys.stderr, 'Running', ' '.join(full_param)

            with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
                eat(f)

        log_file.close()

        sam_eater.close()

        grace.status('Sort')

        # Previously sorted with a direct samtools call:
        #io.execute([
        #    'samtools', 'sort', '-n', temp_filename, bam_prefix
        #])
        sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)

        os.unlink(temp_filename)

        grace.status('')
Exemplo n.º 18
0
    def run(self):
        """ Align reads to the reference using bowtie2.

            Files that do not match the FASTQ selection in convert() are
            converted to temporary FASTQ files, and interleaved files are
            split into temporary left and right files. bowtie2 is run once
            for paired files and once for single files, as far as present.
            All output is piped through "samtools view" into a temporary
            BAM file ('@' header lines are only taken from the first run),
            which sam.sort_bam() sorts by name to working/'alignments'.

            stderr of bowtie2 and samtools view goes to self.log_filename().
            Both the log file and the handle of the temporary BAM file are
            closed when this method is left, including on errors.
        """
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
    
        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
        
        working = self.get_workspace()
        working.setup_reference(self.references, bowtie=True)
        working.update_param(snp_cost=2.0)        
        reference = working.get_reference()
        
        # Listed first, the log file is closed last, i.e. after the
        # temporary space has been left.
        with open(self.log_filename(),'wb') as log_file, \
             workspace.tempspace(dir=working.working_dir) as temp:
            # Kept in a list so tempname() can modify it.
            n = [ 0 ]
            def tempname():
                """ Return the next numbered .fq name in the temp space. """
                n[0] += 1
                return temp/('%d.fq'%n[0])
            def convert(filename):
                """ Return filename itself, or a temporary FASTQ copy. """
                info = io.get_file_info(filename)
                ok = selection.matches('type-fastq:[compression-none/compression-gzip/compression-bzip2]', info)
                if ok:
                    return filename            
                result_name = tempname()
                with open(result_name,'wb') as f:
                    for name, seq, qual in io.read_sequences(filename, qualities='required'):
                        io.write_fastq(f, name, seq, qual)
                return result_name
            
            ones = [ ]
            twos = [ ]
            singles = [ ]
            
            for pair in self.pairs:
                assert len(pair) == 2, 'Need two files in each "pair:" section.'
                ones.append(convert(pair[0]))
                twos.append(convert(pair[1]))
            
            for item in self.interleaved:
                left_name = tempname()
                right_name = tempname()
                ones.append(left_name)
                twos.append(right_name)
                with open(left_name,'wb') as left, \
                     open(right_name,'wb') as right:
                    reader = io.read_sequences(item, qualities='required')
                    # Sequences are written alternately to left and right.
                    while True:
                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            break
                        io.write_fastq(left, name,seq,qual)
                        
                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            raise grace.Error('Interleaved file contains odd number of sequences')
                        io.write_fastq(right, name,seq,qual)
            
            for item in self.reads:
                singles.append(convert(item))

            cores = min(self.cores, legion.coordinator().get_cores())

            command = (
                [ 'bowtie2', 
                    '--threads', str(cores),
                    '--rg-id', '1',
                    '--rg', 'SM:'+working.name,                    
                    ] + 
                self.bowtie_options + 
                [ '-x', reference.get_bowtie_index_prefix() ]
                )
            commands = [ ]
            if ones:
                commands.append(command + [ '-1', ','.join(ones), '-2', ','.join(twos) ])
            if singles:
                commands.append(command + [ '-U', ','.join(singles) ])
            
            temp_bam_name = temp/'temp.bam'

            # The BAM handle comes first, so it is closed only once
            # pipe_to() has waited for samtools to finish.
            with open(temp_bam_name,'wb') as bam_file, \
                 io.pipe_to(
                     ['samtools', 'view', '-S', '-b', '-'],
                     stdout=bam_file,
                     stderr=log_file
                     ) as f:
                header_sent = False
                for command in commands:
                    self.log.log('Running:\n' + ' '.join(command) + '\n')            
                    with io.pipe_from(
                        command,
                        stderr=log_file,
                        cores=cores
                        ) as f_out:
                        for line in f_out:
                            # '@' header lines of later commands are dropped.
                            if not header_sent or not line.startswith('@'):
                                f.write(line)
                    header_sent = True

            sam.sort_bam(temp_bam_name, working/'alignments', by_name=True, cores=self.cores)
Exemplo n.º 19
0
def improve(comment,
            constrainer,
            scorer,
            start_x,
            ftol=1e-4,
            xtol=1e-6,
            initial_accuracy=0.001,
            monitor=lambda x, y: None):
    pool_size = legion.coordinator().get_cores()

    worker_futs = [
        legion.coordinator().new_future() for i in xrange(pool_size)
    ]
    reply_futs = []

    workers = [legion.future(worker, scorer, fut) for fut in worker_futs]

    last_t = 0.0
    try:
        best = start_x
        c_score = constrainer(best)
        if c_score:
            best_score = (c_score, 0.0)
        else:
            best_score = (0.0, scorer(best))

        n_good = 0
        n_real = 0
        i = 0
        jobs = []

        pool_size = int(len(best) * 5)  #5
        print len(best), 'parameters, pool size', pool_size

        currents = [(best, best_score)]

        done = False
        while not done or reply_futs:
            t = time.time()
            if t > last_t + 20.0:

                def rep(x):
                    if x[0]: return 'C%.6f' % x[0]
                    return '%.6f' % x[1]

                grace.status(
                    '%s %s %d %d %d %d %s' %
                    (rep(best_score), rep(max(item[1] for item in currents)),
                     len(currents), n_good, n_real, i, comment))
                if best_score[0] == 0:
                    monitor(best, [item[0] for item in currents])
                last_t = time.time()

            have_score = False

            if not done and worker_futs:
                new = make_update([item[0] for item in currents],
                                  initial_accuracy,
                                  len(currents) < pool_size)

                c_score = constrainer(new)
                if c_score:
                    have_score = True
                    new_score = (c_score, 0.0)
                else:
                    reply_fut = legion.coordinator().new_future()
                    worker_fut = worker_futs.pop(0)
                    legion.coordinator().deliver_future(
                        worker_fut, (new, reply_fut))
                    reply_futs.append((new, reply_fut))

            if not have_score:
                if not reply_futs or (not done and worker_futs):
                    continue
                new, reply_fut = reply_futs.pop(0)
                new_score, worker_fut = legion.coordinator().get_future(
                    reply_fut)
                new_score = (0.0, new_score)
                worker_futs.append(worker_fut)

            if new_score[0] == 0.0:
                n_real += 1

            l = sorted(item[1][1] for item in currents)
            if pool_size < len(l):
                c = l[pool_size]
            else:
                c = 1e30
            cutoff = (best_score[0], c)

            if new_score <= cutoff:
                currents = [item for item in currents if item[1] <= cutoff]
                currents.append((new, new_score))

                n_good += 1

                if new_score < best_score:
                    best_score = new_score
                    best = new

            if len(currents) >= pool_size and best_score[0] == 0.0:
                xspan = 0.0
                for i in xrange(len(start_x)):
                    xspan = max(
                        xspan,
                        max(item[0][i]
                            for item in currents) - min(item[0][i]
                                                        for item in currents))

                fspan = (max(item[1] for item in currents)[1] - best_score[1])

                if xspan < xtol or (n_good >= 5000 and fspan < ftol):
                    done = True
            i += 1

        grace.status('')
        print '%s %.5f\n' % (comment, best_score[1])

    finally:
        #pool.terminate()
        pass

    while worker_futs:
        fut = worker_futs.pop(0)
        legion.coordinator().deliver_future(fut, None)

    for item in workers:
        item()

    return best
Exemplo n.º 20
0
    def run(self):
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, "No reference sequences given"
        assert self.reads or self.pairs or self.interleaved, "No reads given"
        for pair in self.pairs:
            assert len(pair) == 2, "Two files required in each pair: section"

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        read_sets = []
        for item in self.reads:
            read_sets.append(([item], False))
        for item in self.pairs:
            read_sets.append((item, True))
        for item in self.interleaved:
            read_sets.append(([item], True))

        # Create working directory

        workspace = self.get_workspace()
        workspace.setup_reference(self.references)
        workspace.update_param(snp_cost=25)
        reference = workspace.get_reference()
        reference_filename = reference.reference_fasta_filename()

        cores = min(self.cores, legion.coordinator().get_cores())

        default_options = {
            "-E": None,
            "-T": None,
            "-N": str(cores),
            "-n": "2",
            "-w": "200%",
            "-p": "opp-in",
            "-I": "0,500",
            "-X": None,
        }

        if self.sam_unaligned:
            default_options["--sam-unaligned"] = None

        if self.half_paired:
            default_options["--half-paired"] = None
        else:
            default_options["--no-half-paired"] = None

        cutoff = "55%"  # Default changed in SHRiMP 2.0.2
        if "-h" in self.shrimp_options:
            cutoff = self.shrimp_options[self.shrimp_options.index("-h") + 1]

        # Run shrimp

        bam_filename = io.abspath(self.output_dir, "alignments.bam")
        bam_prefix = io.abspath(self.output_dir, "alignments")
        bam_sorted_prefix = io.abspath(self.output_dir, "alignments_sorted")

        temp_filename = io.abspath(self.output_dir, "temp.bam")

        log_filename = io.abspath(self.output_dir, "shrimp_log.txt")
        log_file = open(log_filename, "wb")

        sam_eater = sam.Bam_writer(temp_filename)

        sam_header_sent = [False]
        n_seen = [0]

        def eat(f):
            for line in f:
                if line.startswith("@"):
                    if sam_header_sent[0]:
                        continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status("%s alignments produced" % grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)
            sam_header_sent[0] = True

        def remove_pair_options(options):
            for flag in ["-p", "-I"]:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 2 :]
            for flag in ["--half-paired"]:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 1 :]
            return options

        for i, (filenames, is_paired) in enumerate(read_sets):
            options = self.shrimp_options[:]

            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) == 3 for filename in filenames  # A little ugly
            )
            if has_qualities:
                options.append("--fastq")

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ["-1", filenames[0], "-2", filenames[1]]

            if "--qv-offset" not in self.shrimp_options:
                guesses = []
                for filename in filenames:
                    guesses.append(io.guess_quality_offset(filename))
                assert (
                    len(set(guesses)) == 1
                ), "Conflicting quality offset guesses, please specify --qv-offset manually."
                default_options["--qv-offset"] = str(guesses[0])

            default_options["--read-group"] = "%s,%s" % (
                workspace.name.replace(",", "_"),
                workspace.name.replace(",", "_"),
            )
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status("")

            full_param = reference.shrimp_command(self.cs, options + reads_parameters)

            print >>sys.stderr, "Running", " ".join(full_param)

            with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
                eat(f)

        log_file.close()

        sam_eater.close()

        grace.status("Sort")

        io.execute(["samtools", "sort", "-n", temp_filename, bam_prefix])

        os.unlink(temp_filename)

        grace.status("")
Exemplo n.º 21
0
 def cores_required(self):
     """ Return the core count reported by the legion coordinator. """
     coordinator = legion.coordinator()
     return coordinator.get_cores()
Exemplo n.º 22
0
    def run(self):
        """ Align the configured reads against the reference using SHRiMP.

            Each read set (plain reads, pairs, interleaved files) is run
            through the command built by reference.shrimp_command(). The
            output lines of every run are written to a single temporary
            BAM file ("temp.bam" in the output directory), which is then
            name-sorted with sam.sort_bam() using the prefix "alignments"
            and deleted. The stderr of each SHRiMP run is sent to
            "shrimp_log.txt" in the output directory.

            Raises AssertionError if no references or no reads were given,
            or if a pair does not consist of exactly two files.
        """
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, 'No reference sequences given'
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
        for pair in self.pairs:
            assert len(pair) == 2, 'Two files required in each pair: section'

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        # Each entry is (list of filenames, is_paired).
        # Interleaved files are a single file that is treated as paired.
        read_sets = [ ]
        for item in self.reads:
            read_sets.append( ([item], False) )
        for item in self.pairs:
            read_sets.append( (item, True) )
        for item in self.interleaved:
            read_sets.append( ([item], True) )

        #Create working directory

        workspace = self.get_workspace()
        workspace.setup_reference(self.references)
        workspace.update_param(snp_cost=25)
        reference = workspace.get_reference()
        # NOTE(review): the result is not used below; the call is kept in
        # case it validates that the reference exists -- TODO confirm.
        reference_filename = reference.reference_fasta_filename()

        # Never ask for more cores than the coordinator reports.
        cores = min(self.cores, legion.coordinator().get_cores())

        # Flags appended to the command line unless the user already gave
        # them in self.shrimp_options. A value of None means the flag
        # takes no argument.
        default_options = {
            '-E' : None,
            '-T' : None,
            '-N' : str(cores),
            '-n':'2',
            '-w':'200%',
            '-p': 'opp-in',
            '-I': '0,500',
            '-X':None,
        }

        if self.sam_unaligned:
            default_options['--sam-unaligned'] = None

        if self.half_paired:
            default_options['--half-paired'] = None
        else:
            default_options['--no-half-paired'] = None

        #Run shrimp

        bam_prefix = io.abspath(self.output_dir, 'alignments')
        temp_filename = io.abspath(self.output_dir, 'temp.bam')
        log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')

        # One-element lists so the nested eat() can update them in place
        # (this is Python 2 code, there is no nonlocal statement).
        sam_header_sent = [False]
        n_seen = [0]

        def eat(f):
            """ Copy the lines of f to sam_eater.

                Lines starting with '@' are only copied for the first
                stream consumed, later streams have them dropped. All
                other lines are counted for the progress status.
            """
            for line in f:
                if line.startswith('@'):
                    if sam_header_sent[0]: continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)
            sam_header_sent[0] = True

        def remove_pair_options(options):
            """ Return options without the flags only used for paired reads. """
            # These flags are followed by one argument, drop both items.
            for flag in ['-p','-I']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos+2:]
            # This flag stands alone.
            for flag in ['--half-paired']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos+1:]
            return options

        # The with block guarantees the log file is closed even when a
        # SHRiMP run or the BAM writer raises.
        with open(log_filename, 'wb') as log_file:
            sam_eater = sam.Bam_writer(temp_filename)

            for filenames, is_paired in read_sets:
                options = self.shrimp_options[:]

                # NOTE(review): if a file yields no records, .next() raises
                # StopIteration inside this generator expression; on
                # Python 2 that silently ends all() early instead of
                # reporting the empty file -- confirm whether empty inputs
                # need explicit handling.
                has_qualities = all(
                    len( io.read_sequences(filename, qualities=True).next() ) == 3  #A little ugly
                    for filename in filenames
                )
                if has_qualities:
                    options.append( '--fastq' )

                if len(filenames) == 1:
                    reads_parameters = [ filenames[0] ]
                else:
                    reads_parameters = [ '-1', filenames[0], '-2', filenames[1] ]

                if '--qv-offset' not in self.shrimp_options:
                    default_options['--qv-offset'] = str( io.guess_quality_offset(*filenames) )

                default_options['--read-group'] = '%s,%s' % (
                    workspace.name.replace(',','_'),
                    workspace.name.replace(',','_')
                )
                # Options given by the user take precedence over defaults.
                for flag in default_options:
                    if flag not in options:
                        options.append(flag)
                        if default_options[flag] is not None:
                            options.append(default_options[flag])

                if not is_paired:
                    options = remove_pair_options(options)

                grace.status('')

                full_param = reference.shrimp_command(self.cs, options + reads_parameters)

                print >> sys.stderr, 'Running', ' '.join(full_param)

                with io.pipe_from(full_param,
                        stderr=log_file,
                        cores=cores) as f:
                    eat(f)

        sam_eater.close()

        grace.status('Sort')

        sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)

        os.unlink(temp_filename)

        grace.status('')