Exemplo n.º 1
0
def main(args):
    mincov, args = grace.get_option_value(args, '--mincov', int, 1) 
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) 
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')    
    is_core = (what == 'core') 

    grace.expect_no_further_options(args)
    
    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()
    
    output_dir, working_dirs = args[0], args[1:]
    
    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'
    
    if not path.exists(output_dir):
        os.mkdir(output_dir)
    
    for name, seq in io.read_sequences(path.join(working_dirs[0],'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)
        
        good = [ True ] * len(seq)
        
        for working_dir in working_dirs:
            if is_core:
               suffix = '-depth.userplot'
            else:
               suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name+suffix)
            )
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
               if good[i]:
                   if is_core:
                       good[i] = data[i] >= mincov
                   else:
                       good[i] = data[i] < mincov

        #Close holes
        start = -maxdiff-1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                 if 0 < i-start <= maxdiff:
                     for j in xrange(start,i): good[j] = True
                     n_holes += 1
                 start = i+1
        print 'Closed', grace.pretty_number(n_holes), 'holes'
        
        
        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else 'N')
                      for i in xrange(len(seq)) ])
        )
        f.close()

        f = open(path.join(output_dir, '%s-%s_masked.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else seq[i].lower())
                      for i in xrange(len(seq)) ])
        )
        f.close()

        f_good = open(path.join(output_dir, '%s-%s_parts.fa' % (friendly_name,what)), 'wb')
        f_nongood = open(path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name,what)), 'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]    
        def emit(i):
            if i-start < minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i-start
            io.write_fasta(
                f_good if good[start] else f_nongood,
                '%s:%d..%d' % (name, start+1,i),
                seq[start:i]
            )
        for i in xrange(1,len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()
        
        print grace.pretty_number(sum(good)), 'bases are '+what+', of', grace.pretty_number(len(seq)), 'in reference sequence'
        print grace.pretty_number(n_good[0]), 'parts at least', grace.pretty_number(minsize), 'bases long with', grace.pretty_number(n_good_bases[0]), 'total bases'

        print
Exemplo n.º 2
0
    def run(self):
        """Write the "core" or "unique" portion of each reference sequence.

        Reads ``self.what`` (must be ``'core'`` or ``'unique'``),
        ``self.working_dirs``, ``self.mincov``, ``self.maxdiff`` and
        ``self.minsize``; progress is reported through ``self.log.log`` and
        output files are created under ``self.get_workspace()``.

        NOTE(review): the command-line version this method replaced used
        defaults of mincov=1, maxdiff=16, minsize=200 and what='core';
        presumably these are now declared on the class -- confirm.

        A reference position is "good" when, in every working directory:

        * core mode:    the value in ``<name>-depth.userplot`` is
          >= ``self.mincov``;
        * unique mode:  the value in ``<name>-ambiguous-depth.userplot`` is
          < ``self.mincov``.

        Runs of at most ``self.maxdiff`` non-good positions that are
        followed by a good position (and are not at the very start of the
        sequence) are then re-marked as good ("closing holes").

        For every sequence in the reference of the first working directory
        four files are written to the workspace:

        * ``<name>-<what>.fa``           non-good bases replaced by ``N``
        * ``<name>-<what>_masked.fa``    non-good bases lower-cased
        * ``<name>-<what>_parts.fa``     good runs >= ``self.minsize`` long
        * ``<name>-non<what>_parts.fa``  non-good runs >= ``self.minsize``
        """
        assert self.what in ('core', 'unique'), \
            'Expected --what to be either "core" or "unique".'
        is_core = (self.what == 'core')

        workspace = self.get_workspace()

        # The depth file consulted depends only on the mode, so choose it once.
        if is_core:
            suffix = '-depth.userplot'
        else:
            suffix = '-ambiguous-depth.userplot'

        reference_filename = working_directory.Working(
            self.working_dirs[0]).get_reference().reference_fasta_filename()

        for name, seq in io.read_sequences(reference_filename):
            self.log.log(name + '\n')
            friendly_name = grace.filesystem_friendly_name(name)
            n_bases = len(seq)

            # good[i] stays True only while position i passes in every sample.
            good = [True] * n_bases

            for working_dir in self.working_dirs:
                data = trivia.read_unstranded_userplot(
                    os.path.join(working_dir, friendly_name + suffix))
                assert n_bases == len(data)
                for i in xrange(n_bases):
                    if good[i]:
                        if is_core:
                            good[i] = data[i] >= self.mincov
                        else:
                            good[i] = data[i] < self.mincov

            # Close holes.
            # `start` is the index just past the most recent good position,
            # i.e. where the current non-good run begins.  It starts at
            # -maxdiff-1 so a leading non-good run always looks longer than
            # maxdiff and is never closed.  A trailing non-good run is never
            # closed either, because no good position follows it.
            start = -self.maxdiff - 1
            n_holes = 0
            for i in xrange(n_bases):
                if good[i]:
                    if 0 < i - start <= self.maxdiff:
                        for j in xrange(start, i):
                            good[j] = True
                        n_holes += 1
                    start = i + 1
            self.log.log('Closed ' + grace.pretty_number(n_holes) + ' holes\n')

            # `with` guarantees the handles are closed even if writing fails.
            with open(workspace / ('%s-%s.fa' % (friendly_name, self.what)),
                      'wb') as f:
                io.write_fasta(
                    f, name, ''.join([(seq[i] if good[i] else 'N')
                                      for i in xrange(n_bases)]))

            with open(workspace / ('%s-%s_masked.fa' % (friendly_name, self.what)),
                      'wb') as f:
                io.write_fasta(
                    f, name, ''.join([(seq[i] if good[i] else seq[i].lower())
                                      for i in xrange(n_bases)]))

            # Split the sequence into maximal runs of equal good-ness and
            # write each run of at least `minsize` bases to the matching
            # parts file.
            n_good = 0
            n_good_bases = 0
            good_filename = workspace / (
                '%s-%s_parts.fa' % (friendly_name, self.what))
            nongood_filename = workspace / (
                '%s-non%s_parts.fa' % (friendly_name, self.what))
            with open(good_filename, 'wb') as f_good:
                with open(nongood_filename, 'wb') as f_nongood:
                    start = 0
                    for end in xrange(1, n_bases + 1):
                        # Keep extending the run until good-ness flips or
                        # the sequence ends.  An empty sequence yields no
                        # runs at all.
                        if end < n_bases and good[end] == good[start]:
                            continue
                        length = end - start
                        if length >= self.minsize:
                            if good[start]:
                                n_good += 1
                                n_good_bases += length
                                out = f_good
                            else:
                                out = f_nongood
                            # Record-name coordinates are 1-based, inclusive.
                            io.write_fasta(out,
                                           '%s:%d..%d' % (name, start + 1, end),
                                           seq[start:end])
                        start = end

            self.log.log(
                grace.pretty_number(sum(good)) + ' bases are ' + self.what +
                ', of ' + grace.pretty_number(n_bases) +
                ' in reference sequence\n')
            self.log.log(
                grace.pretty_number(n_good) + ' parts at least ' +
                grace.pretty_number(self.minsize) + ' bases long with ' +
                grace.pretty_number(n_good_bases) + ' total bases\n')
            self.log.log('\n')
Exemplo n.º 3
0
    def run(self):
        """Write the "core" or "unique" portion of each reference sequence.

        Reads ``self.what`` (must be ``'core'`` or ``'unique'``),
        ``self.working_dirs``, ``self.mincov``, ``self.maxdiff`` and
        ``self.minsize``; progress is reported through ``self.log.log`` and
        output files are created under ``self.get_workspace()``.

        NOTE(review): the command-line version this method replaced used
        defaults of mincov=1, maxdiff=16, minsize=200 and what='core';
        presumably these are now declared on the class -- confirm.

        A reference position is "good" when, in every working directory:

        * core mode:    the value in ``<name>-depth.userplot`` is
          >= ``self.mincov``;
        * unique mode:  the value in ``<name>-ambiguous-depth.userplot`` is
          < ``self.mincov``.

        Runs of at most ``self.maxdiff`` non-good positions that are
        followed by a good position (and are not at the very start of the
        sequence) are then re-marked as good ("closing holes").

        For every sequence in the reference of the first working directory
        four files are written to the workspace:

        * ``<name>-<what>.fa``           non-good bases replaced by ``N``
        * ``<name>-<what>_masked.fa``    non-good bases lower-cased
        * ``<name>-<what>_parts.fa``     good runs >= ``self.minsize`` long
        * ``<name>-non<what>_parts.fa``  non-good runs >= ``self.minsize``
        """
        assert self.what in ('core','unique'), 'Expected --what to be either "core" or "unique".'
        is_core = (self.what == 'core')

        workspace = self.get_workspace()

        # The depth file consulted depends only on the mode, so choose it once.
        if is_core:
            suffix = '-depth.userplot'
        else:
            suffix = '-ambiguous-depth.userplot'

        reference_filename = working_directory.Working(self.working_dirs[0]).get_reference().reference_fasta_filename()

        for name, seq in io.read_sequences(reference_filename):
            self.log.log(name + '\n')
            friendly_name = grace.filesystem_friendly_name(name)
            n_bases = len(seq)

            # good[i] stays True only while position i passes in every sample.
            good = [ True ] * n_bases

            for working_dir in self.working_dirs:
                data = trivia.read_unstranded_userplot(
                    os.path.join(working_dir, friendly_name+suffix)
                )
                assert n_bases == len(data)
                for i in xrange(n_bases):
                    if good[i]:
                        if is_core:
                            good[i] = data[i] >= self.mincov
                        else:
                            good[i] = data[i] < self.mincov

            # Close holes.
            # `start` is the index just past the most recent good position,
            # i.e. where the current non-good run begins.  It starts at
            # -maxdiff-1 so a leading non-good run always looks longer than
            # maxdiff and is never closed.  A trailing non-good run is never
            # closed either, because no good position follows it.
            start = -self.maxdiff-1
            n_holes = 0
            for i in xrange(n_bases):
                if good[i]:
                    if 0 < i-start <= self.maxdiff:
                        for j in xrange(start,i):
                            good[j] = True
                        n_holes += 1
                    start = i+1
            self.log.log('Closed '+grace.pretty_number(n_holes)+' holes\n')

            # `with` guarantees the handles are closed even if writing fails.
            with open( workspace/('%s-%s.fa' % (friendly_name,self.what)), 'wb') as f:
                io.write_fasta(f, name,
                    ''.join([ (seq[i] if good[i] else 'N')
                              for i in xrange(n_bases) ])
                )

            with open( workspace/('%s-%s_masked.fa' % (friendly_name,self.what)), 'wb') as f:
                io.write_fasta(f, name,
                    ''.join([ (seq[i] if good[i] else seq[i].lower())
                              for i in xrange(n_bases) ])
                )

            # Split the sequence into maximal runs of equal good-ness and
            # write each run of at least `minsize` bases to the matching
            # parts file.
            n_good = 0
            n_good_bases = 0
            good_filename = workspace/('%s-%s_parts.fa' % (friendly_name,self.what))
            nongood_filename = workspace/('%s-non%s_parts.fa' % (friendly_name,self.what))
            with open(good_filename, 'wb') as f_good:
                with open(nongood_filename, 'wb') as f_nongood:
                    start = 0
                    for end in xrange(1, n_bases+1):
                        # Keep extending the run until good-ness flips or
                        # the sequence ends.  An empty sequence yields no
                        # runs at all.
                        if end < n_bases and good[end] == good[start]:
                            continue
                        length = end-start
                        if length >= self.minsize:
                            if good[start]:
                                n_good += 1
                                n_good_bases += length
                                out = f_good
                            else:
                                out = f_nongood
                            # Record-name coordinates are 1-based, inclusive.
                            io.write_fasta(
                                out,
                                '%s:%d..%d' % (name, start+1,end),
                                seq[start:end]
                            )
                        start = end

            self.log.log(grace.pretty_number(sum(good))+' bases are '+self.what+', of '+grace.pretty_number(n_bases)+' in reference sequence\n')
            self.log.log(grace.pretty_number(n_good)+' parts at least '+grace.pretty_number(self.minsize)+' bases long with '+grace.pretty_number(n_good_bases)+' total bases\n')
            self.log.log('\n')
Exemplo n.º 4
0
def main(args):
    mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique,
                                        'core')
    is_core = (what == 'core')

    grace.expect_no_further_options(args)

    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()

    output_dir, working_dirs = args[0], args[1:]

    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'

    if not path.exists(output_dir):
        os.mkdir(output_dir)

    for name, seq in io.read_sequences(
            path.join(working_dirs[0], 'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)

        good = [True] * len(seq)

        for working_dir in working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix))
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= mincov
                    else:
                        good[i] = data[i] < mincov

        #Close holes
        start = -maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        print 'Closed', grace.pretty_number(n_holes), 'holes'

        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name, what)),
                 'wb')
        io.write_fasta(
            f, name,
            ''.join([(seq[i] if good[i] else 'N') for i in xrange(len(seq))]))
        f.close()

        f = open(
            path.join(output_dir, '%s-%s_masked.fa' % (friendly_name, what)),
            'wb')
        io.write_fasta(
            f, name, ''.join([(seq[i] if good[i] else seq[i].lower())
                              for i in xrange(len(seq))]))
        f.close()

        f_good = open(
            path.join(output_dir, '%s-%s_parts.fa' % (friendly_name, what)),
            'wb')
        f_nongood = open(
            path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name, what)),
            'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]

        def emit(i):
            if i - start < minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i - start
            io.write_fasta(f_good if good[start] else f_nongood,
                           '%s:%d..%d' % (name, start + 1, i), seq[start:i])

        for i in xrange(1, len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()

        print grace.pretty_number(
            sum(good)), 'bases are ' + what + ', of', grace.pretty_number(
                len(seq)), 'in reference sequence'
        print grace.pretty_number(
            n_good[0]), 'parts at least', grace.pretty_number(
                minsize), 'bases long with', grace.pretty_number(
                    n_good_bases[0]), 'total bases'

        print