Example #1
def read_sequences(filename, qualities=False, genbank_callback=None):
    """ Read fasta or illumina sequences, possibly compressed 
    
        Valid values for qualities: False,True,'required'
    
        Post reading filters can be applied.
    """
    assert qualities in (False, True, 'required')

    parts = filename.split('~~')

    info = get_file_info(parts[0])

    have_qualities = False

    if 'type-empty' in info:
        have_qualities = True
        result = read_empty(parts[0])
    elif 'type-fasta' in info:
        result = read_fasta(parts[0])
    elif 'type-genbank' in info:
        result = read_genbank_sequence(parts[0], genbank_callback)
    elif 'type-fastq' in info:
        have_qualities = True
        result = read_illumina_with_quality(parts[0])
    elif 'type-gff' in info:
        result = read_gff3_sequence(parts[0])
    elif 'type-sff' in info:
        grace.require_sff2fastq()
        have_qualities = True
        process = run(['sff2fastq', parts[0]])
        result = read_illumina_with_quality(process.stdout)
    else:
        raise grace.Error('Unrecognized file format for ' + filename)

    if qualities == 'required' and not have_qualities:
        raise grace.Error('Need base qualities in ' + filename)

    for part in parts[1:]:
        for prefix in FILTERS:
            if part.lower().startswith(prefix):
                result = FILTERS[prefix](result, part[len(prefix):])
                break
        else:
            raise grace.Error('Unrecognized filter: ' + part)

    if have_qualities and not qualities:
        result = filter_no_qualities(result)

    return result
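A usage sketch for the filter mechanism above: filters are appended to the filename with '~~'. The available prefixes live in the FILTERS dictionary, which is not shown here, so 'first:' below is a hypothetical key.

# Hypothetical usage: 'first:' stands in for whatever keys FILTERS defines.
reads = read_sequences('reads.fastq.gz~~first:1000', qualities=True)
total = 0
for name, seq, qual in reads:
    total += len(seq)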
Example #2
def reader(working_dirs, references, use_reference, annotations={}):
    for name, sequence in references:
        features = annotations.get(sequence, [])
    
        if use_reference:
            readers = [ reference_reader(sequence) ]
        else:
            readers = [ ]
        
        readers.extend( evidence_reader(working_dir, name) for working_dir in working_dirs )
        
        active_features = [ ]
        feature_pos = 0        
        
        for i in xrange(len(sequence)):
            if i % 10000 == 0:
                grace.status('%s %s' % (name, grace.pretty_number(i)))
            
            active_features = [ item for item in active_features if item.location.nofuzzy_end > i ]
            while feature_pos < len(features) and \
                  features[feature_pos].location.nofuzzy_start <= i:
                active_features.append(features[feature_pos])
                feature_pos += 1
        
            for is_insertion in (True, False):
                yield Calls(name, i, is_insertion, [ item.next() for item in readers ], active_features)
        
        for reader in readers:
            for item in reader:
                raise grace.Error('Unexpected extra data in evidence file')

    grace.status('')
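The active-feature bookkeeping above is a two-pointer sweep over features sorted by start position. A minimal standalone sketch of the same idea, with features reduced to (start, end) tuples:

def sweep_active(features, length):
    # features must be sorted by start position
    active = []
    pos = 0
    for i in xrange(length):
        active = [f for f in active if f[1] > i]   # retire features that have ended
        while pos < len(features) and features[pos][0] <= i:
            active.append(features[pos])           # admit features starting here
            pos += 1
        yield i, list(active)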
Example #3
    def run(self):
        sequences = [ ]
        annotations = [ ]
        for filename in self.filenames:
            any = False
            if io.is_sequence_file(filename):
                sequences.append(filename)
                any = True
            if annotation.is_annotation_file(filename):
                annotations.append(filename)
                any = True
            if not any:
                raise grace.Error(filename + ' is neither a sequence file nor an annotation file that nesoni can read.')

        if not sequences:
            assert not annotations, 'Annotations given without any reference sequences.'
            reference = Reference(self.output_dir, must_exist=True)
        else:
            reference = Reference(self.output_dir, must_exist=False)
            reference.set_sequences(sequences)
            reference.set_annotations(annotations)

        with legion.Stage() as stage:
            if self.genome:
                stage.process(reference.build_genome, self.genome_select)
            if config.apply_ifavailable_program(self.bowtie, 'bowtie2-build'):
                stage.process(reference.build_bowtie_index)
            if config.apply_ifavailable_program(self.ls, 'gmapper-ls'):
                stage.process(reference.build_shrimp_mmap, False)
            if config.apply_ifavailable_program(self.cs, 'gmapper-cs'):
                stage.process(reference.build_shrimp_mmap, True)
            if config.apply_ifavailable_jar(self.snpeff, 'snpEff.jar'):
                stage.process(reference.build_snpeff)
Example #4
def normalize(args):
    min_depth, args = grace.get_option_value(args, '--min-depth', int, 5)
    grace.expect_no_further_options(args)

    if len(args) < 2:
        print NORMALIZE_HELP
        raise grace.Help_shown()

    dirnames = args

    filenames = []
    for dirname in dirnames:
        assert os.path.isdir(dirname), dirname + ' is not a directory'

        filenames.append(
            sorted(
                item for item in os.listdir(dirname)
                #if item.endswith('.userplot') and not item.endswith('-norm.userplot')
                if item.endswith('-depth.userplot')
                and not item.endswith('-ambiguous-depth.userplot')
                and not item.endswith('-pairspan-depth.userplot')))

    for i in xrange(1, len(dirnames)):
        if filenames[i] != filenames[0]:
            raise grace.Error('Userplots in %s differ from those in %s' %
                              (dirnames[i], dirnames[0]))
    filenames = filenames[0]

    for filename in filenames:
        # filename[:-15] strips the 15-character '-depth.userplot' suffix
        normalize_files(dirnames, filename[:-15], min_depth)
Example #5
def open_possibly_compressed_file(filename, compression_type=None):
    """ Notionally, cast "filename" to a file-like object.
    
        If filename is already file-like, return it.
        If it's compressed, return a decompressing file-like object.
        If it's a BAM file, return a file-like object that produces SAM format.
        Otherwise, just return an open file!
    """
    if hasattr(filename, 'read'):
        return filename  #It's already file-like

    if compression_type is None:
        compression_type = get_compression_type(filename)

    if compression_type == 'none':
        return open(filename, 'rb')
    elif compression_type == 'gzip':
        return gzip.open(filename, 'rb')
    elif compression_type == 'bzip2':
        return bz2.BZ2File(filename, 'rb')
    elif compression_type == 'bam':
        from nesoni import sam
        return sam.open_bam(filename)
    else:
        raise grace.Error('Unknown compression type: ' + compression_type)
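Callers treat the result uniformly regardless of compression. A usage sketch with a placeholder filename:

f = open_possibly_compressed_file('reads.fastq.gz')
try:
    for line in f:
        pass  # plain text lines, whether the source was gzip, bzip2, BAM or uncompressed
finally:
    f.close()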
Example #6
def read_sequences(filename, qualities=False, genbank_callback=None):
    """ Read fasta or illumina sequences, possibly compressed 
    
        Post reading filters can be applied.
    """

    parts = filename.split('~~')

    f = open_possibly_compressed_file(parts[0])
    peek = f.read(8)
    f.close()

    have_qualities = False

    if not peek:
        result = read_empty(parts[0])
    elif peek.startswith('>'):
        result = read_fasta(parts[0])
    elif peek.startswith('LOCUS'):
        result = read_genbank_sequence(parts[0], genbank_callback)
    elif peek.startswith('@'):
        have_qualities = True
        result = read_illumina_with_quality(parts[0])
    elif peek.startswith('##gff'):
        result = read_gff3_sequence(parts[0])
    elif peek.startswith('.sff'):
        grace.require_sff2fastq()
        have_qualities = True
        process = run(['sff2fastq', parts[0]])
        result = read_illumina_with_quality(process.stdout)
    else:
        raise grace.Error('Unrecognized file format for ' + filename)

    for part in parts[1:]:
        for prefix in FILTERS:
            if part.lower().startswith(prefix):
                result = FILTERS[prefix](result, part[len(prefix):])
                break
        else:
            raise grace.Error('Unrecognized filter: ' + part)

    if have_qualities and not qualities:
        result = filter_no_qualities(result)

    return result
Example #7
def is_colorspace(filename):
    for name, seq in read_sequences(filename):
        tail = seq[1:].upper()
        for char in '0123.':
            if char in tail:
                return True
        for char in 'ACGTN':
            if char in tail:
                return False
    raise grace.Error('Couldn\'t determine if sequence file is colorspace: ' +
                      filename)
Example #8
def read_gff3_sequence(filename):
    f = open_possibly_compressed_file(filename)

    for line in f:
        if line.rstrip() == '##FASTA':
            break
    else:
        raise grace.Error(
            'Tried reading file as a GFF3 but it contains no ##FASTA section')

    return read_fasta(f)
Example #9
    def original_name(self):
        #Assuming it was Illumina
        if self.flag&FLAG_PAIRED:
            if self.flag&FLAG_FIRST:
                return self.qname+'/1'
            elif self.flag&FLAG_SECOND:
                return self.qname+'/2'
            else:
                raise grace.Error('Confused by SAM file')
        else:
            return self.qname
Example #10
def read_annotations(filename, joiner=None):
    f = io.open_possibly_compressed_file(filename)
    peek = f.read(1024)
    f.close()

    if peek.startswith('LOCUS'):
        return read_genbank(filename)
    elif peek.startswith('##gff') or peek.split('\n')[0].count('\t') in (7, 8):
        return read_gff(filename, joiner)
    else:
        raise grace.Error('Not an annotation file.')
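The 1024-byte peek distinguishes GenBank (starts with 'LOCUS') from GFF (starts with '##gff', or a first line containing 7 or 8 tabs). A usage sketch with a placeholder filename, relying on the feature objects carrying a type attribute as they do elsewhere in these examples:

counts = {}
for feature in read_annotations('annotations.gbk'):
    counts[feature.type] = counts.get(feature.type, 0) + 1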
Example #11
    def run(self):
        f = self.begin_output()

        for filename in self.filenames:
            info = io.get_file_info(filename)

            any = False

            name = os.path.splitext(os.path.split(filename)[1])[0]

            if info.matches('sequences'):
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                print >> f, grace.datum(name, 'total bases', total_length)
                if total:
                    print >> f, grace.datum(name, 'average length',
                                            float(total_length) / total)
                print >> f
                any = True

            if info.matches('annotations'):
                total = 0
                counts = {}
                for item in annotation.read_annotations(filename, "/"):
                    total += 1
                    counts[item.type] = counts.get(item.type, 0) + 1

                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features',
                                            counts[key])
                print >> f
                any = True

            if info.matches('type-vcf'):
                reader_f = io.open_possibly_compressed_file(filename)
                reader = vcf.Reader(reader_f)
                n = 0
                for item in reader:
                    n += 1
                print >> f, grace.datum(name, 'variants', n)
                any = True

            if not any:
                raise grace.Error('Don\'t know what to do with ' + filename)

        self.end_output(f)
Example #12
    def run(self):
        f = self.begin_output()

        for filename in self.filenames:
            any = False

            name = os.path.splitext(os.path.split(filename)[1])[0]

            try:
                iterator = io.read_sequences(filename, qualities=True)
            except grace.Error:
                iterator = None

            if iterator is not None:
                total = 0
                total_length = 0
                for seq in iterator:
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                if total:
                    print >> f, grace.datum(name, 'average length',
                                            float(total_length) / total)
                print >> f
                any = True

            try:
                iterator = annotation.read_annotations(filename)
            except grace.Error:
                iterator = None

            if iterator is not None:
                total = 0
                counts = {}
                for item in iterator:
                    total += 1
                    counts[item.type] = counts.get(item.type, 0) + 1

                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features',
                                            counts[key])
                print >> f
                any = True

            if not any:
                raise grace.Error(
                    filename +
                    ' is neither a sequence file nor an annotation file that nesoni can read.'
                )

        self.end_output(f)
Example #13
def evidence_reader(working_dir, name):
    filename = os.path.join(working_dir, grace.filesystem_friendly_name(name) + '-evidence.txt')
    f = open(filename,'rb')
    
    header = f.readline()
    if header.count('\t') != 7:
        raise grace.Error('Old style evidence file. Please re-run nesoni consensus.')
    
    for line in f:
        fields = line.rstrip('\n').split('\t')
        yield Call(fields[4], fields[1], fields[6])
        yield Call(fields[5], fields[2], fields[7])

    f.close()
Example #14
def classify_files(filenames, selectors):
    """ Put each of a set of files into one or more categories.    
    """
    results = [[] for item in selectors]
    for filename in filenames:
        info = get_file_info(filename)
        any = False
        for i, selector in enumerate(selectors):
            if selection.matches(selector, info):
                results[i].append(filename)
                any = True
        if not any:
            raise grace.Error('Don\'t know what to do with ' + filename)
    return results
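A sketch of calling it: the selector strings are tag expressions of the kind parsed in Example #16, and 'sequences'/'annotations' are assumed here to be tags produced by get_file_info.

sequence_files, annotation_files = classify_files(
    ['ref.fa', 'genes.gff'],
    ['sequences', 'annotations'])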
Example #15
def find_jar(jarname, extra_help=''):
    search = []
    if 'JARPATH' in os.environ:  # I just made this up
        search.extend(os.environ['JARPATH'].split(':'))
    if 'PATH' in os.environ:
        search.extend(os.environ['PATH'].split(os.pathsep))

    for dirname in search:
        filename = os.path.join(dirname, jarname)
        if os.path.isabs(dirname) and os.path.exists(filename):
            return filename
    raise grace.Error(
        'Couldn\'t find "%s". Directories listed in JARPATH and PATH were searched. %s'
        % (jarname, extra_help))
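A usage sketch; JARPATH is a colon-separated directory list as the function itself documents, and the paths below are hypothetical:

os.environ['JARPATH'] = '/opt/snpeff:/usr/local/share/java'
jar_path = find_jar('snpEff.jar', 'Install snpEff and add its directory to JARPATH.')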
Example #16
def matches(expression, tags):
    tokens = list('[]-:/^')

    def parse2(expression):
        assert expression, 'unexpected end of expression'
        if expression[0] == '[':
            value, expression = parse(expression[1:])
            assert expression.startswith(']'), 'expected a closing ]'
            return value, expression[1:]

        i = 0
        while i < len(expression) and expression[i] not in '[]:/^':
            i += 1
        assert i > 0, 'unexpected ' + expression[0]
        return expression[:i] == 'all' or expression[:i] in tags, expression[i:]

    def parse1(expression):
        assert expression, 'unexpected end of expression'
        if expression.startswith('-'):
            value, expression = parse2(expression[1:])
            return not value, expression
        else:
            value, expression = parse2(expression)
            return value, expression

    def parse(expression):
        value, expression = parse1(expression)
        while expression and expression[0] in ':/^':
            operator, expression = expression[0], expression[1:]
            value2, expression = parse1(expression)
            if operator == ':':
                value = value and value2
            elif operator == '/':
                value = value or value2
            else:
                value = (not value2 and value) or (not value and value2)
        return value, expression

    if expression == '':
        return False
    try:
        value, expression = parse(expression)
        assert not expression, 'don\'t know what to do with: ' + expression
    except AssertionError, e:
        raise grace.Error('Could not parse: ' + expression + ', ' + e.args[0])
    return value
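The grammar accepted here: a bare word tests tag membership ('all' is always true), '-' negates, ':' is AND, '/' is OR, '^' is XOR, and '[...]' groups; operators associate left to right. A few worked examples:

tags = ['type-fastq', 'compression-gzip']
matches('type-fastq:compression-gzip', tags)                 # True: both tags present
matches('type-fasta/type-fastq', tags)                       # True: either tag suffices
matches('-compression-none', tags)                           # True: tag is absent
matches('[type-fasta/type-fastq]:-compression-none', tags)   # True: grouping with [ ]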
Example #17
def run(args,
        stdin=None,
        stdout=PIPE,
        stderr=None,
        cwd=None,
        no_display=False,
        **kwargs):
    """ Start a process using subprocess.Popen    
        
        Set close_fds=True so process doesn't inherit any other pipes we might be using.
        
        stdin stdout and stderr may be:
          None                - inherit existing
          nesoni.io.PIPE      - create a pipe          
          a file or fd number - the file 
                                (be sure to flush() anything you've written to it first!)
        
        stderr may also be nesoni.io.STDOUT
    """
    args = _interpret_args(args, kwargs)

    if not no_display:
        env = None
    else:
        env = dict(os.environ)
        if 'DISPLAY' in env:
            del env['DISPLAY']

    try:
        return subprocess.Popen(
            args,
            bufsize=1 << 24,
            stdin=stdin,
            stdout=stdout,
            stderr=stderr,
            cwd=cwd,
            env=env,
            close_fds=True,
        )

    except OSError, err:
        raise grace.Error("Failed to run: %s" % (' '.join(args)))
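A sketch of streaming output from a child process; 'gzip -dc' stands in for any line-producing command:

process = run(['gzip', '-dc', 'reads.fastq.gz'])
for line in process.stdout:
    pass  # consume decompressed lines as they arrive
process.stdout.close()
assert process.wait() == 0, 'subprocess failed'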
Example #18
def _make_inner(action):
    timestamp = coordinator().time()
    assert timestamp > LOCAL.time, 'Time running in reverse.'
    
    cores = action.cores_required()
    if cores > 1:
        coordinator().trade_cores(1, cores)
    try:        
        config.write_colored_text(sys.stderr, '\n'+action.describe()+'\n')
        
        if LOCAL.abort_make:
            raise grace.Error('%s would be run. Stopping here.' % action.ident())
        
        grace.status(action.ident())
        try:
            _run_and_save_state(action, timestamp)
        finally:
            grace.status('')
    finally:
        if cores > 1:
            coordinator().trade_cores(cores, 1)
Example #19
def iter_reads(config, qualities=False):
    if 'stride' not in config:
        raise grace.Error(
            'Please re-run nesoni shrimp, output format has changed')

    stride = config['stride']
    for reads_filename_set in config['reads']:
        if config['solid']:
            reader = [
                io.read_solid(filename) for filename in reads_filename_set
            ]
        else:
            reader = [
                io.read_sequences(filename, qualities)
                for filename in reads_filename_set
            ]
        reader = itertools.izip(*reader)

        for i, items in enumerate(reader):
            if i % stride == 0:
                for item in items:
                    yield item
Example #20
def run_toolbox(action_classes, script_name=''):
    """
    Provide a command line interface for a list of Actions.
    
    Note:    
    strings included in the action_classes list will be printed 
    as help text, for example to display section headings.
    """
    commands = { }
    help = [ '\n' ]
    for item in action_classes:
        if isinstance(item, str):
            help.append(config.wrap(item, 70) + '\n\n')
            continue
        name = item.shell_name()
        commands[ name ] = item
        help.append('    %s\n' % config.colored(1,name+':'))
        help.append(config.wrap(item.help_short, 70, '        ') + '\n\n')

    args = sys.argv[1:]
    
    if not args:
        config.write_colored_text(sys.stdout, ''.join(help)+'\n\n')
        sys.exit(1)
        
    try:        
        command, args = args[0], args[1:]
        
        mangled_command = command.lower().rstrip(':')
        if mangled_command not in commands:
            raise grace.Error("Don't know how to "+command)        
    except:
        config.report_exception()
        sys.exit(1)

    config.shell_run(commands[mangled_command](), args, (script_name+' ' if script_name else '') + mangled_command+':')
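A sketch of a toolbox entry point; the Action classes are hypothetical stand-ins, and bare strings become section headings in the generated help text:

if __name__ == '__main__':
    run_toolbox([
        'Reference preparation:',
        Make_reference,     # hypothetical Action subclass
        'Alignment:',
        Align_bowtie,       # hypothetical Action subclass
    ], script_name='mytool')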
Example #21
    def run(self):
        sequences = []
        annotations = []
        for filename in self.filenames:
            any = False
            if io.is_sequence_file(filename):
                sequences.append(filename)
                any = True
            if annotation.is_annotation_file(filename):
                annotations.append(filename)
                any = True
            if not any:
                raise grace.Error(
                    filename +
                    ' is neither a sequence file nor an annotation file that nesoni can read.'
                )

        reference = Reference(self.output_dir, must_exist=False)
        reference.set_sequences(sequences)
        reference.set_annotations(annotations)
        if self.ls:
            reference.build_shrimp_mmap(False)
        if self.cs:
            reference.build_shrimp_mmap(True)
Example #22
    def run(self):
        bams = []
        reference = None
        reference2 = None

        extra = []

        for sample in self.samples:
            if sam.is_bam(sample):
                bams.append(sample)
            elif os.path.isdir(sample):
                working = working_directory.Working(sample, True)
                bams.append(working.get_filtered_sorted_bam())
                extra.append('##sampleTags=' + ','.join(working.get_tags()))
                if reference2 is None:
                    reference2 = working.get_reference().reference_fasta_filename()
            elif io.is_sequence_file(sample):
                assert reference is None, 'Only one reference FASTA file allowed.'
                reference = sample

        if reference is None:
            reference = reference2
        if reference is None:
            raise grace.Error('No reference FASTA file given.')

        with nesoni.Stage() as stage:
            tempspace = stage.enter(workspace.tempspace())
            if self.depth_limit:
                with nesoni.Stage() as stage2:
                    for i in xrange(len(bams)):
                        sam.Bam_depth_limit(
                            tempspace / ('%d' % i),
                            bams[i],
                            depth=self.depth_limit).process_make(stage2)
                        bams[i] = tempspace / ('%d.bam' % i)

            # FreeBayes claims to handle multiple bams, but it doesn't actually work
            if len(bams) > 1:
                sam.Bam_merge(tempspace / 'merged', bams=bams,
                              index=False).run()
                bams = [tempspace / 'merged.bam']

            command = [
                'freebayes',
                '-f',
                reference,
                '--ploidy',
                str(self.ploidy),
                '--pvar',
                str(self.pvar),
            ] + self.freebayes_options + bams

            self.log.log('Running: ' + ' '.join(command) + '\n')

            f_out = stage.enter(open(self.prefix + '.vcf', 'wb'))
            f_in = stage.enter(io.pipe_from(command))
            done_extra = False
            for line in f_in:
                if not done_extra and not line.startswith('##'):
                    for extra_line in extra:
                        f_out.write(extra_line + '\n')
                    done_extra = True
                f_out.write(line)

        index_vcf(self.prefix + '.vcf')
Example #23
    def run(self):
        assert self.reads or self.pairs or self.interleaved, 'No reads given'

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        working = self.get_workspace()
        working.setup_reference(self.references, bowtie=True)
        working.update_param(snp_cost=2.0)
        reference = working.get_reference()

        log_file = open(self.log_filename(), 'wb')

        with workspace.tempspace(dir=working.working_dir) as temp:
            n = [0]

            def tempname():
                n[0] += 1
                return temp / ('%d.fq' % n[0])

            def convert(filename):
                info = io.get_file_info(filename)
                ok = selection.matches(
                    'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                    info)
                if ok:
                    return filename
                result_name = tempname()
                with open(result_name, 'wb') as f:
                    for name, seq, qual in io.read_sequences(
                            filename, qualities='required'):
                        io.write_fastq(f, name, seq, qual)
                return result_name

            ones = []
            twos = []
            singles = []

            for pair in self.pairs:
                assert len(pair) == 2, 'Need two files in each "pair:" section.'
                ones.append(convert(pair[0]))
                twos.append(convert(pair[1]))

            for item in self.interleaved:
                left_name = tempname()
                right_name = tempname()
                ones.append(left_name)
                twos.append(right_name)
                with open(left_name,'wb') as left, \
                     open(right_name,'wb') as right:
                    reader = io.read_sequences(item, qualities='required')
                    while True:
                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            break
                        io.write_fastq(left, name, seq, qual)

                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            raise grace.Error(
                                'Interleaved file contains odd number of sequences'
                            )
                        io.write_fastq(right, name, seq, qual)

            for item in self.reads:
                singles.append(convert(item))

            cores = min(self.cores, legion.coordinator().get_cores())

            command = ([
                'bowtie2',
                '--threads',
                str(cores),
                '--rg-id',
                '1',
                '--rg',
                'SM:' + working.name,
            ] + self.bowtie_options +
                       ['-x', reference.get_bowtie_index_prefix()])
            commands = []
            if ones:
                commands.append(command +
                                ['-1', ','.join(ones), '-2', ','.join(twos)])
            if singles:
                commands.append(command + ['-U', ','.join(singles)])

            temp_bam_name = temp / 'temp.bam'

            with io.pipe_to(['samtools', 'view', '-S', '-b', '-'],
                            stdout=open(temp_bam_name, 'wb'),
                            stderr=log_file) as f:
                header_sent = False
                for command in commands:
                    self.log.log('Running:\n' + ' '.join(command) + '\n')
                    with io.pipe_from(command, stderr=log_file,
                                      cores=cores) as f_out:
                        for line in f_out:
                            if not header_sent or not line.startswith('@'):
                                f.write(line)
                    header_sent = True

            #io.execute([
            #    'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
            #    ])

            sam.sort_bam(temp_bam_name,
                         working / 'alignments',
                         by_name=True,
                         cores=self.cores)

        log_file.close()
Example #24
def run_toolbox(action_classes, script_name='', show_make_flags=True):
    """
    Provide a command line interface for a list of Actions.
    
    Note:    
    strings included in the action_classes list will be printed 
    as help text, for example to display section headings.
    """
    args = configure_making(sys.argv[1:])

    commands = {}

    for item in action_classes:
        if isinstance(item, str):
            continue
        name = item.shell_name()
        commands[name] = item

    if args == ['--help-make']:
        help = ['\n']
        help.append(
            '\nMake options:\n' +
            Make().describe('', show_help=True, escape_newlines=False) + '\n')

        config.write_colored_text(sys.stdout, ''.join(help) + '\n\n')
        sys.exit(1)

    if not args or args == ['-h'] or args == ['--help']:
        help = ['\n']

        for item in action_classes:
            if isinstance(item, str):
                help.append(config.wrap(item, 70) + '\n\n')
                continue
            name = item.shell_name()
            help.append('    %s\n' % config.colored(1, name + ':'))
            help.append(
                config.color_as_comment(
                    config.wrap(item.help_short, 70, '        ')) + '\n\n')

        if show_make_flags:
            #help.append('\nMake options:\n'+Make().describe('', show_help=True, escape_newlines=False)+'\n')
            help.append(
                '\nFor workflow make options type "%s --help-make".\n' %
                script_name)

        config.write_colored_text(sys.stdout, ''.join(help))
        sys.exit(1)

    try:
        command, args = args[0], args[1:]

        mangled_command = command.lower().rstrip(':')
        if mangled_command not in commands:
            raise grace.Error("Don't know how to " + command)
    except:
        config.report_exception()
        sys.exit(1)

    config.shell_run(commands[mangled_command](), args,
                     (script_name + ' ' if script_name else '') +
                     mangled_command + ':')
Example #25
    def run(self):
        log = self.log

        #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
        #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
        #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
        #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
        #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
        #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
        #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
        #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
        #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
        #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
        #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
        #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
        #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
        #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
        #grace.expect_no_further_options(args)

        prefix = self.prefix
        log_name = os.path.split(prefix)[1]

        quality_cutoff = self.quality
        qoffset = self.qoffset
        clip_ambiguous = self.clip_ambiguous
        length_cutoff = self.length
        adaptor_cutoff = self.match
        max_error = self.max_errors
        adaptor_set = self.adaptors
        disallow_homopolymers = self.homopolymers
        reverse_complement = self.revcom
        trim_start = self.trim_start
        trim_end = self.trim_end
        output_fasta = self.fasta
        use_gzip = self.gzip
        output_rejects = self.rejects

        iterators = []
        filenames = []
        any_paired = False

        for filename in self.reads:
            filenames.append(filename)
            iterators.append(
                itertools.izip(io.read_sequences(filename, qualities=True)))

        for pair_filenames in self.pairs:
            assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
            filenames.extend(pair_filenames)
            any_paired = True
            iterators.append(
                itertools.izip(
                    io.read_sequences(pair_filenames[0], qualities=True),
                    io.read_sequences(pair_filenames[1], qualities=True)))

        for filename in self.interleaved:
            filenames.extend(filename)
            any_paired = True
            iterators.append(
                deinterleave(io.read_sequences(filename, qualities=True)))

        fragment_reads = (2 if any_paired else 1)
        read_in_fragment_names = ['read-1', 'read-2'] if any_paired else ['read']

        assert iterators, 'Nothing to clip'

        if qoffset is None:
            guesses = [
                io.guess_quality_offset(filename) for filename in filenames
            ]
            assert len(set(guesses)) == 1, \
                'Conflicting quality offset guesses, please specify manually.'
            qoffset = guesses[0]
            log.log('FASTQ offset seems to be %d\n' % qoffset)

        quality_cutoff_char = chr(qoffset + quality_cutoff)

        #log.log('Minimum quality:        %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
        #log.log('Clip ambiguous bases:   %s\n' % (grace.describe_bool(clip_ambiguous)))
        #log.log('Minimum adaptor match:  %d bases, %d errors\n' % (adaptor_cutoff, max_error))
        #log.log('Minimum length:         %d bases\n' % length_cutoff)

        adaptor_seqs = []
        adaptor_names = []
        if adaptor_set and adaptor_set.lower() != 'none':
            for item in adaptor_set.split(','):
                item = item.strip().lower() + ' '
                any = False
                for line in ADAPTORS.strip().split('\n'):
                    if line.startswith('#'): continue
                    if not line.lower().startswith(item): continue
                    any = True
                    name, seq = line.rsplit(None, 1)
                    seq = seq.replace('U', 'T')

                    #if seq in adaptor_seqs: print 'Dup', name
                    adaptor_seqs.append(seq)
                    adaptor_names.append(name)
                    adaptor_seqs.append(bio.reverse_complement(seq))
                    adaptor_names.append(name)
                if not any:
                    raise grace.Error('Unknown adaptor set: ' + item)

        matcher = Matcher(adaptor_seqs, adaptor_names, max_error)

        start_clips = [
            collections.defaultdict(list) for i in xrange(fragment_reads)
        ]
        end_clips = [
            collections.defaultdict(list) for i in xrange(fragment_reads)
        ]

        if output_fasta:
            write_sequence = io.write_fasta_single_line
        else:
            write_sequence = io.write_fastq

        f_single = io.open_possibly_compressed_writer(
            self.reads_output_filenames()[0])
        if fragment_reads == 2:
            f_paired = io.open_possibly_compressed_writer(
                self.interleaved_output_filenames()[0])
        if output_rejects:
            f_reject = io.open_possibly_compressed_writer(
                self.rejects_output_filenames()[0])

        n_single = 0
        n_paired = 0

        n_in_single = 0
        n_in_paired = 0
        total_in_length = [0] * fragment_reads

        n_out = [0] * fragment_reads
        n_q_clipped = [0] * fragment_reads
        n_a_clipped = [0] * fragment_reads
        n_homopolymers = [0] * fragment_reads
        total_out_length = [0] * fragment_reads

        #log.attach(open(prefix + '_log.txt', 'wb'))

        for iterator in iterators:
            for fragment in iterator:
                if (n_in_single + n_in_paired) % 10000 == 0:
                    grace.status(
                        'Clipping fragment %s' %
                        grace.pretty_number(n_in_single + n_in_paired))

                if len(fragment) == 1:
                    n_in_single += 1
                else:
                    n_in_paired += 1

                graduates = []
                rejects = []
                for i, (name, seq, qual) in enumerate(fragment):
                    name = name.split()[0]
                    seq = seq.upper()
                    total_in_length[i] += len(seq)

                    start = trim_start
                    best_start = 0
                    best_len = 0
                    for j in xrange(len(seq) - trim_end):
                        if qual[j] < quality_cutoff_char or \
                           (clip_ambiguous and seq[j] not in 'ACGT'):
                            if best_len < j - start:
                                best_start = start
                                best_len = j - start
                            start = j + 1
                    j = len(seq) - trim_end
                    if best_len < j - start:
                        best_start = start
                        best_len = j - start

                    clipped_seq = seq[best_start:best_start + best_len]
                    clipped_qual = qual[best_start:best_start + best_len]
                    if len(clipped_seq) < length_cutoff:
                        n_q_clipped[i] += 1
                        rejects.append((name, seq, qual, 'quality'))
                        continue

                    match = matcher.match(clipped_seq)
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[match[0]:]
                        clipped_qual = clipped_qual[match[0]:]
                        start_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    match = matcher.match(bio.reverse_complement(clipped_seq))
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[:len(clipped_seq) - match[0]]
                        clipped_qual = clipped_qual[:len(clipped_qual) -
                                                    match[0]]
                        end_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                        n_homopolymers[i] += 1
                        rejects.append((name, seq, qual, 'homopolymer'))
                        continue

                    graduates.append((name, clipped_seq, clipped_qual))
                    n_out[i] += 1
                    total_out_length[i] += len(clipped_seq)

                if output_rejects:
                    for name, seq, qual, reason in rejects:
                        write_sequence(f_reject, name + ' ' + reason, seq,
                                       qual)

                if graduates:
                    if reverse_complement:
                        graduates = [(name, bio.reverse_complement(seq),
                                      qual[::-1])
                                     for name, seq, qual in graduates]

                    if len(graduates) == 1:
                        this_f = f_single
                        n_single += 1
                    else:
                        assert len(graduates) == 2
                        this_f = f_paired
                        n_paired += 1

                    for name, seq, qual in graduates:
                        write_sequence(this_f, name, seq, qual)

        grace.status('')

        if output_rejects:
            f_reject.close()
        if fragment_reads == 2:
            f_paired.close()
        f_single.close()

        def summarize_clips(name, location, clips):
            total = 0
            for i in clips:
                total += len(clips[i])
            log.datum(log_name, name + ' adaptors clipped at ' + location,
                      total)

            if not clips:
                return

            for i in xrange(min(clips), max(clips) + 1):
                item = clips[i]
                log.quietly_log('%3d bases: %10d ' % (i, len(item)))
                if item:
                    avg_errors = float(sum(item2[0]
                                           for item2 in item)) / len(item)
                    log.quietly_log(' avg errors: %5.2f  ' % avg_errors)

                    counts = collections.defaultdict(int)
                    for item2 in item:
                        counts[item2[1]] += 1
                    #print counts
                    for no in sorted(counts,
                                     key=lambda item2: counts[item2],
                                     reverse=True)[:2]:
                        log.quietly_log('%dx%s ' %
                                        (counts[no], matcher.names[no]))
                    if len(counts) > 2: log.quietly_log('...')

                log.quietly_log('\n')
            log.quietly_log('\n')

        if n_in_paired:
            log.datum(log_name, 'read-pairs', n_in_paired)
        if n_in_single:
            log.datum(log_name, 'single reads', n_in_single)

        for i in xrange(fragment_reads):
            prefix = read_in_fragment_names[i]

            if start_clips:
                summarize_clips(prefix, 'start', start_clips[i])

            if end_clips:
                summarize_clips(prefix, 'end', end_clips[i])

            log.datum(log_name, prefix + ' too short after quality clip',
                      n_q_clipped[i])
            log.datum(log_name, prefix + ' too short after adaptor clip',
                      n_a_clipped[i])
            if disallow_homopolymers:
                log.datum(log_name, prefix + ' homopolymers',
                          n_homopolymers[i])
            if fragment_reads > 1:
                log.datum(log_name, prefix + ' kept', n_out[i])
            log.datum(log_name, prefix + ' average input length',
                      float(total_in_length[i]) / (n_in_single + n_in_paired))
            if n_out[i]:
                log.datum(log_name, prefix + ' average output length',
                          float(total_out_length[i]) / n_out[i])

        if fragment_reads == 2:
            log.datum(log_name, 'pairs kept after clipping', n_paired)
        log.datum(log_name, 'reads kept after clipping', n_single)
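The quality-clipping scan above keeps the longest run of bases whose quality character is at or above the cutoff, optionally treating ambiguous bases as breaks. The same scan in isolation (a sketch, not this Action's API; ambiguity clipping is hard-wired on here):

def longest_good_run(seq, qual, cutoff_char, trim_start=0, trim_end=0):
    # Return (start, length) of the longest kept run, mirroring the loop above.
    start = trim_start
    best_start, best_len = 0, 0
    end = len(seq) - trim_end
    for j in xrange(end):
        if qual[j] < cutoff_char or seq[j] not in 'ACGT':
            if best_len < j - start:
                best_start, best_len = start, j - start
            start = j + 1
    if best_len < end - start:
        best_start, best_len = start, end - start
    return best_start, best_len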
Example #26
def nway_main(gbk_filename, use_indels, use_reference, give_evidence, give_consequences,
              require_all, require_bisect, full_output, format, working_dirs, split_a, split_b, f=sys.stdout):
    assert working_dirs, 'Need at least one working directory.'
    workspaces = [ working_directory.Working(dirname, must_exist=True) for dirname in working_dirs ]
    reference = workspaces[0].get_reference()
    #if not annotation_filename:
    #    annotation_filename = reference.annotations_filename() #May still be None
    
    if use_reference:
        names = ['reference']
        evidence_start = 1
    else:
        names = [ ]
        evidence_start = 0
        
    names.extend( norm_name(item) for item in  working_dirs )
    
    references = io.read_sequences(reference.reference_fasta_filename())
    
    annotations = { }
    if gbk_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(gbk_filename),'genbank'):
            sequence = record.seq.tostring()
            features = [ item for item in record.features if item.type != 'source' ]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features
    
    iterator = reader(working_dirs, references, use_reference, annotations)
    
    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == 'counts':
        iterator = itertools.ifilter(fully_unambiguous, iterator)
    
    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), 'Two samples with the same name'
        try:
            split_a = [ names.index(norm_name(item)) for item in split_a ]
            split_b = [ names.index(norm_name(item)) for item in split_b ]
        except ValueError:
            raise grace.Error('Sample to be split is not amongst samples given')
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    #if limit:
    #    iterator = itertools.islice(iterator, limit)
    
    if format == 'table':
        line = 'Reference\tPosition\tChange type'
        line +=  '\t' + '\t'.join(names)
        if give_evidence:
            line += '\t' + '\t'.join(names[evidence_start:])
        if give_consequences:
            line += '\t' + '\t'.join(names[evidence_start:])
        if annotations:
            line += '\tAnnotations'
        print >> f, line
        for calls in iterator:
            line = '%s\t%d\t%s\t%s' % (
                calls.ref_name, 
                calls.ref_pos+1, 
                change_type(calls), 
                '\t'.join(item.consensus for item in calls.calls))
            if give_evidence:
                line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += '\t' + describe_features(calls.features)
            print >> f, line

    elif format == 'compact':
        for line in transpose_strings(names):
            print >> f, line
        print >> f
        
        for calls in iterator:
            if calls.is_insertion:
                footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name)
            else: 
                footer = '%12d   %s' % (calls.ref_pos+1, calls.ref_name)
            
            t = transpose_strings([ item.consensus for item in calls.calls ], '-', 1)
            top = t[0] + ' ' + footer
            if give_consequences:
                consequences = [ ]
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(', '):
                            item = ' '.join(item.split()[:3])
                            if item not in consequences: consequences.append(item)
                        
                if consequences:
                    top += '  ' + ' / '.join(sorted(consequences))
            top += '  ' + describe_features(calls.features)
            print >> f, top
            for line in t[1:]:
                print >> f, line            
    
    elif format == 'nexus':
        buckets = [ [ ] for name in names ]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)
        
        print >> f, '#NEXUS'
        print >> f, 'begin taxa;'
        print >> f, 'dimensions ntax=%d;' % len(names)
        print >> f, 'taxlabels'
        for name in names:
            print >> f, name
        print >> f, ';'
        print >> f, 'end;'

        print >> f, 'begin characters;'
        print >> f, 'dimensions nchar=%d;' % len(buckets[0])
        print >> f, 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print >> f, 'matrix'
        for name, bucket in itertools.izip(names, buckets):
            print >> f, name, ''.join(bucket)
        print >> f, ';'
        print >> f, 'end;'
    
    elif format == 'counts':
        for line in transpose_strings(names):
            print >> f, line
        print >> f

        counts = { }
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1
        
        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print >> f, '%s   %d' % (transpose_strings(count_str)[0], counts[count_str])
    
    else:
        raise grace.Error('Unknown output format: ' + format)
Example #27
def main(args):
    genbank_filename, args = grace.get_option_value(args,'--gbk',str,None)
    use_indels, args = grace.get_option_value(args,'--indels',grace.as_bool,True)
    use_reference, args = grace.get_option_value(args,'--reference',grace.as_bool,True)
    give_evidence, args = grace.get_option_value(args,'--evidence',grace.as_bool,True)
    give_consequences, args = grace.get_option_value(args,'--consequences',grace.as_bool,True)
    require_all, args = grace.get_option_value(args,'--require-all',grace.as_bool,False)
    require_bisect, args = grace.get_option_value(args,'--require-bisect',grace.as_bool,False)
    full_output, args = grace.get_option_value(args,'--full',grace.as_bool,False)
    format, args = grace.get_option_value(args,'--as',str,'table')
    
    # Secret option!
    limit, args = grace.get_option_value(args,'--limit',int,None)
    
    grace.expect_no_further_options(args)

    if len(args) < 1:
        sys.stderr.write(USAGE)
        return 1

    working_dirs = [ ]
    split_a = [ ]
    split_b = [ ]
    def default(args):
        working_dirs.extend(args)
    def splitting(args):
        split_a.extend(args)
    def splitting_from(args):
        split_b.extend(args)
        
    grace.execute(args, {
        'splitting' : splitting,
        'from' : splitting_from 
    }, default
    )
    
    if use_reference:
        names = ['reference']
        evidence_start = 1
    else:
        names = [ ]
        evidence_start = 0
        
    names.extend( norm_name(item) for item in  working_dirs )
        
    references = io.read_sequences(os.path.join(working_dirs[0], 'reference.fa'))
    
    annotations = { }
    if genbank_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(genbank_filename),'genbank'):
            sequence = record.seq.tostring()
            features = [ item for item in record.features if item.type != 'source' ]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features
    
    iterator = reader(working_dirs, references, use_reference, annotations)
    
    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == 'counts':
        iterator = itertools.ifilter(fully_unambiguous, iterator)
    
    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), 'Two samples with the same name'
        try:
            split_a = [ names.index(norm_name(item)) for item in split_a ]
            split_b = [ names.index(norm_name(item)) for item in split_b ]
        except ValueError:
            raise grace.Error('Sample to be split is not amongst samples given')
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    if limit:
        iterator = itertools.islice(iterator, limit)
    
    if format == 'table':
        line = 'Reference\tPosition\tChange type'
        line +=  '\t' + '\t'.join(names)
        if give_evidence:
            line += '\t' + '\t'.join(names[evidence_start:])
        if give_consequences:
            line += '\t' + '\t'.join(names[evidence_start:])
        if annotations:
            line += '\tAnnotations'
        print line
        for calls in iterator:
            line = '%s\t%d\t%s\t%s' % (
                calls.ref_name, 
                calls.ref_pos+1, 
                change_type(calls), 
                '\t'.join(item.consensus for item in calls.calls))
            if give_evidence:
                line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += '\t' + describe_features(calls.features)
            print line

    elif format == 'compact':
        for line in transpose_strings(names):
            print line
        print
        
        for calls in iterator:
            if calls.is_insertion:
                footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name)
            else: 
                footer = '%12d   %s' % (calls.ref_pos+1, calls.ref_name)
            
            t = transpose_strings([ item.consensus for item in calls.calls ], '-', 1)
            top = t[0] + ' ' + footer
            if give_consequences:
                consequences = [ ]
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(', '):
                            item = ' '.join(item.split()[:3])
                            if item not in consequences: consequences.append(item)
                        
                if consequences:
                    top += '  ' + ' / '.join(sorted(consequences))
            top += '  ' + describe_features(calls.features)
            print top
            for line in t[1:]:
                print line            
    
    elif format == 'nexus':
        buckets = [ [ ] for name in names ]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)
        
        print '#NEXUS'
        print 'begin taxa;'
        print 'dimensions ntax=%d;' % len(names)
        print 'taxlabels'
        for name in names:
            print name
        print ';'
        print 'end;'

        print 'begin characters;'
        print 'dimensions nchar=%d;' % len(buckets[0])
        print 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print 'matrix'
        for name, bucket in itertools.izip(names, buckets):
            print name, ''.join(bucket)
        print ';'
        print 'end;'
    
    elif format == 'counts':
        for line in transpose_strings(names):
            print line
        print

        counts = { }
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1
        
        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print '%s   %d' % (transpose_strings(count_str)[0], counts[count_str])
    
    else:
        raise grace.Error('Unknown output format: ' + format)
Example #28
def main(args):
    default_transl_table, args = grace.get_option_value(
        args, '--transl_table', int, 11)
    use_coverage, args = grace.get_flag(args, '--use-coverage')
    coverage_cutoff, args = grace.get_option_value(args, '--coverage-cutoff',
                                                   float, 0.1)
    tabular, args = grace.get_flag(args, '--tabular')
    noheader, args = grace.get_flag(args, '--noheader')
    verbose, args = grace.get_flag(args, '--verbose')
    bandwidth, args = grace.get_option_value(args, '--band', int, 20)
    grace.expect_no_further_options(args)

    if len(args) != 2:
        print USAGE
        return 1

    genbank_filename = args[0]
    alignment_filename = args[1]

    if os.path.isdir(alignment_filename):
        alignment_filename = os.path.join(alignment_filename, 'alignment.maf')

    working_dir = os.path.split(alignment_filename)[0]

    alignments = load_alignments(alignment_filename)

    summaries = []
    details = []

    if not noheader:
        fields = 'Sequence\tLocus tag\tOld length (aa)\tNew length (aa)\tAmino acid changes\t'
        if use_coverage:
            fields += 'Unambiguous coverage vs expected\t\tAmbiguous coverage vs expected\t\tAmbiguous percent with any hits\t'
        fields += 'Gene\tProduct'
        if tabular: fields += '\tChanges of note'
        print fields

    for record in SeqIO.parse(
            io.open_possibly_compressed_file(genbank_filename), 'genbank'):
        sequence = record.seq.tostring()

        for name, seq1, seq2, alignment in alignments:
            if seq1 == sequence: break
        else:
            raise grace.Error(
                'Genbank record %s sequence not identical to any reference sequence'
                % record.id)

        if use_coverage:
            depth = get_graph(working_dir, name, 'depth')
            ambiguous_depth = get_graph(working_dir, name, 'ambiguous-depth')
            median_depth = numpy.median(depth)
            median_ambiguous_depth = numpy.median(ambiguous_depth)
            ambiguous_factor = float(median_ambiguous_depth) / median_depth
            depth_expect = expected_depth(name, sequence, depth,
                                          ambiguous_depth)

        for feature in record.features:
            if feature.type != 'CDS': continue

            if 'locus_tag' not in feature.qualifiers:
                locus_tag = '%d..%d' % (feature.location.nofuzzy_start + 1,
                                        feature.location.nofuzzy_end)
            else:
                locus_tag = feature.qualifiers['locus_tag'][0]

            if 'transl_table' in feature.qualifiers:
                transl_table_no = int(feature.qualifiers['transl_table'][0])
            else:
                assert default_transl_table is not None, 'No /transl_table for CDS, and default transl_table not given'
                transl_table_no = default_transl_table

            transl_table = CodonTable.ambiguous_dna_by_id[transl_table_no]
            start_codons = transl_table.start_codons
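            # For the default bacterial/archaeal table 11, start_codons
            # includes ATG, GTG and TTG among others; the check against
            # dna[:3] below relies on this list.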

            try:
                feature_alignment = alignment_from_feature(sequence, feature)
            except Weird_alignment:
                warn('%s has a location I could not handle, skipping, sorry' %
                     locus_tag)
                continue

            dna = []
            new_dna = []
            shifts = []
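            # Walk the feature base-by-base: map each feature coordinate back
            # to the reference genome, then across the genome alignment to the
            # new genome; record any length differences (indels) in shifts.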
            for i in xrange(feature_alignment.end2):
                p1 = feature_alignment.back_project(i, left=False)
                p2 = feature_alignment.back_project(i + 1, left=True)
                assert abs(p2 - p1) < 2
                dna.append(sequence_slice(sequence, p1, p2))

                p1a = alignment.project(p1, left=False)
                p2a = alignment.project(p2, left=False)  # Note: right edge also projected with left=False; may be off by a gap

                diff = (p2 - p1) - (p2a - p1a)
                #if diff:
                #    if diff%3:
                #        frame_shift = True
                #    else:
                #        frame_preserving_shift = True
                new_dna.append(sequence_slice(seq2, p1a, p2a))

                if diff:
                    shifts.append((i, dna[-1], new_dna[-1]))

            dna = ''.join(dna)
            new_dna = ''.join(new_dna)

            # This usually indicates a CDS truncated at the start,
            # in which case it will probably fail some way or other down the line.
            if 'codon_start' in feature.qualifiers:
                codon_start = int(feature.qualifiers['codon_start'][0]) - 1
            else:
                codon_start = 0
            dna = dna[codon_start:]
            new_dna = new_dna[codon_start:]

            if len(dna) % 3 != 0:
                warn(locus_tag + ' length not a multiple of 3')
            #assert len(new_dna) % 3 == 0

            protein = Seq.Seq(dna).translate(table=transl_table_no).tostring()
            # The start codon is always translated to M (see http://en.wikipedia.org/wiki/Start_codon)
            protein = 'M' + protein[1:]

            if dna[:3] not in start_codons:
                warn(locus_tag + ' has unknown start codon: ' + dna[:3])

            original_lacks_stop_codon = not protein.endswith('*')
            if original_lacks_stop_codon:
                warn(locus_tag + ' lacks end codon')
            original_stops_before_end = '*' in protein[:-1]
            if original_stops_before_end:
                warn(locus_tag + ' contains stop codon before end')

            if 'translation' in feature.qualifiers:
                expect = feature.qualifiers['translation'][0]
                if protein[:-1] != expect:
                    warn(
                        locus_tag +
                        ' translation given in feature does not match translation from DNA'
                    )

            new_protein = Seq.Seq(new_dna).translate(
                table=transl_table_no).tostring()
            new_protein = 'M' + new_protein[1:]

            # If the stop codon changed, find the new end.
            # Don't bother if there are unknown amino acids or
            # the original protein lacks a stop codon.
            if 'X' not in new_protein and '*' not in new_protein and not original_lacks_stop_codon:
                #This is very inefficient
                i = feature_alignment.end2
                while True:
                    p1 = feature_alignment.back_project(i, left=False)
                    p2 = feature_alignment.back_project(i + 1, left=True)
                    p1a = alignment.project(p1, left=False)
                    p2a = alignment.project(p2, left=False)  # right edge again projected with left=False, as above
                    if p1a < 0 or p2a < 0 or p1a > len(seq2) or p2a > len(
                            seq2):
                        break

                    new_dna += sequence_slice(seq2, p1a, p2a)
                    new_protein = Seq.Seq(new_dna).translate(
                        table=transl_table_no).tostring()
                    new_protein = 'M' + new_protein[1:]
                    if 'X' in new_protein or '*' in new_protein: break

                    i += 1

            # Truncate at the first stop codon if the protein ended early.
            # Don't bother if the original protein already had internal stop codons.
            if '*' in new_protein and not original_stops_before_end:
                new_protein = new_protein[:new_protein.index('*') + 1]

            # If indels occurred, do an alignment
            # Don't bother otherwise
            if shifts:
                # Penalize gaps with cost 2 (vs 1 for a mismatch).
                # If lengths don't match, pad the shorter with spaces: spaces
                # match nothing in the longer sequence, and the aligner
                # prefers a mismatch to opening a gap.

                #result = pairwise2.align.globalxs(protein      + ' '*max(0,len(new_protein)-len(protein)),
                #                                  new_protein  + ' '*max(0,len(protein)-len(new_protein)),
                #                                  -2.001,-2.000)[0]
                # 2.001 : very slightly prefer contiguous gaps. Also much faster!

                result = band_limited_align(
                    protein + ' ' * max(0,
                                        len(new_protein) - len(protein)),
                    new_protein + ' ' * max(0,
                                            len(protein) - len(new_protein)),
                    bandwidth)

                protein_ali = result[0]
                new_protein_ali = result[1]
            else:
                protein_ali = protein
                new_protein_ali = new_protein

            diffs = []
            j = 0
            k = 0
            for i in xrange(min(len(new_protein_ali), len(protein_ali))):
                if protein_ali[i] != ' ' and new_protein_ali[i] != ' ' and (
                        protein_ali[i] == '-' or new_protein_ali[i] == '-'
                        or not bio.might_be_same_amino(protein_ali[i],
                                                       new_protein_ali[i])):
                    diffs.append((i, j, k))
                if protein_ali[i] != '-':
                    j += 1
                if new_protein_ali[i] != '-':
                    k += 1

            diff_start = not bio.might_be_same_base(new_dna[0],dna[0]) or \
                         not bio.might_be_same_base(new_dna[1],dna[1]) or \
                         not bio.might_be_same_base(new_dna[2],dna[2])

            interesting_coverage = False
            if use_coverage:
                cds_depth = depth[feature_alignment.start1:
                                  feature_alignment.end1]  #/ median_depth
                if not feature_alignment.forward1: cds_depth = cds_depth[::-1]
                cds_ambiguous_depth = ambiguous_depth[
                    feature_alignment.start1:
                    feature_alignment.end1]  #/ median_ambiguous_depth
                if not feature_alignment.forward1:
                    cds_ambiguous_depth = cds_ambiguous_depth[::-1]

                cds_depth_expect = depth_expect[feature_alignment.
                                                start1:feature_alignment.end1]
                if not feature_alignment.forward1:
                    cds_depth_expect = cds_depth_expect[::-1]

                #cds_average_depth_ratio = numpy.average(depth[feature_alignment.start1:feature_alignment.end1]) / median_depth
                #cds_average_ambiguous_depth_ratio = numpy.average(ambiguous_depth[feature_alignment.start1:feature_alignment.end1]) / median_ambiguous_depth
                #line += '%.1f\t' % cds_average_depth_ratio
                #line += '%.1f\t' % cds_average_ambiguous_depth_ratio

                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_depth)/median_depth, numpy.maximum.reduce(cds_depth)/median_depth)
                #line += '%.1f+/-%.1f\t' % (numpy.average(cds_depth)/median_depth, numpy.var(cds_depth)**0.5/median_depth)
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_ambiguous_depth)/median_ambiguous_depth, numpy.maximum.reduce(cds_ambiguous_depth)/median_ambiguous_depth)

                avg_expect = numpy.average(cds_depth_expect)
                if avg_expect > 0.0:
                    cds_avg_depth = numpy.average(cds_depth) / avg_expect
                    cds_avg_ambiguous_depth = numpy.average(
                        cds_ambiguous_depth) / avg_expect / ambiguous_factor

                strange = ((cds_depth >= cds_depth_expect * 1.5) |
                           (cds_ambiguous_depth <= cds_depth_expect *
                            (0.5 * ambiguous_factor)))
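                # ("strange" marks positions whose unique-mapped depth is well
                # above expectation, or whose ambiguous depth is well below
                # expectation scaled by the ambiguous/unique depth ratio.)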

                interesting_coverage = numpy.average(
                    strange) >= coverage_cutoff

            if interesting_coverage or diffs or diff_start or shifts or len(
                    new_protein) != len(protein):
                line = name + '\t' + locus_tag + '\t' + \
                      '%d\t' % (len(protein)-1) + \
                      '%d\t' % (len(new_protein)-1) + \
                      '%d\t' % len(diffs)

                if use_coverage:
                    if avg_expect <= 0.0:
                        line += '\t\t\t'
                    else:
                        line += '%.1f\t' % (cds_avg_depth) + graphlet(
                            cds_depth, cds_depth_expect) + '\t'
                        line += '%.1f\t' % (
                            cds_avg_ambiguous_depth) + graphlet(
                                cds_ambiguous_depth,
                                cds_depth_expect * ambiguous_factor) + '\t'
                        line += '%.1f%%\t' % (
                            numpy.average(cds_ambiguous_depth > 0.0) * 100.0)

                line += '%s\t' % feature.qualifiers.get('gene',[''])[0] + \
                        '%s' % feature.qualifiers.get('product',[''])[0]

                notes = []

                if use_coverage and 'X' in new_protein:
                    xs = new_protein.count('X')
                    if xs == len(new_protein) - 1:  # First residue is the forced M, hence len-1
                        notes.append('\ No consensus')
                    else:
                        notes.append('\ No consensus for %d aa' % xs)

                if len(new_protein) < len(protein):
                    notes.append('\ Shorter by %d aa' %
                                 (len(protein) - len(new_protein)))

                if len(new_protein) > len(protein):
                    notes.append('\ Longer by %d aa' %
                                 (len(new_protein) - len(protein)))

                if diff_start:
                    notes.append('\ Start changed: %s -> %s' %
                                 (dna[:3], new_dna[:3]))
                    if new_dna[:3] not in start_codons:
                        notes.append('  No longer a start codon!')

                if shifts:
                    notes.append('\ Indels:')

                    for pos, old, new in shifts:
                        notes.append('    base %5d / codon %5d   %s -> %s' %
                                     (pos + 1,
                                      (pos // 3) + 1, old, new or '-'))

                if diffs:
                    if verbose:
                        notes.append('\ Amino acid changes:')
                        for i, j, k in diffs:
                            notes.append(
                                '    codon %5d   %s->%s   (%s->%s)' %
                                (j + 1, protein_ali[i], new_protein_ali[i],
                                 dna[j * 3:j * 3 + 3] if protein_ali[i] != '-'
                                 else '-', new_dna[k * 3:k * 3 + 3]
                                 if new_protein_ali[i] != '-' else '-'))

                #if len(new_protein) > len(protein):
                #    print 'New protein is longer:', new_protein[len(protein):]
                #if len(new_protein) < len(protein):
                #    print 'New protein is shorter:', protein[len(new_protein):]
                #print protein
                #print new_protein

                if tabular:
                    print line + '\t' + ' '.join(
                        [' '.join(note.strip().split()) for note in notes])
                else:
                    print line
                    for note in notes:
                        print '\t' + note
    return 0
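The translation step above leans on Biopython's table-aware translate(); a
minimal standalone sketch (hypothetical sequence, Biopython assumed
installed) of why the code forces 'M' at position one:

    from Bio.Seq import Seq

    # GTG is a valid table-11 start codon but translates to V mid-gene.
    dna = 'GTGAAATAA'
    protein = str(Seq(dna).translate(table=11))   # -> 'VK*'
    protein = 'M' + protein[1:]                   # -> 'MK*'
    print protein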
Example #29
def get_file_info(filename):
    info = selection.Matchable_set()
    info.add('compression-' + get_compression_type(filename))

    if os.path.isdir(filename):
        any = False
        if os.path.exists(join(filename, 'alignments.bam')):
            info.add('type-working')
            any = True
        if os.path.exists(join(filename, 'reference.fa')):
            info.add('type-reference')
            any = True
        if not any:
            raise grace.Error('Unrecognized directory type ' + filename)

    else:
        f = open_possibly_compressed_file(filename)
        peek = f.read(1024)
        f.close()

        if 'compression-bam' in info or peek.startswith('@HD\t'):
            #TODO: sam file might be headerless
            info.add('type-sam')

        elif not peek:
            info.add('type-empty')
            # It's a valid sequence file
            info.add('sequences')
            info.add('qualities')

        elif peek.startswith('>'):
            info.add('type-fasta')
            info.add('sequences')
        elif peek.startswith('LOCUS'):
            info.add('type-genbank')
            info.add('sequences')
        elif peek.startswith('@'):
            info.add('type-fastq')
            info.add('sequences')
            info.add('qualities')
        elif peek.startswith('##gff'):
            info.add('type-gff')
            info.add('sequences')
            info.add('annotations')
        elif peek.startswith('.sff'):
            info.add('type-sff')
            info.add('sequences')
            info.add('qualities')
        elif peek.startswith('##fileformat=VCF'):
            info.add('type-vcf')

        # Possibly unreliable: guess GFF from the tab count of the first line
        elif peek.split('\n')[0].count('\t') in (7, 8):
            info.add('type-gff')
            info.add('sequences')
            info.add('annotations')

        else:
            raise grace.Error('Unrecognized file format for ' + filename)

    return info
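A quick usage sketch for get_file_info (hypothetical filename; assumes
nesoni's io module, where this helper appears to live, is importable):

    # Write a tiny FASTA file, then sniff it.
    with open('demo.fa', 'wb') as f:
        f.write('>seq1\nACGTACGT\n')

    info = get_file_info('demo.fa')
    if 'type-fasta' in info and 'sequences' in info:
        print 'demo.fa looks like a FASTA sequence file'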
Example #30
    def run(self):
        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)

        tags = {}
        for item in reader.metadata.get('sampleTags', []):
            parts = item.split(',')
            tags[parts[0]] = parts

        assert 'reference' not in reader.samples, 'Can\'t have a sample called reference, sorry.'

        samples = ['reference'] + reader.samples

        for sample in samples:
            if sample not in tags:
                tags[sample] = [sample, 'all']

        samples = selection.select_and_sort(self.select, self.sort, samples,
                                            lambda sample: tags[sample])

        required = [
            i for i, sample in enumerate(samples)
            if selection.matches(self.require, tags[sample])
        ]

        sample_number = dict((b, a) for a, b in enumerate(reader.samples))

        items = []
        for record in reader:
            variants = get_variants(record)
            genotypes = []
            counts = []
            qualities = []
            for sample in samples:
                if sample == 'reference':
                    genotypes.append([0])
                    counts.append([1])
                    qualities.append(float('inf'))
                else:
                    genotypes.append(
                        get_genotype(record.samples[sample_number[sample]]))
                    counts.append(
                        get_variant_counts(
                            record.samples[sample_number[sample]]))
                    qualities.append(
                        record.samples[sample_number[sample]].data.GQ)

            # Only output when at least two called (non-None) genotypes differ
            any_interesting = False
            for i in xrange(len(genotypes)):
                for j in xrange(i):
                    if (genotypes[i] is not None and genotypes[j] is not None
                            and
                            not genotypes_equal(genotypes[i], genotypes[j])):
                        any_interesting = True
                        break
                if any_interesting: break
            if not any_interesting:
                continue

            if any(genotypes[i] is None for i in required):
                continue

            if self.only_snps and any(genotype is not None and any(
                    len(variants[i]) != 1 for i in genotype)
                                      for genotype in genotypes):
                continue

            snpeff = snpeff_describe(record.INFO.get('EFF', ''))
            if not any(
                    selection.matches(self.snpeff_filter, item[1])
                    for item in (snpeff or [('', [])])):
                continue

            items.append(
                _Nway_record(variants=variants,
                             genotypes=genotypes,
                             counts=counts,
                             qualities=qualities,
                             snpeff=snpeff,
                             record=record))

        self.log.log('%d variants\n\n' % len(items))

        if self.as_ == 'table':
            self._write_table(samples, items)
        elif self.as_ == 'nexus':
            self._write_nexus(samples, items)
        elif self.as_ == 'splitstree':
            self._write_nexus(samples, items)

            io.execute(
                'SplitsTree +g -i INPUT -x COMMAND',
                no_display=True,
                INPUT=self.prefix + '.nex',
                COMMAND='UPDATE; '
                'SAVE FILE=\'%s.nex\' REPLACE=yes; '
                'EXPORTGRAPHICS format=svg file=\'%s.svg\' REPLACE=yes TITLE=\'NeighborNet from %d variants\'; '
                'QUIT' % (self.prefix, self.prefix, len(items)),
            )
        elif self.as_ == 'vcf':
            self._write_vcf(samples, items, reader)

        else:
            raise grace.Error('Unknown output format: ' + self.as_)
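The nested i/j scan above asks whether any two called genotypes disagree; an
equivalent, more compact formulation with itertools.combinations (a sketch
reusing the genotypes_equal helper from this code):

    import itertools

    def any_genotypes_differ(genotypes):
        # True if some pair of non-None genotypes is unequal.
        return any(
            not genotypes_equal(a, b)
            for a, b in itertools.combinations(genotypes, 2)
            if a is not None and b is not None)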