def read_sequences(filename, qualities=False, genbank_callback=None):
    """ Read FASTA or Illumina sequences, possibly compressed.

        Valid values for qualities: False, True, 'required'.

        Post-reading filters can be applied.
    """
    assert qualities in (False, True, 'required')

    parts = filename.split('~~')

    info = get_file_info(parts[0])

    have_qualities = False
    if 'type-empty' in info:
        have_qualities = True
        result = read_empty(parts[0])
    elif 'type-fasta' in info:
        result = read_fasta(parts[0])
    elif 'type-genbank' in info:
        result = read_genbank_sequence(parts[0], genbank_callback)
    elif 'type-fastq' in info:
        have_qualities = True
        result = read_illumina_with_quality(parts[0])
    elif 'type-gff' in info:
        result = read_gff3_sequence(parts[0])
    elif 'type-sff' in info:
        grace.require_sff2fastq()
        have_qualities = True
        process = run(['sff2fastq', parts[0]])
        result = read_illumina_with_quality(process.stdout)
    else:
        raise grace.Error('Unrecognized file format for ' + filename)

    if qualities == 'required' and not have_qualities:
        raise grace.Error('Need base qualities in ' + filename)

    for part in parts[1:]:
        for prefix in FILTERS:
            if part.lower().startswith(prefix):
                result = FILTERS[prefix](result, part[len(prefix):])
                break
        else:
            raise grace.Error('Unrecognized filter: ' + part)

    if have_qualities and not qualities:
        result = filter_no_qualities(result)

    return result
def reader(working_dirs, references, use_reference, annotations={}):
    for name, sequence in references:
        features = annotations.get(sequence, [])

        if use_reference:
            readers = [ reference_reader(sequence) ]
        else:
            readers = [ ]

        readers.extend(
            evidence_reader(working_dir, name)
            for working_dir in working_dirs)

        active_features = [ ]
        feature_pos = 0

        for i in xrange(len(sequence)):
            if i % 10000 == 0:
                grace.status('%s %s' % (name, grace.pretty_number(i)))

            active_features = [
                item for item in active_features
                if item.location.nofuzzy_end > i ]
            while feature_pos < len(features) and \
                  features[feature_pos].location.nofuzzy_start <= i:
                active_features.append(features[feature_pos])
                feature_pos += 1

            for is_insertion in (True, False):
                yield Calls(name, i, is_insertion,
                            [ item.next() for item in readers ],
                            active_features)

        for reader in readers:
            for item in reader:
                raise grace.Error('Unexpected extra data in evidence file')

    grace.status('')
def run(self):
    sequences = [ ]
    annotations = [ ]
    for filename in self.filenames:
        any = False
        if io.is_sequence_file(filename):
            sequences.append(filename)
            any = True
        if annotation.is_annotation_file(filename):
            annotations.append(filename)
            any = True
        if not any:
            raise grace.Error(filename + ' is neither a sequence file nor an annotation file that nesoni can read.')

    if not sequences:
        assert not annotations, 'Annotations given without any reference sequences.'
        reference = Reference(self.output_dir, must_exist=True)
    else:
        reference = Reference(self.output_dir, must_exist=False)
        reference.set_sequences(sequences)
        reference.set_annotations(annotations)

    with legion.Stage() as stage:
        if self.genome:
            stage.process(reference.build_genome, self.genome_select)
        if config.apply_ifavailable_program(self.bowtie, 'bowtie2-build'):
            stage.process(reference.build_bowtie_index)
        if config.apply_ifavailable_program(self.ls, 'gmapper-ls'):
            stage.process(reference.build_shrimp_mmap, False)
        if config.apply_ifavailable_program(self.cs, 'gmapper-cs'):
            stage.process(reference.build_shrimp_mmap, True)
        if config.apply_ifavailable_jar(self.snpeff, 'snpEff.jar'):
            stage.process(reference.build_snpeff)
def normalize(args):
    min_depth, args = grace.get_option_value(args, '--min-depth', int, 5)
    grace.expect_no_further_options(args)

    if len(args) < 2:
        print NORMALIZE_HELP
        raise grace.Help_shown()

    dirnames = args

    filenames = [ ]
    for dirname in dirnames:
        assert os.path.isdir(dirname), dirname + ' is not a directory'
        filenames.append(sorted(
            item for item in os.listdir(dirname)
            #if item.endswith('.userplot') and not item.endswith('-norm.userplot')
            if item.endswith('-depth.userplot')
            and not item.endswith('-ambiguous-depth.userplot')
            and not item.endswith('-pairspan-depth.userplot')))

    for i in xrange(1, len(dirnames)):
        if filenames[i] != filenames[0]:
            raise grace.Error('Userplots in %s differ from those in %s' %
                              (dirnames[i], dirnames[0]))
    filenames = filenames[0]

    for filename in filenames:
        normalize_files(dirnames, filename[:-15], min_depth)
def open_possibly_compressed_file(filename, compression_type=None):
    """ Notionally, cast "filename" to a file-like object.

        If filename is already file-like, return it.
        If it's compressed, return a decompressing file-like object.
        If it's a BAM file, return a file-like object that produces SAM format.
        Otherwise, just return an open file!
    """
    if hasattr(filename, 'read'):
        return filename  # It's already file-like

    if compression_type is None:
        compression_type = get_compression_type(filename)

    if compression_type == 'none':
        return open(filename, 'rb')
    elif compression_type == 'gzip':
        return gzip.open(filename, 'rb')
    elif compression_type == 'bzip2':
        return bz2.BZ2File(filename, 'rb')
    elif compression_type == 'bam':
        from nesoni import sam
        return sam.open_bam(filename)
    else:
        raise grace.Error('Unknown compression type: ' + compression_type)
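# A minimal usage sketch (the filename here is hypothetical): callers get
# transparent decompression, so downstream code can treat any supported input
# uniformly, e.g. to sniff a format header the way get_file_info does below.
f = open_possibly_compressed_file('reads.fastq.gz')
peek = f.read(1024)
f.close()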
def read_sequences(filename, qualities=False, genbank_callback=None):
    """ Read FASTA or Illumina sequences, possibly compressed.

        Post-reading filters can be applied.
    """
    parts = filename.split('~~')

    f = open_possibly_compressed_file(parts[0])
    peek = f.read(8)
    f.close()

    have_qualities = False
    if not peek:
        result = read_empty(parts[0])
    elif peek.startswith('>'):
        result = read_fasta(parts[0])
    elif peek.startswith('LOCUS'):
        result = read_genbank_sequence(parts[0], genbank_callback)
    elif peek.startswith('@'):
        have_qualities = True
        result = read_illumina_with_quality(parts[0])
    elif peek.startswith('##gff'):
        result = read_gff3_sequence(parts[0])
    elif peek.startswith('.sff'):
        grace.require_sff2fastq()
        have_qualities = True
        process = run(['sff2fastq', parts[0]])
        result = read_illumina_with_quality(process.stdout)
    else:
        raise grace.Error('Unrecognized file format for ' + filename)

    for part in parts[1:]:
        for prefix in FILTERS:
            if part.lower().startswith(prefix):
                result = FILTERS[prefix](result, part[len(prefix):])
                break
        else:
            raise grace.Error('Unrecognized filter: ' + part)

    if have_qualities and not qualities:
        result = filter_no_qualities(result)

    return result
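# A usage sketch for the '~~' filter syntax handled above: everything after
# the first '~~' is matched against the prefixes registered in FILTERS
# (defined elsewhere; no particular filter name is assumed here). For a FASTQ
# input with qualities=True, iteration yields (name, seq, qual) tuples; the
# filename is hypothetical.
for name, seq, qual in read_sequences('reads.fastq', qualities=True):
    print name, len(seq)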
def is_colorspace(filename):
    # SOLiD colorspace reads encode transitions as the digits 0-3 ('.' marks a
    # missing call); the first character is a primer base, so examine seq[1:].
    for name, seq in read_sequences(filename):
        tail = seq[1:].upper()
        for char in '0123.':
            if char in tail:
                return True
        for char in 'ACGTN':
            if char in tail:
                return False
    raise grace.Error('Couldn\'t determine if sequence file is colorspace: ' + filename)
def read_gff3_sequence(filename):
    f = open_possibly_compressed_file(filename)

    for line in f:
        if line.rstrip() == '##FASTA':
            break
    else:
        raise grace.Error('Tried reading file as a GFF3 but it contains no ##FASTA section')

    return read_fasta(f)
def original_name(self):
    # Assuming it was Illumina
    if self.flag & FLAG_PAIRED:
        if self.flag & FLAG_FIRST:
            return self.qname + '/1'
        elif self.flag & FLAG_SECOND:
            return self.qname + '/2'
        else:
            raise grace.Error('Confused by SAM file')
    else:
        return self.qname
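# Context for original_name, assuming the module's constants follow the SAM
# specification's flag bits: FLAG_PAIRED = 0x1, FLAG_FIRST = 0x40,
# FLAG_SECOND = 0x80. A first-of-pair alignment with qname 'frag7' thus
# recovers the traditional Illumina name 'frag7/1'.
flag = 0x1 | 0x40                  # paired, first in pair
assert flag & 0x1 and flag & 0x40  # -> qname + '/1'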
def read_annotations(filename, joiner=None):
    f = io.open_possibly_compressed_file(filename)
    peek = f.read(1024)
    f.close()

    if peek.startswith('LOCUS'):
        return read_genbank(filename)
    elif peek.startswith('##gff') or peek.split('\n')[0].count('\t') in (7, 8):
        return read_gff(filename, joiner)
    else:
        raise grace.Error('Not an annotation file.')
def run(self):
    f = self.begin_output()

    for filename in self.filenames:
        info = io.get_file_info(filename)

        any = False

        name = os.path.splitext(os.path.split(filename)[1])[0]

        if info.matches('sequences'):
            total = 0
            total_length = 0
            for seq in io.read_sequences(filename, qualities=True):
                total += 1
                total_length += len(seq[1])
            print >> f, grace.datum(name, 'sequences', total)
            print >> f, grace.datum(name, 'total bases', total_length)
            if total:
                print >> f, grace.datum(name, 'average length', float(total_length) / total)
            print >> f
            any = True

        if info.matches('annotations'):
            total = 0
            counts = { }
            for item in annotation.read_annotations(filename, "/"):
                total += 1
                counts[item.type] = counts.get(item.type, 0) + 1
            print >> f, grace.datum(name, 'features', total)
            for key in sorted(counts):
                print >> f, grace.datum(name, key + ' features', counts[key])
            print >> f
            any = True

        if info.matches('type-vcf'):
            reader_f = io.open_possibly_compressed_file(filename)
            reader = vcf.Reader(reader_f)
            n = 0
            for item in reader:
                n += 1
            print >> f, grace.datum(name, 'variants', n)
            any = True

        if not any:
            raise grace.Error('Don\'t know what to do with ' + filename)

    self.end_output(f)
def run(self):
    f = self.begin_output()

    for filename in self.filenames:
        any = False

        name = os.path.splitext(os.path.split(filename)[1])[0]

        try:
            iterator = io.read_sequences(filename, qualities=True)
        except grace.Error:
            iterator = None

        if iterator is not None:
            total = 0
            total_length = 0
            for seq in io.read_sequences(filename, qualities=True):
                total += 1
                total_length += len(seq[1])
            print >> f, grace.datum(name, 'sequences', total)
            if total:
                print >> f, grace.datum(name, 'average length', float(total_length) / total)
            print >> f
            any = True

        try:
            iterator = annotation.read_annotations(filename)
        except grace.Error:
            iterator = None

        if iterator:
            total = 0
            counts = { }
            for item in iterator:
                total += 1
                counts[item.type] = counts.get(item.type, 0) + 1
            print >> f, grace.datum(name, 'features', total)
            for key in sorted(counts):
                print >> f, grace.datum(name, key + ' features', counts[key])
            print >> f
            any = True

        if not any:
            raise grace.Error(filename + ' is neither a sequence file nor an annotation file that nesoni can read.')

    self.end_output(f)
def evidence_reader(working_dir, name):
    filename = os.path.join(working_dir, grace.filesystem_friendly_name(name) + '-evidence.txt')
    f = open(filename, 'rb')

    header = f.readline()
    if header.count('\t') != 7:
        raise grace.Error('Old style evidence file. Please re-run nesoni consensus.')

    for line in f:
        fields = line.rstrip('\n').split('\t')
        yield Call(fields[4], fields[1], fields[6])
        yield Call(fields[5], fields[2], fields[7])

    f.close()
def classify_files(filenames, selectors):
    """ Put each of a set of files into one or more categories. """
    results = [ [ ] for selector in selectors ]
    for filename in filenames:
        info = get_file_info(filename)
        any = False
        for i, selector in enumerate(selectors):
            if selection.matches(selector, info):
                results[i].append(filename)
                any = True
        if not any:
            raise grace.Error('Don\'t know what to do with ' + filename)
    return results
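# Hypothetical usage: partition inputs with the selection mini-language that
# selection.matches understands (see matches further below). 'sequences' and
# 'annotations' are tags that get_file_info attaches; a file carrying both
# tags lands in both result lists.
sequence_files, annotation_files = classify_files(
    filenames, ['sequences', 'annotations'])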
def find_jar(jarname, extra_help=''):
    search = [ ]
    if 'JARPATH' in os.environ:  # I just made this up
        search.extend(os.environ['JARPATH'].split(':'))
    if 'PATH' in os.environ:
        search.extend(os.environ['PATH'].split(os.pathsep))

    for dirname in search:
        filename = os.path.join(dirname, jarname)
        if os.path.isabs(dirname) and os.path.exists(filename):
            return filename

    raise grace.Error('Couldn\'t find "%s". Directories listed in JARPATH and PATH were searched. %s' % (jarname, extra_help))
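# Usage sketch: JARPATH is colon-separated, like PATH. With JARPATH=/opt/jars
# and /opt/jars/snpEff.jar present, this would return '/opt/jars/snpEff.jar';
# as the code above shows, only absolute directory entries are searched.
jar = find_jar('snpEff.jar', 'Install snpEff and add its directory to JARPATH.')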
def matches(expression, tags):
    def parse2(expression):
        assert expression, 'unexpected end of expression'
        if expression[0] == '[':
            value, expression = parse(expression[1:])
            assert expression.startswith(']'), 'expected a closing ]'
            return value, expression[1:]
        i = 0
        while i < len(expression) and expression[i] not in '[]:/^':
            i += 1
        assert i > 0, 'unexpected ' + expression[0]
        return expression[:i] == 'all' or expression[:i] in tags, expression[i:]

    def parse1(expression):
        assert expression, 'unexpected end of expression'
        if expression.startswith('-'):
            value, expression = parse2(expression[1:])
            return not value, expression
        else:
            value, expression = parse2(expression)
            return value, expression

    def parse(expression):
        value, expression = parse1(expression)
        while expression and expression[0] in ':/^':
            operator, expression = expression[0], expression[1:]
            value2, expression = parse1(expression)
            if operator == ':':
                value = value and value2
            elif operator == '/':
                value = value or value2
            else:  # '^' is exclusive-or
                value = (not value2 and value) or (not value and value2)
        return value, expression

    if expression == '':
        return False

    try:
        value, expression = parse(expression)
        assert not expression, 'don\'t know what to do with: ' + expression
    except AssertionError, e:
        raise grace.Error('Could not parse: ' + expression + ', ' + e.args[0])

    return value
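# The grammar, informally: a term is a tag name (or 'all', which always
# matches); '-' negates the following term; ':' is AND, '/' is OR, '^' is
# XOR, applied left to right with no precedence; '[...]' groups. Against a
# tag set like the one get_file_info produces:
tags = set(['type-fastq', 'compression-gzip'])
assert matches('type-fastq:[compression-none/compression-gzip]', tags)
assert not matches('-all', tags)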
def run(args, stdin=None, stdout=PIPE, stderr=None, cwd=None, no_display=False, **kwargs):
    """ Start a process using subprocess.Popen.

        Set close_fds=True so the process doesn't inherit any other pipes we might be using.

        stdin, stdout and stderr may be:

        None                - inherit existing
        nesoni.io.PIPE      - create a pipe
        a file or fd number - the file (be sure to flush() anything you've written to it first!)

        stderr may also be nesoni.io.STDOUT
    """
    args = _interpret_args(args, kwargs)

    if not no_display:
        env = None
    else:
        env = dict(os.environ)
        if 'DISPLAY' in env:
            del env['DISPLAY']

    try:
        return subprocess.Popen(
            args,
            bufsize=1 << 24,
            stdin=stdin,
            stdout=stdout,
            stderr=stderr,
            cwd=cwd,
            env=env,
            close_fds=True,
        )
    except OSError, err:
        raise grace.Error("Failed to run: %s" % (' '.join(args)))
def _make_inner(action):
    timestamp = coordinator().time()
    assert timestamp > LOCAL.time, 'Time running in reverse.'

    cores = action.cores_required()
    if cores > 1:
        coordinator().trade_cores(1, cores)
    try:
        config.write_colored_text(sys.stderr, '\n' + action.describe() + '\n')

        if LOCAL.abort_make:
            raise grace.Error('%s would be run. Stopping here.' % action.ident())

        grace.status(action.ident())
        try:
            _run_and_save_state(action, timestamp)
        finally:
            grace.status('')
    finally:
        if cores > 1:
            coordinator().trade_cores(cores, 1)
def iter_reads(config, qualities=False):
    if 'stride' not in config:
        raise grace.Error('Please re-run nesoni shrimp, output format has changed')

    stride = config['stride']
    for reads_filename_set in config['reads']:
        if config['solid']:
            reader = [ io.read_solid(filename) for filename in reads_filename_set ]
        else:
            reader = [ io.read_sequences(filename, qualities) for filename in reads_filename_set ]
        reader = itertools.izip(*reader)

        for i, items in enumerate(reader):
            if i % stride == 0:
                for item in items:
                    yield item
def run_toolbox(action_classes, script_name=''):
    """ Provide a command line interface for a list of Actions.

        Note: strings included in the action_classes list will be printed
        as help text, for example to display section headings.
    """
    commands = { }

    help = [ '\n' ]
    for item in action_classes:
        if isinstance(item, str):
            help.append(config.wrap(item, 70) + '\n\n')
            continue
        name = item.shell_name()
        commands[name] = item
        help.append(' %s\n' % config.colored(1, name + ':'))
        help.append(config.wrap(item.help_short, 70, ' ') + '\n\n')

    args = sys.argv[1:]

    if not args:
        config.write_colored_text(sys.stdout, ''.join(help) + '\n\n')
        sys.exit(1)

    try:
        command, args = args[0], args[1:]

        mangled_command = command.lower().rstrip(':')
        if mangled_command not in commands:
            raise grace.Error("Don't know how to " + command)
    except:
        config.report_exception()
        sys.exit(1)

    config.shell_run(commands[mangled_command](), args,
                     (script_name + ' ' if script_name else '') + mangled_command + ':')
def run(self):
    sequences = [ ]
    annotations = [ ]
    for filename in self.filenames:
        any = False
        if io.is_sequence_file(filename):
            sequences.append(filename)
            any = True
        if annotation.is_annotation_file(filename):
            annotations.append(filename)
            any = True
        if not any:
            raise grace.Error(filename + ' is neither a sequence file nor an annotation file that nesoni can read.')

    reference = Reference(self.output_dir, must_exist=False)
    reference.set_sequences(sequences)
    reference.set_annotations(annotations)

    if self.ls:
        reference.build_shrimp_mmap(False)
    if self.cs:
        reference.build_shrimp_mmap(True)
def run(self):
    bams = [ ]
    reference = None
    reference2 = None

    extra = [ ]

    for sample in self.samples:
        if sam.is_bam(sample):
            bams.append(sample)
        elif os.path.isdir(sample):
            working = working_directory.Working(sample, True)
            bams.append(working.get_filtered_sorted_bam())
            extra.append('##sampleTags=' + ','.join(working.get_tags()))
            if reference2 is None:
                reference2 = working.get_reference().reference_fasta_filename()
        elif io.is_sequence_file(sample):
            assert reference is None, 'Only one reference FASTA file allowed.'
            reference = sample

    if reference is None:
        reference = reference2
    if reference is None:
        raise grace.Error('No reference FASTA file given.')

    with nesoni.Stage() as stage:
        tempspace = stage.enter(workspace.tempspace())

        if self.depth_limit:
            with nesoni.Stage() as stage2:
                for i in xrange(len(bams)):
                    sam.Bam_depth_limit(
                        tempspace / ('%d' % i), bams[i],
                        depth=self.depth_limit).process_make(stage2)
                    bams[i] = tempspace / ('%d.bam' % i)

        # FreeBayes claims to handle multiple bams, but it doesn't actually work
        if len(bams) > 1:
            sam.Bam_merge(tempspace / 'merged', bams=bams, index=False).run()
            bams = [ tempspace / 'merged.bam' ]

        command = [
            'freebayes',
            '-f', reference,
            '--ploidy', str(self.ploidy),
            '--pvar', str(self.pvar),
        ] + self.freebayes_options + bams

        self.log.log('Running: ' + ' '.join(command) + '\n')

        f_out = stage.enter(open(self.prefix + '.vcf', 'wb'))
        f_in = stage.enter(io.pipe_from(command))
        done_extra = False
        for line in f_in:
            if not done_extra and not line.startswith('##'):
                for extra_line in extra:
                    f_out.write(extra_line + '\n')
                done_extra = True
            f_out.write(line)

    index_vcf(self.prefix + '.vcf')
def run(self):
    assert self.reads or self.pairs or self.interleaved, 'No reads given'

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    working = self.get_workspace()
    working.setup_reference(self.references, bowtie=True)
    working.update_param(snp_cost=2.0)
    reference = working.get_reference()

    log_file = open(self.log_filename(), 'wb')

    with workspace.tempspace(dir=working.working_dir) as temp:
        n = [ 0 ]
        def tempname():
            n[0] += 1
            return temp / ('%d.fq' % n[0])

        def convert(filename):
            # Files already in plain/gzip/bzip2 FASTQ can be fed to bowtie2
            # directly; anything else is converted to a temporary FASTQ.
            info = io.get_file_info(filename)
            ok = selection.matches(
                'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                info)
            if ok:
                return filename
            result_name = tempname()
            with open(result_name, 'wb') as f:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    io.write_fastq(f, name, seq, qual)
            return result_name

        ones = [ ]
        twos = [ ]
        singles = [ ]

        for pair in self.pairs:
            assert len(pair) == 2, 'Need two files in each "pair:" section.'
            ones.append(convert(pair[0]))
            twos.append(convert(pair[1]))

        for item in self.interleaved:
            left_name = tempname()
            right_name = tempname()
            ones.append(left_name)
            twos.append(right_name)
            with open(left_name, 'wb') as left, \
                 open(right_name, 'wb') as right:
                reader = io.read_sequences(item, qualities='required')
                while True:
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        break
                    io.write_fastq(left, name, seq, qual)

                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        raise grace.Error('Interleaved file contains odd number of sequences')
                    io.write_fastq(right, name, seq, qual)

        for item in self.reads:
            singles.append(convert(item))

        cores = min(self.cores, legion.coordinator().get_cores())

        command = (
            [ 'bowtie2',
              '--threads', str(cores),
              '--rg-id', '1',
              '--rg', 'SM:' + working.name,
            ] +
            self.bowtie_options +
            [ '-x', reference.get_bowtie_index_prefix() ]
        )
        commands = [ ]
        if ones:
            commands.append(command + ['-1', ','.join(ones), '-2', ','.join(twos)])
        if singles:
            commands.append(command + ['-U', ','.join(singles)])

        temp_bam_name = temp / 'temp.bam'

        with io.pipe_to(
                ['samtools', 'view', '-S', '-b', '-'],
                stdout=open(temp_bam_name, 'wb'),
                stderr=log_file) as f:
            # Emit the SAM header only once, from the first bowtie2 run.
            header_sent = False
            for command in commands:
                self.log.log('Running:\n' + ' '.join(command) + '\n')
                with io.pipe_from(command, stderr=log_file, cores=cores) as f_out:
                    for line in f_out:
                        if not header_sent or not line.startswith('@'):
                            f.write(line)
                header_sent = True

        #io.execute([
        #    'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
        #    ])
        sam.sort_bam(temp_bam_name, working / 'alignments', by_name=True, cores=self.cores)

    log_file.close()
def run_toolbox(action_classes, script_name='', show_make_flags=True):
    """ Provide a command line interface for a list of Actions.

        Note: strings included in the action_classes list will be printed
        as help text, for example to display section headings.
    """
    args = configure_making(sys.argv[1:])

    commands = { }

    for item in action_classes:
        if isinstance(item, str):
            continue
        name = item.shell_name()
        commands[name] = item

    if args == ['--help-make']:
        help = [ '\n' ]
        help.append('\nMake options:\n' + Make().describe('', show_help=True, escape_newlines=False) + '\n')
        config.write_colored_text(sys.stdout, ''.join(help) + '\n\n')
        sys.exit(1)

    if not args or args == ['-h'] or args == ['--help']:
        help = [ '\n' ]
        for item in action_classes:
            if isinstance(item, str):
                help.append(config.wrap(item, 70) + '\n\n')
                continue
            name = item.shell_name()
            help.append(' %s\n' % config.colored(1, name + ':'))
            help.append(config.color_as_comment(config.wrap(item.help_short, 70, ' ')) + '\n\n')

        if show_make_flags:
            #help.append('\nMake options:\n'+Make().describe('', show_help=True, escape_newlines=False)+'\n')
            help.append('\nFor workflow make options type "%s --help-make".\n' % script_name)

        config.write_colored_text(sys.stdout, ''.join(help))
        sys.exit(1)

    try:
        command, args = args[0], args[1:]

        mangled_command = command.lower().rstrip(':')
        if mangled_command not in commands:
            raise grace.Error("Don't know how to " + command)
    except:
        config.report_exception()
        sys.exit(1)

    config.shell_run(commands[mangled_command](), args,
                     (script_name + ' ' if script_name else '') + mangled_command + ':')
def run(self):
    log = self.log

    #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
    #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
    #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
    #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
    #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
    #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
    #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
    #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
    #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
    #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
    #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
    #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
    #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
    #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
    #grace.expect_no_further_options(args)

    prefix = self.prefix
    log_name = os.path.split(prefix)[1]

    quality_cutoff = self.quality
    qoffset = self.qoffset
    clip_ambiguous = self.clip_ambiguous
    length_cutoff = self.length
    adaptor_cutoff = self.match
    max_error = self.max_errors
    adaptor_set = self.adaptors
    disallow_homopolymers = self.homopolymers
    reverse_complement = self.revcom
    trim_start = self.trim_start
    trim_end = self.trim_end
    output_fasta = self.fasta
    use_gzip = self.gzip
    output_rejects = self.rejects

    iterators = [ ]
    filenames = [ ]
    any_paired = False

    for filename in self.reads:
        filenames.append(filename)
        iterators.append(itertools.izip(
            io.read_sequences(filename, qualities=True)))

    for pair_filenames in self.pairs:
        assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
        filenames.extend(pair_filenames)
        any_paired = True
        iterators.append(itertools.izip(
            io.read_sequences(pair_filenames[0], qualities=True),
            io.read_sequences(pair_filenames[1], qualities=True)))

    for filename in self.interleaved:
        filenames.append(filename)
        any_paired = True
        iterators.append(deinterleave(
            io.read_sequences(filename, qualities=True)))

    fragment_reads = (2 if any_paired else 1)
    read_in_fragment_names = [ 'read-1', 'read-2' ] if any_paired else [ 'read' ]

    assert iterators, 'Nothing to clip'

    if qoffset is None:
        guesses = [ io.guess_quality_offset(filename) for filename in filenames ]
        assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify manually.'
        qoffset = guesses[0]
        log.log('FASTQ offset seems to be %d\n' % qoffset)

    quality_cutoff_char = chr(qoffset + quality_cutoff)

    #log.log('Minimum quality:        %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
    #log.log('Clip ambiguous bases:   %s\n' % (grace.describe_bool(clip_ambiguous)))
    #log.log('Minimum adaptor match:  %d bases, %d errors\n' % (adaptor_cutoff, max_error))
    #log.log('Minimum length:         %d bases\n' % length_cutoff)

    adaptor_seqs = [ ]
    adaptor_names = [ ]
    if adaptor_set and adaptor_set.lower() != 'none':
        for item in adaptor_set.split(','):
            item = item.strip().lower() + ' '
            any = False
            for line in ADAPTORS.strip().split('\n'):
                if line.startswith('#'):
                    continue
                if not line.lower().startswith(item):
                    continue
                any = True
                name, seq = line.rsplit(None, 1)
                seq = seq.replace('U', 'T')

                #if seq in adaptor_seqs: print 'Dup', name
                adaptor_seqs.append(seq)
                adaptor_names.append(name)
                adaptor_seqs.append(bio.reverse_complement(seq))
                adaptor_names.append(name)
            if not any:
                raise grace.Error('Unknown adaptor set: ' + item)

    matcher = Matcher(adaptor_seqs, adaptor_names, max_error)

    start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]

    if output_fasta:
        write_sequence = io.write_fasta_single_line
    else:
        write_sequence = io.write_fastq

    f_single = io.open_possibly_compressed_writer(self.reads_output_filenames()[0])
    if fragment_reads == 2:
        f_paired = io.open_possibly_compressed_writer(self.interleaved_output_filenames()[0])
    if output_rejects:
        f_reject = io.open_possibly_compressed_writer(self.rejects_output_filenames()[0])

    n_single = 0
    n_paired = 0

    n_in_single = 0
    n_in_paired = 0
    total_in_length = [ 0 ] * fragment_reads

    n_out = [ 0 ] * fragment_reads
    n_q_clipped = [ 0 ] * fragment_reads
    n_a_clipped = [ 0 ] * fragment_reads
    n_homopolymers = [ 0 ] * fragment_reads
    total_out_length = [ 0 ] * fragment_reads

    #log.attach(open(prefix + '_log.txt', 'wb'))

    for iterator in iterators:
        for fragment in iterator:
            if (n_in_single + n_in_paired) % 10000 == 0:
                grace.status('Clipping fragment %s' %
                             grace.pretty_number(n_in_single + n_in_paired))

            if len(fragment) == 1:
                n_in_single += 1
            else:
                n_in_paired += 1

            graduates = [ ]
            rejects = [ ]
            for i, (name, seq, qual) in enumerate(fragment):
                name = name.split()[0]
                seq = seq.upper()
                total_in_length[i] += len(seq)

                # Quality clip: keep the longest run of bases at or above the
                # quality cutoff (optionally also treating non-ACGT as bad),
                # within the trim_start..trim_end window.
                start = trim_start
                best_start = 0
                best_len = 0
                for j in xrange(len(seq) - trim_end):
                    if qual[j] < quality_cutoff_char or \
                       (clip_ambiguous and seq[j] not in 'ACGT'):
                        if best_len < j - start:
                            best_start = start
                            best_len = j - start
                        start = j + 1
                j = len(seq) - trim_end
                if best_len < j - start:
                    best_start = start
                    best_len = j - start

                clipped_seq = seq[best_start:best_start + best_len]
                clipped_qual = qual[best_start:best_start + best_len]

                if len(clipped_seq) < length_cutoff:
                    n_q_clipped[i] += 1
                    rejects.append((name, seq, qual, 'quality'))
                    continue

                match = matcher.match(clipped_seq)
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[match[0]:]
                    clipped_qual = clipped_qual[match[0]:]
                    start_clips[i][match[0]].append(match[1][0])
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append((name, seq, qual, 'adaptor'))
                        continue

                match = matcher.match(bio.reverse_complement(clipped_seq))
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[:len(clipped_seq) - match[0]]
                    clipped_qual = clipped_qual[:len(clipped_qual) - match[0]]
                    end_clips[i][match[0]].append(match[1][0])
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append((name, seq, qual, 'adaptor'))
                        continue

                if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                    n_homopolymers[i] += 1
                    rejects.append((name, seq, qual, 'homopolymer'))
                    continue

                graduates.append((name, clipped_seq, clipped_qual))
                n_out[i] += 1
                total_out_length[i] += len(clipped_seq)

            if output_rejects:
                for name, seq, qual, reason in rejects:
                    write_sequence(f_reject, name + ' ' + reason, seq, qual)

            if graduates:
                if reverse_complement:
                    graduates = [
                        (name, bio.reverse_complement(seq), qual[::-1])
                        for name, seq, qual in graduates ]

                if len(graduates) == 1:
                    this_f = f_single
                    n_single += 1
                else:
                    assert len(graduates) == 2
                    this_f = f_paired
                    n_paired += 1

                for name, seq, qual in graduates:
                    write_sequence(this_f, name, seq, qual)

    grace.status('')

    if output_rejects:
        f_reject.close()
    if fragment_reads == 2:
        f_paired.close()
    f_single.close()

    def summarize_clips(name, location, clips):
        total = 0
        for i in clips:
            total += len(clips[i])
        log.datum(log_name, name + ' adaptors clipped at ' + location, total)

        if not clips:
            return

        for i in xrange(min(clips), max(clips) + 1):
            item = clips[i]
            log.quietly_log('%3d bases: %10d ' % (i, len(item)))
            if item:
                avg_errors = float(sum(item2[0] for item2 in item)) / len(item)
                log.quietly_log(' avg errors: %5.2f ' % avg_errors)

                counts = collections.defaultdict(int)
                for item2 in item:
                    counts[item2[1]] += 1
                #print counts
                for no in sorted(counts, key=lambda item2: counts[item2], reverse=True)[:2]:
                    log.quietly_log('%dx%s ' % (counts[no], matcher.names[no]))
                if len(counts) > 2:
                    log.quietly_log('...')
            log.quietly_log('\n')
        log.quietly_log('\n')

    if n_in_paired:
        log.datum(log_name, 'read-pairs', n_in_paired)
    if n_in_single:
        log.datum(log_name, 'single reads', n_in_single)

    for i in xrange(fragment_reads):
        if start_clips:
            summarize_clips(read_in_fragment_names[i], 'start', start_clips[i])
        if end_clips:
            summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])

        prefix = read_in_fragment_names[i]

        log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i])
        log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i])
        if disallow_homopolymers:
            log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i])
        if fragment_reads > 1:
            log.datum(log_name, prefix + ' kept', n_out[i])
        log.datum(log_name, prefix + ' average input length',
                  float(total_in_length[i]) / (n_in_single + n_in_paired))
        if n_out[i]:
            log.datum(log_name, prefix + ' average output length',
                      float(total_out_length[i]) / n_out[i])

    if fragment_reads == 2:
        log.datum(log_name, 'pairs kept after clipping', n_paired)
    log.datum(log_name, 'reads kept after clipping', n_single)
def nway_main(gbk_filename, use_indels, use_reference, give_evidence, give_consequences,
              require_all, require_bisect, full_output, format,
              working_dirs, split_a, split_b, f=sys.stdout):
    assert working_dirs, 'Need at least one working directory.'

    workspaces = [
        working_directory.Working(dirname, must_exist=True)
        for dirname in working_dirs ]
    reference = workspaces[0].get_reference()
    #if not annotation_filename:
    #    annotation_filename = reference.annotations_filename() #May still be None

    if use_reference:
        names = ['reference']
        evidence_start = 1
    else:
        names = [ ]
        evidence_start = 0

    names.extend(norm_name(item) for item in working_dirs)

    references = io.read_sequences(reference.reference_fasta_filename())

    annotations = { }
    if gbk_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(gbk_filename), 'genbank'):
            sequence = record.seq.tostring()
            features = [ item for item in record.features if item.type != 'source' ]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features

    iterator = reader(working_dirs, references, use_reference, annotations)

    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == 'counts':
        iterator = itertools.ifilter(fully_unambiguous, iterator)

    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), 'Two samples with the same name'
        try:
            split_a = [ names.index(norm_name(item)) for item in split_a ]
            split_b = [ names.index(norm_name(item)) for item in split_b ]
        except ValueError:
            raise grace.Error('Sample to be split is not amongst samples given')
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    #if limit:
    #    iterator = itertools.islice(iterator, limit)

    if format == 'table':
        line = 'Reference\tPosition\tChange type'
        line += '\t' + '\t'.join(names)
        if give_evidence:
            line += '\t' + '\t'.join(names[evidence_start:])
        if give_consequences:
            line += '\t' + '\t'.join(names[evidence_start:])
        if annotations:
            line += '\tAnnotations'
        print >> f, line
        for calls in iterator:
            line = '%s\t%d\t%s\t%s' % (
                calls.ref_name,
                calls.ref_pos + 1,
                change_type(calls),
                '\t'.join(item.consensus for item in calls.calls))
            if give_evidence:
                line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += '\t' + describe_features(calls.features)
            print >> f, line

    elif format == 'compact':
        for line in transpose_strings(names):
            print >> f, line
        print >> f

        for calls in iterator:
            if calls.is_insertion:
                footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name)
            else:
                footer = '%12d %s' % (calls.ref_pos + 1, calls.ref_name)

            t = transpose_strings([ item.consensus for item in calls.calls ], '-', 1)
            top = t[0] + ' ' + footer
            if give_consequences:
                consequences = [ ]
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(', '):
                            item = ' '.join(item.split()[:3])
                            if item not in consequences:
                                consequences.append(item)
                if consequences:
                    top += ' ' + ' / '.join(sorted(consequences))
            top += ' ' + describe_features(calls.features)
            print >> f, top
            for line in t[1:]:
                print >> f, line

    elif format == 'nexus':
        buckets = [ [ ] for name in names ]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)

        print >> f, '#NEXUS'
        print >> f, 'begin taxa;'
        print >> f, 'dimensions ntax=%d;' % len(names)
        print >> f, 'taxlabels'
        for name in names:
            print >> f, name
        print >> f, ';'
        print >> f, 'end;'

        print >> f, 'begin characters;'
        print >> f, 'dimensions nchar=%d;' % len(buckets[0])
        print >> f, 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print >> f, 'matrix'
        for name, bucket in itertools.izip(names, buckets):
            print >> f, name, ''.join(bucket)
        print >> f, ';'
        print >> f, 'end;'

    elif format == 'counts':
        for line in transpose_strings(names):
            print >> f, line
        print >> f

        counts = { }
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1

        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print >> f, '%s %d' % (transpose_strings(count_str)[0], counts[count_str])

    else:
        raise grace.Error('Unknown output format: ' + format)
def main(args):
    genbank_filename, args = grace.get_option_value(args, '--gbk', str, None)
    use_indels, args = grace.get_option_value(args, '--indels', grace.as_bool, True)
    use_reference, args = grace.get_option_value(args, '--reference', grace.as_bool, True)
    give_evidence, args = grace.get_option_value(args, '--evidence', grace.as_bool, True)
    give_consequences, args = grace.get_option_value(args, '--consequences', grace.as_bool, True)
    require_all, args = grace.get_option_value(args, '--require-all', grace.as_bool, False)
    require_bisect, args = grace.get_option_value(args, '--require-bisect', grace.as_bool, False)
    full_output, args = grace.get_option_value(args, '--full', grace.as_bool, False)
    format, args = grace.get_option_value(args, '--as', str, 'table')

    # Secret option!
    limit, args = grace.get_option_value(args, '--limit', int, None)

    grace.expect_no_further_options(args)

    if len(args) < 1:
        sys.stderr.write(USAGE)
        return 1

    working_dirs = [ ]
    split_a = [ ]
    split_b = [ ]
    def default(args):
        working_dirs.extend(args)
    def splitting(args):
        split_a.extend(args)
    def splitting_from(args):
        split_b.extend(args)

    grace.execute(args, {
        'splitting': splitting,
        'from': splitting_from,
    }, default)

    if use_reference:
        names = ['reference']
        evidence_start = 1
    else:
        names = [ ]
        evidence_start = 0

    names.extend(norm_name(item) for item in working_dirs)

    references = io.read_sequences(os.path.join(working_dirs[0], 'reference.fa'))

    annotations = { }
    if genbank_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(genbank_filename), 'genbank'):
            sequence = record.seq.tostring()
            features = [ item for item in record.features if item.type != 'source' ]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features

    iterator = reader(working_dirs, references, use_reference, annotations)

    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == 'counts':
        iterator = itertools.ifilter(fully_unambiguous, iterator)

    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), 'Two samples with the same name'
        try:
            split_a = [ names.index(norm_name(item)) for item in split_a ]
            split_b = [ names.index(norm_name(item)) for item in split_b ]
        except ValueError:
            raise grace.Error('Sample to be split is not amongst samples given')
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    if limit:
        iterator = itertools.islice(iterator, limit)

    if format == 'table':
        line = 'Reference\tPosition\tChange type'
        line += '\t' + '\t'.join(names)
        if give_evidence:
            line += '\t' + '\t'.join(names[evidence_start:])
        if give_consequences:
            line += '\t' + '\t'.join(names[evidence_start:])
        if annotations:
            line += '\tAnnotations'
        print line
        for calls in iterator:
            line = '%s\t%d\t%s\t%s' % (
                calls.ref_name,
                calls.ref_pos + 1,
                change_type(calls),
                '\t'.join(item.consensus for item in calls.calls))
            if give_evidence:
                line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += '\t' + describe_features(calls.features)
            print line

    elif format == 'compact':
        for line in transpose_strings(names):
            print line
        print

        for calls in iterator:
            if calls.is_insertion:
                footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name)
            else:
                footer = '%12d %s' % (calls.ref_pos + 1, calls.ref_name)

            t = transpose_strings([ item.consensus for item in calls.calls ], '-', 1)
            top = t[0] + ' ' + footer
            if give_consequences:
                consequences = [ ]
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(', '):
                            item = ' '.join(item.split()[:3])
                            if item not in consequences:
                                consequences.append(item)
                if consequences:
                    top += ' ' + ' / '.join(sorted(consequences))
            top += ' ' + describe_features(calls.features)
            print top
            for line in t[1:]:
                print line

    elif format == 'nexus':
        buckets = [ [ ] for name in names ]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)

        print '#NEXUS'
        print 'begin taxa;'
        print 'dimensions ntax=%d;' % len(names)
        print 'taxlabels'
        for name in names:
            print name
        print ';'
        print 'end;'

        print 'begin characters;'
        print 'dimensions nchar=%d;' % len(buckets[0])
        print 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print 'matrix'
        for name, bucket in itertools.izip(names, buckets):
            print name, ''.join(bucket)
        print ';'
        print 'end;'

    elif format == 'counts':
        for line in transpose_strings(names):
            print line
        print

        counts = { }
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1

        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print '%s %d' % (transpose_strings(count_str)[0], counts[count_str])

    else:
        raise grace.Error('Unknown output format: ' + format)
def main(args):
    default_transl_table, args = grace.get_option_value(args, '--transl_table', int, 11)
    use_coverage, args = grace.get_flag(args, '--use-coverage')
    coverage_cutoff, args = grace.get_option_value(args, '--coverage-cutoff', float, 0.1)
    tabular, args = grace.get_flag(args, '--tabular')
    noheader, args = grace.get_flag(args, '--noheader')
    verbose, args = grace.get_flag(args, '--verbose')
    bandwidth, args = grace.get_option_value(args, '--band', int, 20)
    grace.expect_no_further_options(args)

    if len(args) != 2:
        print USAGE
        return 1

    genbank_filename = args[0]
    alignment_filename = args[1]
    if os.path.isdir(alignment_filename):
        alignment_filename = os.path.join(alignment_filename, 'alignment.maf')
    working_dir = os.path.split(alignment_filename)[0]

    alignments = load_alignments(alignment_filename)

    summaries = [ ]
    details = [ ]

    if not noheader:
        fields = 'Sequence\tLocus tag\tOld length (aa)\tNew length (aa)\tAmino acid changes\t'
        if use_coverage:
            fields += 'Unambiguous coverage vs expected\t\tAmbiguous coverage vs expected\t\tAmbiguous percent with any hits\t'
        fields += 'Gene\tProduct'
        if tabular:
            fields += '\tChanges of note'
        print fields

    for record in SeqIO.parse(io.open_possibly_compressed_file(genbank_filename), 'genbank'):
        sequence = record.seq.tostring()

        for name, seq1, seq2, alignment in alignments:
            if seq1 == sequence:
                break
        else:
            raise grace.Error('Genbank record %s sequence not identical to any reference sequence' % record.id)

        if use_coverage:
            depth = get_graph(working_dir, name, 'depth')
            ambiguous_depth = get_graph(working_dir, name, 'ambiguous-depth')
            median_depth = numpy.median(depth)
            median_ambiguous_depth = numpy.median(ambiguous_depth)
            ambiguous_factor = float(median_ambiguous_depth) / median_depth
            depth_expect = expected_depth(name, sequence, depth, ambiguous_depth)

        for feature in record.features:
            if feature.type != 'CDS':
                continue

            if 'locus_tag' not in feature.qualifiers:
                locus_tag = '%d..%d' % (feature.location.nofuzzy_start + 1, feature.location.nofuzzy_end)
            else:
                locus_tag = feature.qualifiers['locus_tag'][0]

            if 'transl_table' in feature.qualifiers:
                transl_table_no = int(feature.qualifiers['transl_table'][0])
            else:
                assert default_transl_table is not None, 'No /transl_table for CDS, and default transl_table not given'
                transl_table_no = default_transl_table

            transl_table = CodonTable.ambiguous_dna_by_id[transl_table_no]
            start_codons = transl_table.start_codons

            try:
                feature_alignment = alignment_from_feature(sequence, feature)
            except Weird_alignment:
                warn('%s has a location I could not handle, skipping, sorry' % locus_tag)
                continue

            dna = [ ]
            new_dna = [ ]
            shifts = [ ]
            for i in xrange(feature_alignment.end2):
                p1 = feature_alignment.back_project(i, left=False)
                p2 = feature_alignment.back_project(i + 1, left=True)
                assert abs(p2 - p1) < 2
                dna.append(sequence_slice(sequence, p1, p2))

                p1a = alignment.project(p1, left=False)
                p2a = alignment.project(p2, left=False)  #Hmm

                diff = (p2 - p1) - (p2a - p1a)
                #if diff:
                #    if diff%3:
                #        frame_shift = True
                #    else:
                #        frame_preserving_shift = True

                new_dna.append(sequence_slice(seq2, p1a, p2a))

                if diff:
                    shifts.append((i, dna[-1], new_dna[-1]))

            dna = ''.join(dna)
            new_dna = ''.join(new_dna)

            # This usually indicates a CDS truncated at the start,
            # in which case it will probably fail some way or other down the line.
            if 'codon_start' in feature.qualifiers:
                codon_start = int(feature.qualifiers['codon_start'][0]) - 1
            else:
                codon_start = 0
            dna = dna[codon_start:]
            new_dna = new_dna[codon_start:]

            if len(dna) % 3 != 0:
                warn(locus_tag + ' length not a multiple of 3')
            #assert len(new_dna) % 3 == 0

            protein = Seq.Seq(dna).translate(table=transl_table_no).tostring()
            # http://en.wikipedia.org/wiki/Start_codon is always translated to M
            protein = 'M' + protein[1:]

            if dna[:3] not in start_codons:
                warn(locus_tag + ' has unknown start codon: ' + dna[:3])

            original_lacks_stop_codon = not protein.endswith('*')
            if original_lacks_stop_codon:
                warn(locus_tag + ' lacks end codon')
            original_stops_before_end = '*' in protein[:-1]
            if original_stops_before_end:
                warn(locus_tag + ' contains stop codon before end')

            if 'translation' in feature.qualifiers:
                expect = feature.qualifiers['translation'][0]
                if protein[:-1] != expect:
                    warn(locus_tag + ' translation given in feature does not match translation from DNA')

            new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring()
            new_protein = 'M' + new_protein[1:]

            # If end codon changed, find new end
            # Don't bother if there are unknown amino acids or
            # the original protein lacks a stop codon
            if 'X' not in new_protein and '*' not in new_protein and not original_lacks_stop_codon:
                #This is very inefficient
                i = feature_alignment.end2
                while True:
                    p1 = feature_alignment.back_project(i, left=False)
                    p2 = feature_alignment.back_project(i + 1, left=True)
                    p1a = alignment.project(p1, left=False)
                    p2a = alignment.project(p2, left=False)  #Hmm
                    if p1a < 0 or p2a < 0 or p1a > len(seq2) or p2a > len(seq2):
                        break

                    new_dna += sequence_slice(seq2, p1a, p2a)
                    new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring()
                    new_protein = 'M' + new_protein[1:]
                    if 'X' in new_protein or '*' in new_protein:
                        break

                    i += 1

            # Is the protein shorter?
            # Don't bother checking if the original protein has extra stop codons
            if '*' in new_protein and not original_stops_before_end:
                new_protein = new_protein[:new_protein.index('*') + 1]

            # If indels occurred, do an alignment
            # Don't bother otherwise
            if shifts:
                # Penalize gaps with cost 2 (vs 1 for mismatch)
                # If lengths don't match, pad with spaces (won't match longer seq),
                # aligner prefers mismatch to gaps
                #result = pairwise2.align.globalxs(protein     + ' '*max(0,len(new_protein)-len(protein)),
                #                                  new_protein + ' '*max(0,len(protein)-len(new_protein)),
                #                                  -2.001,-2.000)[0]
                # 2.001 : very slightly prefer contiguous gaps. Also much faster!
                result = band_limited_align(
                    protein + ' ' * max(0, len(new_protein) - len(protein)),
                    new_protein + ' ' * max(0, len(protein) - len(new_protein)),
                    bandwidth)

                protein_ali = result[0]
                new_protein_ali = result[1]
            else:
                protein_ali = protein
                new_protein_ali = new_protein

            diffs = [ ]
            j = 0
            k = 0
            for i in xrange(min(len(new_protein_ali), len(protein_ali))):
                if protein_ali[i] != ' ' and new_protein_ali[i] != ' ' and (
                        protein_ali[i] == '-' or new_protein_ali[i] == '-' or
                        not bio.might_be_same_amino(protein_ali[i], new_protein_ali[i])):
                    diffs.append((i, j, k))
                if protein_ali[i] != '-':
                    j += 1
                if new_protein_ali[i] != '-':
                    k += 1

            diff_start = not bio.might_be_same_base(new_dna[0], dna[0]) or \
                         not bio.might_be_same_base(new_dna[1], dna[1]) or \
                         not bio.might_be_same_base(new_dna[2], dna[2])

            interesting_coverage = False
            if use_coverage:
                cds_depth = depth[feature_alignment.start1:feature_alignment.end1]  #/ median_depth
                if not feature_alignment.forward1:
                    cds_depth = cds_depth[::-1]
                cds_ambiguous_depth = ambiguous_depth[feature_alignment.start1:feature_alignment.end1]  #/ median_ambiguous_depth
                if not feature_alignment.forward1:
                    cds_ambiguous_depth = cds_ambiguous_depth[::-1]

                cds_depth_expect = depth_expect[feature_alignment.start1:feature_alignment.end1]
                if not feature_alignment.forward1:
                    cds_depth_expect = cds_depth_expect[::-1]

                #cds_average_depth_ratio = numpy.average(depth[feature_alignment.start1:feature_alignment.end1]) / median_depth
                #cds_average_ambiguous_depth_ratio = numpy.average(ambiguous_depth[feature_alignment.start1:feature_alignment.end1]) / median_ambiguous_depth
                #line += '%.1f\t' % cds_average_depth_ratio
                #line += '%.1f\t' % cds_average_ambiguous_depth_ratio
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_depth)/median_depth, numpy.maximum.reduce(cds_depth)/median_depth)
                #line += '%.1f+/-%.1f\t' % (numpy.average(cds_depth)/median_depth, numpy.var(cds_depth)**0.5/median_depth)
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_ambiguous_depth)/median_ambiguous_depth, numpy.maximum.reduce(cds_ambiguous_depth)/median_ambiguous_depth)

                avg_expect = numpy.average(cds_depth_expect)
                if avg_expect > 0.0:
                    cds_avg_depth = numpy.average(cds_depth) / avg_expect
                    cds_avg_ambiguous_depth = numpy.average(cds_ambiguous_depth) / avg_expect / ambiguous_factor

                strange = (
                    (cds_depth >= cds_depth_expect * 1.5) |
                    (cds_ambiguous_depth <= cds_depth_expect * (0.5 * ambiguous_factor))
                )

                interesting_coverage = numpy.average(strange) >= coverage_cutoff

            if interesting_coverage or diffs or diff_start or shifts or len(new_protein) != len(protein):
                line = name + '\t' + locus_tag + '\t' + \
                       '%d\t' % (len(protein) - 1) + \
                       '%d\t' % (len(new_protein) - 1) + \
                       '%d\t' % len(diffs)

                if use_coverage:
                    if avg_expect <= 0.0:
                        line += '\t\t\t'
                    else:
                        line += '%.1f\t' % cds_avg_depth + graphlet(cds_depth, cds_depth_expect) + '\t'
                        line += '%.1f\t' % cds_avg_ambiguous_depth + graphlet(cds_ambiguous_depth, cds_depth_expect * ambiguous_factor) + '\t'
                        line += '%.1f%%\t' % (numpy.average(cds_ambiguous_depth > 0.0) * 100.0)

                line += '%s\t' % feature.qualifiers.get('gene', [''])[0] + \
                        '%s' % feature.qualifiers.get('product', [''])[0]

                notes = [ ]

                if use_coverage and 'X' in new_protein:
                    xs = new_protein.count('X')
                    if xs == len(new_protein) - 1:  #First is M, so len-1
                        notes.append('\ No consensus')
                    else:
                        notes.append('\ No consensus for %d aa' % (new_protein.count('X')))

                if len(new_protein) < len(protein):
                    notes.append('\ Shorter by %d aa' % (len(protein) - len(new_protein)))

                if len(new_protein) > len(protein):
                    notes.append('\ Longer by %d aa' % (len(new_protein) - len(protein)))

                if diff_start:
                    notes.append('\ Start changed: %s -> %s' % (dna[:3], new_dna[:3]))
                    if new_dna[:3] not in start_codons:
                        notes.append('  No longer a start codon!')

                if shifts:
                    notes.append('\ Indels:')
                    for pos, old, new in shifts:
                        notes.append('  base %5d / codon %5d   %s -> %s' % (pos + 1, (pos // 3) + 1, old, new or '-'))

                if diffs:
                    if verbose:
                        notes.append('\ Amino acid changes:')
                        for i, j, k in diffs:
                            notes.append('  codon %5d   %s->%s   (%s->%s)' % (
                                j + 1,
                                protein_ali[i],
                                new_protein_ali[i],
                                dna[j * 3:j * 3 + 3] if protein_ali[i] != '-' else '-',
                                new_dna[k * 3:k * 3 + 3] if new_protein_ali[i] != '-' else '-'))

                #if len(new_protein) > len(protein):
                #    print 'New protein is longer:', new_protein[len(protein):]
                #if len(new_protein) < len(protein):
                #    print 'New protein is shorter:', protein[len(new_protein):]
                #print protein
                #print new_protein

                if tabular:
                    print line + '\t' + ' '.join([ ' '.join(note.strip().split()) for note in notes ])
                else:
                    print line
                    for note in notes:
                        print '\t' + note
    return 0
def get_file_info(filename):
    info = selection.Matchable_set()

    info.add('compression-' + get_compression_type(filename))

    if os.path.isdir(filename):
        any = False
        if os.path.exists(join(filename, 'alignments.bam')):
            info.add('type-working')
            any = True
        if os.path.exists(join(filename, 'reference.fa')):
            info.add('type-reference')
            any = True
        if not any:
            raise grace.Error('Unrecognized directory type ' + filename)
    else:
        f = open_possibly_compressed_file(filename)
        peek = f.read(1024)
        f.close()

        if 'compression-bam' in info or peek.startswith('@HD\t'):
            #TODO: sam file might be headerless
            info.add('type-sam')
        elif not peek:
            info.add('type-empty')
            # It's a valid sequence file
            info.add('sequences')
            info.add('qualities')
        elif peek.startswith('>'):
            info.add('type-fasta')
            info.add('sequences')
        elif peek.startswith('LOCUS'):
            info.add('type-genbank')
            info.add('sequences')
        elif peek.startswith('@'):
            info.add('type-fastq')
            info.add('sequences')
            info.add('qualities')
        elif peek.startswith('##gff'):
            info.add('type-gff')
            info.add('sequences')
            info.add('annotations')
        elif peek.startswith('.sff'):
            info.add('type-sff')
            info.add('sequences')
            info.add('qualities')
        elif peek.startswith('##fileformat=VCF'):
            info.add('type-vcf')
        elif peek.split('\n')[0].count('\t') in (7, 8):
            # Possibly unreliable
            info.add('type-gff')
            info.add('sequences')
            info.add('annotations')
        else:
            raise grace.Error('Unrecognized file format for ' + filename)

    return info
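# Sketch of the resulting tag set (the input filename is hypothetical): for a
# gzipped FASTQ, the returned Matchable_set would contain 'compression-gzip',
# 'type-fastq', 'sequences' and 'qualities', so callers can test either
# membership or a selector expression, as the stats code above does.
info = get_file_info('reads.fastq.gz')
assert 'type-fastq' in info and info.matches('sequences')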
def run(self):
    reader_f = io.open_possibly_compressed_file(self.vcf)
    reader = vcf.Reader(reader_f)

    tags = { }
    for item in reader.metadata.get('sampleTags', [ ]):
        parts = item.split(',')
        tags[parts[0]] = parts

    assert 'reference' not in reader.samples, 'Can\'t have a sample called reference, sorry.'

    samples = ['reference'] + reader.samples

    for sample in samples:
        if sample not in tags:
            tags[sample] = [sample, 'all']

    samples = selection.select_and_sort(
        self.select, self.sort, samples, lambda sample: tags[sample])

    required = [
        i for i, sample in enumerate(samples)
        if selection.matches(self.require, tags[sample]) ]

    sample_number = dict((b, a) for a, b in enumerate(reader.samples))

    items = [ ]
    for record in reader:
        variants = get_variants(record)
        genotypes = [ ]
        counts = [ ]
        qualities = [ ]
        for sample in samples:
            if sample == 'reference':
                genotypes.append([0])
                counts.append([1])
                qualities.append(float('inf'))
            else:
                genotypes.append(get_genotype(record.samples[sample_number[sample]]))
                counts.append(get_variant_counts(record.samples[sample_number[sample]]))
                qualities.append(record.samples[sample_number[sample]].data.GQ)

        # Only output when there are at least two genotypes
        any_interesting = False
        for i in xrange(len(genotypes)):
            for j in xrange(i):
                if (genotypes[i] is not None and genotypes[j] is not None and
                        not genotypes_equal(genotypes[i], genotypes[j])):
                    any_interesting = True
                    break
            if any_interesting:
                break
        if not any_interesting:
            continue

        if any(genotypes[i] is None for i in required):
            continue

        if self.only_snps and any(
                genotype is not None and any(len(variants[i]) != 1 for i in genotype)
                for genotype in genotypes):
            continue

        snpeff = snpeff_describe(record.INFO.get('EFF', ''))
        if not any(selection.matches(self.snpeff_filter, item[1])
                   for item in (snpeff or [('', [])])):
            continue

        items.append(_Nway_record(
            variants=variants, genotypes=genotypes, counts=counts,
            qualities=qualities, snpeff=snpeff, record=record))

    self.log.log('%d variants\n\n' % len(items))

    if self.as_ == 'table':
        self._write_table(samples, items)
    elif self.as_ == 'nexus':
        self._write_nexus(samples, items)
    elif self.as_ == 'splitstree':
        self._write_nexus(samples, items)
        io.execute(
            'SplitsTree +g -i INPUT -x COMMAND',
            no_display=True,
            INPUT=self.prefix + '.nex',
            COMMAND='UPDATE; '
                    'SAVE FILE=\'%s.nex\' REPLACE=yes; '
                    'EXPORTGRAPHICS format=svg file=\'%s.svg\' REPLACE=yes TITLE=\'NeighborNet from %d variants\'; '
                    'QUIT'
                    % (self.prefix, self.prefix, len(items)),
        )
    elif self.as_ == 'vcf':
        self._write_vcf(samples, items, reader)
    else:
        raise grace.Error('Unknown output format: ' + self.as_)