def main(args): size, args = grace.get_option_value(args, '--size', int, 200) stride, args = grace.get_option_value(args, '--stride', int, 50) grace.expect_no_further_options(args) if not args: print USAGE return 1 for filename in args: for name, seq in io.read_sequences(filename): name_parts = name.split(None, 1) name = name_parts[0] if len(name_parts) > 1: desc = ' ' + name_parts[1] else: desc = '' for i in xrange(-size + stride, len(seq), stride): start = max(0, min(len(seq), i)) end = max(0, min(len(seq), i + size)) io.write_fasta(sys.stdout, '%s:%d..%d' % (name, start + 1, end) + desc, seq[start:end]) return 0
def main(args): size, args = grace.get_option_value(args,'--size',int,200) stride, args = grace.get_option_value(args,'--stride',int,50) grace.expect_no_further_options(args) if not args: print USAGE return 1 for filename in args: for name, seq in io.read_sequences(filename): name_parts = name.split(None, 1) name = name_parts[0] if len(name_parts) > 1: desc = ' ' + name_parts[1] else: desc = '' for i in xrange(-size+stride,len(seq),stride): start = max(0,min(len(seq),i)) end = max(0,min(len(seq), i+size)) io.write_fasta( sys.stdout, '%s:%d..%d' % (name,start+1,end) + desc, seq[start:end] ) return 0
def normalize(args): min_depth, args = grace.get_option_value(args, '--min-depth', int, 5) grace.expect_no_further_options(args) if len(args) < 2: print NORMALIZE_HELP raise grace.Help_shown() dirnames = args filenames = [] for dirname in dirnames: assert os.path.isdir(dirname), dirname + ' is not a directory' filenames.append( sorted( item for item in os.listdir(dirname) #if item.endswith('.userplot') and not item.endswith('-norm.userplot') if item.endswith('-depth.userplot') and not item.endswith('-ambiguous-depth.userplot') and not item.endswith('-pairspan-depth.userplot'))) for i in xrange(1, len(dirnames)): if filenames[i] != filenames[0]: raise grace.Error('Userplots in %s differ from those in %s' % (dirnames[i], dirnames[0])) filenames = filenames[0] for filename in filenames: normalize_files(dirnames, filename[:-15], min_depth)
def normalize(args): min_depth, args = grace.get_option_value(args, '--min-depth', int, 5) grace.expect_no_further_options(args) if len(args) < 2: print NORMALIZE_HELP raise grace.Help_shown() dirnames = args filenames = [ ] for dirname in dirnames: assert os.path.isdir(dirname), dirname + ' is not a directory' filenames.append(sorted( item for item in os.listdir(dirname) #if item.endswith('.userplot') and not item.endswith('-norm.userplot') if item.endswith('-depth.userplot') and not item.endswith('-ambiguous-depth.userplot') and not item.endswith('-pairspan-depth.userplot') )) for i in xrange(1,len(dirnames)): if filenames[i] != filenames[0]: raise grace.Error('Userplots in %s differ from those in %s' % (dirnames[i], dirnames[0])) filenames = filenames[0] for filename in filenames: normalize_files(dirnames, filename[:-15], min_depth)
def scaffold(args):
    """Record a custom scaffold built from a list of contig numbers.

    Each argument becomes a ('contig', int) entry followed by a
    ('gap', None) entry.  Unless --circular is true, the trailing gap is
    dropped.  The result is appended to the enclosing `scaffolds` list under
    the name "custom_scaffold_<n>".
    """
    circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)

    pieces = []
    for item in args:
        pieces.extend([('contig', int(item)), ('gap', None)])

    if not circular:
        # A linear scaffold has no gap after its final contig.
        del pieces[-1:]

    label = 'custom_scaffold_%d' % (len(scaffolds) + 1)
    scaffolds.append((label, pieces))
def test_power_main(args):
    """Run the power-test scripts around test_counts_main.

    Options (with defaults): --m 10, --n 1000, --reps 2, --count 100,
    --dispersion 0.1, --log-fold 1.0.  The first positional argument is the
    output prefix; anything after it is handed to grace.execute, where an
    "of" command collects extra options that are forwarded to
    test_counts_main.

    Prints TEST_POWER_HELP to stderr and raises grace.Help_shown when no
    output prefix is given.

    NOTE: both templates are filled in with `% locals()`, so the names of the
    local variables in this function are part of its behaviour.  Do not
    rename or remove them without checking the templates.
    """
    m, args = grace.get_option_value(args, '--m', int, 10)
    n, args = grace.get_option_value(args, '--n', int, 1000)
    reps, args = grace.get_option_value(args, '--reps', int, 2)
    count, args = grace.get_option_value(args, '--count', int, 100)
    dispersion, args = grace.get_option_value(args, '--dispersion', float, 0.1)
    log_fold, args = grace.get_option_value(args, '--log-fold', float, 1.0)

    if len(args) < 1:
        print >> sys.stderr, TEST_POWER_HELP
        raise grace.Help_shown()

    output_prefix, args = args[0], args[1:]

    # Arguments given to the "of" command are collected here and later
    # appended to the test_counts_main argument list.
    options = [ ]
    def of(args): options.extend(args)
    grace.execute(args, {'of': of})

    filename = output_prefix + '-input.txt'
    filename_literal = R_literal(filename)
    log_filename_literal = R_literal(output_prefix + '-info.txt')

    # presumably this script writes the '-input.txt' file consumed below -- TODO confirm
    run_script(POWER_TEMPLATE % locals())

    # claimed_fdr is not used directly below; presumably the report template
    # reads it through locals() -- verify against POWER_REPORT_TEMPLATE.
    claimed_fdr = test_counts_main([ output_prefix, filename, 'Experimental' ] + options)

    output_filename_literal = R_literal(output_prefix + '.txt')

    run_script(POWER_REPORT_TEMPLATE % locals())
def debias(args): import numpy radius, args = grace.get_option_value(args, '--radius', int, 2) dirs = args for dir_name in dirs: for name, seq in io.read_sequences( os.path.join(dir_name, 'reference.fa')): for suffix, ambig_suffix in [ ('-depth', '-ambiguous-depth'), ('-pairspan-depth', '-ambiguous-pairspan-depth'), ]: root = grace.filesystem_friendly_name(name) full_name = os.path.join(dir_name, root + suffix + '.userplot') full_ambig_name = os.path.join( dir_name, root + ambig_suffix + '.userplot') if not os.path.exists(full_name): continue if not os.path.exists(full_ambig_name): continue output_suffix = '-%d.userplot' % radius print dir_name, root, output_suffix depths = numpy.array(read_unstranded_userplot(full_name)) ambig_depths = numpy.array( read_unstranded_userplot(full_ambig_name)) expect = expected_depth(root, seq, depths, ambig_depths, radius) write_unstranded_userplot( os.path.join(dir_name, root + suffix + '-expected' + output_suffix), expect) corrected = depths / expect * numpy.median(expect) corrected[expect <= 5.0] = 0.0 write_unstranded_userplot( os.path.join(dir_name, root + suffix + '-corrected' + output_suffix), corrected) ambig_corrected = ambig_depths / expect * numpy.median(expect) ambig_corrected[expect <= 0.0] = 0.0 write_unstranded_userplot( os.path.join( dir_name, root + ambig_suffix + '-corrected' + output_suffix), ambig_corrected)
def plot(args):
    """Plot one curve per input file of newline-separated numbers.

    --log selects a logarithmic y axis.  Each remaining argument is a file
    name, optionally followed by "~~" and a legend label; without "~~" the
    file name itself is used as the label.  A legend is drawn when more than
    one file is given.

    Blank lines in a data file are ignored, an empty file contributes an
    empty curve, and the file is closed even if a value fails to parse.

    Raises ValueError if a non-blank line is not a number.
    """
    log_it, args = grace.get_option_value(args, '--log', grace.as_bool, False)
    grace.expect_no_further_options(args)

    import numpy
    import pylab

    pylab.rcParams['axes.formatter.limits'] = [-20, 20]
    pylab.figure(figsize=(10, 4))

    maximum = 0
    for filename in args:
        parts = filename.split('~~', 1)

        f = open(parts[0], 'rb')
        try:
            # Skip blank lines (e.g. a trailing newline) instead of crashing.
            data = [float(line.strip()) for line in f if line.strip()]
        finally:
            f.close()
        data = numpy.array(data)

        # Reducing an empty array raises, so only update for non-empty data.
        if len(data):
            maximum = max(maximum, numpy.maximum.reduce(data))

        positions = numpy.arange(1, len(data) + 1)
        if log_it:
            pylab.semilogy(positions, data, label=parts[-1])
        else:
            pylab.plot(positions, data, label=parts[-1])

    if len(args) > 1:
        pylab.legend()

    # Leave headroom above the tallest curve (a power on the log axis).
    if log_it:
        pylab.ylim((1, maximum**1.2))
    else:
        pylab.ylim((0, maximum * 1.2))

    pylab.show()
def debias(args): import numpy radius, args = grace.get_option_value(args, '--radius', int, 2) dirs = args for dir_name in dirs: for name, seq in io.read_sequences(os.path.join(dir_name,'reference.fa')): for suffix, ambig_suffix in [ ('-depth', '-ambiguous-depth'), ('-pairspan-depth', '-ambiguous-pairspan-depth'), ]: root = grace.filesystem_friendly_name(name) full_name = os.path.join(dir_name, root + suffix + '.userplot') full_ambig_name = os.path.join(dir_name, root + ambig_suffix + '.userplot') if not os.path.exists(full_name): continue if not os.path.exists(full_ambig_name): continue output_suffix = '-%d.userplot' % radius print dir_name, root, output_suffix depths = numpy.array( read_unstranded_userplot(full_name) ) ambig_depths = numpy.array( read_unstranded_userplot(full_ambig_name) ) expect = expected_depth(root, seq, depths, ambig_depths, radius) write_unstranded_userplot( os.path.join(dir_name, root + suffix + '-expected' + output_suffix), expect) corrected = depths / expect * numpy.median(expect) corrected[expect <= 5.0] = 0.0 write_unstranded_userplot( os.path.join(dir_name, root + suffix + '-corrected' + output_suffix), corrected) ambig_corrected = ambig_depths / expect * numpy.median(expect) ambig_corrected[expect <= 0.0] = 0.0 write_unstranded_userplot( os.path.join(dir_name, root + ambig_suffix + '-corrected' + output_suffix), ambig_corrected)
def plot(args):
    """Draw the numeric series stored in each given file on one figure.

    Arguments are file names, each optionally written as "path~~label".
    The text after "~~" (or the whole argument if there is none) labels the
    curve.  --log switches to a logarithmic y axis.

    Compared with a plain read loop this version closes each file even when
    a line cannot be parsed, skips blank lines, and tolerates empty files.

    Raises ValueError if a non-blank line is not a number.
    """
    log_it, args = grace.get_option_value(args, '--log', grace.as_bool, False)
    grace.expect_no_further_options(args)

    import numpy
    import pylab

    pylab.rcParams['axes.formatter.limits'] = [ -20, 20 ]
    pylab.figure(figsize=(10,4))

    maximum = 0
    for filename in args:
        parts = filename.split('~~', 1)

        data = [ ]
        f = open(parts[0],'rb')
        try:
            for line in f:
                text = line.strip()
                if not text:
                    # A blank line would make float() raise.
                    continue
                data.append(float(text))
        finally:
            # Close the file whether or not parsing succeeded.
            f.close()

        data = numpy.array(data)

        # numpy.maximum.reduce raises on a zero-length array.
        if data.size:
            maximum = max(maximum, numpy.maximum.reduce(data))

        x_values = numpy.arange(1, len(data)+1)
        draw = pylab.semilogy if log_it else pylab.plot
        draw(x_values, data, label=parts[-1])

    if len(args) > 1:
        pylab.legend()

    # Headroom above the highest value: a power on the log axis, a factor otherwise.
    if log_it:
        pylab.ylim( (1, maximum**1.2) )
    else:
        pylab.ylim( (0, maximum*1.2) )

    pylab.show()
def old_main(args): use_indels, args = grace.get_option_value(args,'--indels',int,1) use_reference, args = grace.get_option_value(args,'--reference',int,1) make_list, args = grace.get_option_value(args,'--list',int,0) fasta_output, args = grace.get_option_value(args,'--fasta',int,0) grace.expect_no_further_options(args) if len(args) < 1: sys.stderr.write(USAGE) return 1 if fasta_output and use_indels: print >> sys.stderr, 'Indels will not be included in FASTA output' use_indels = 0 working_dirs = args #reference_data = { } # (ref_name, position, change_type) -> string #strain_data = { } # working_dir -> (ref_name, position, change_type) -> string names = ['reference'] + working_dirs substitution_calls = { } # ref_name -> [ [ call ] ] insertion_calls = { } # ref_name -> [ [ call ] ] substitution_evidence = { } insertion_evidence = { } for name, sequence in io.read_sequences(os.path.join(working_dirs[0], 'reference.fa')): substitution_calls[name] = [ list(sequence.upper()) ] insertion_calls[name] = [ [ '-' ] * len(sequence) ] substitution_evidence[name] = [ [ '' ] * len(sequence) ] insertion_evidence[name] = [ [ '' ] * len(sequence) ] for working_dir in working_dirs: for name in substitution_calls: filename = os.path.join(working_dir, grace.filesystem_friendly_name(name) + '-evidence.txt') f = open(filename,'rb') this_substitution_calls = [ ] this_insertion_calls = [ ] this_substitution_evidence = [ ] this_insertion_evidence = [ ] header = f.readline() if header.count('\t') != 5: print >> sys.stderr, 'Old style evidence file. Please re-run nesoni consensus.' 
return 1 for line in f: fields = line.rstrip('\n').split('\t') this_substitution_calls.append(fields[5]) this_insertion_calls.append(fields[4]) this_substitution_evidence.append(fields[2]) this_insertion_evidence.append(fields[1]) substitution_calls[name].append(this_substitution_calls) insertion_calls[name].append(this_insertion_calls) substitution_evidence[name].append(this_substitution_evidence) insertion_evidence[name].append(this_insertion_evidence) if not use_reference: names.pop(0) for name in substitution_calls: substitution_calls[name].pop(0) insertion_calls[name].pop(0) substitution_evidence[name].pop(0) insertion_evidence[name].pop(0) interesting = find_interesting('substitution', substitution_calls, substitution_evidence) if use_indels: interesting.extend( find_interesting('insertion-before', insertion_calls, insertion_evidence) ) if not use_indels: interesting = [ item for item in interesting if '-' not in item[3] ] interesting.sort() if fasta_output: do_fasta_output(names, interesting) return 0 #strain_reference_having_consensus = { } # working_dir -> ref_name -> string # #for working_dir in working_dirs: # assert working_dir not in strain_data, 'Working directory given twice' # strain_data[working_dir] = { } # # report_file = open(os.path.join(working_dir, 'report.txt'), 'rU') # report_file.readline() # for line in report_file: # ref_name, position, change_type, old, new, evidence = \ # line.rstrip('\n').split('\t') # # if change_type == 'deletion': # change_type = 'substitution' # # if not use_indels and \ # (change_type == 'insertion-before' or new == '-'): # continue # # key = (ref_name, int(position), change_type) # if key in reference_data: # assert reference_data[key] == old # else: # reference_data[key] = old # # strain_data[working_dir][key] = new # report_file.close() # # strain_reference_having_consensus[working_dir] = { } # ref_have_con_filename = os.path.join(working_dir, 'reference_having_consensus.fa') # for name, sequence in 
io.read_fasta(ref_have_con_filename): # strain_reference_having_consensus[working_dir][name] = sequence # #keys = sorted(reference_data) # ##Fill in any blanks #for working_dir in working_dirs: # for key in keys: # if key in strain_data[working_dir]: continue # # # - Positions in report files start from 1 not 0 # # - Insertions must be bracketed # lacks_consensus = ( # strain_reference_having_consensus[working_dir][key[0]][key[1]-1] == 'N' or # (key[2] == 'insertion-before' and key[1] > 1 and # strain_reference_having_consensus[working_dir][key[0]][key[1]-2] == 'N') # ) # # #If there's no consensus, record it as ambiguous # if lacks_consensus: # strain_data[working_dir][key] = 'N' # else: # strain_data[working_dir][key] = reference_data[key] #all_data_names = ([ 'reference' ] if use_reference else []) + working_dirs #all_data = ([ reference_data ] if use_reference else []) + \ # [ strain_data[working_dir] for working_dir in working_dirs ] #all_data_names = ([ 'reference' ] if use_reference else []) + working_dirs ones = ( 1 << len(names) )-1 total_differences = 0 if make_list: print '\t'.join(['Partition','Sequence','Position in reference','Change type'] + names + names) for i in xrange(1,(1<<len(names))-1,2): set1 = [ ] set2 = [ ] for j in xrange(len(names)): if i & (1<<j): set1.append(j) else: set2.append(j) if make_list: print print ', '.join( names[i] for i in set1 ) + ' vs ' + \ ', '.join( names[i] for i in set2 ) print n = 0 for refname, position, change_type, values, has_ambiguous, evidence in interesting: #Skip if *any* ambiguity if has_ambiguous: continue if any( values[i] != values[set1[0]] for i in set1[1:] ) or \ any( values[i] != values[set2[0]] for i in set2[1:] ): continue if make_list: if change_type == 'substitution' and '-' in values: change_type = 'deletion' print '\t%s\t%d\t%s\t' % (refname,position,change_type) + '\t'.join(values) + '\t' + '\t'.join(evidence) n += 1 total_differences += n if not make_list: print ', '.join( names[i] for i in 
set1 ) + ' vs ' + \ ', '.join( names[i] for i in set2 ) + \ ': %d differences' %n if not make_list: print print 'Total: %d' % total_differences if make_list: print print 'Ignored' print n_multiway = 0 n_ambiguous = 0 for refname, position, change_type, values, has_ambiguous, evidence in interesting: confusing = False if has_ambiguous: n_ambiguous += 1 confusing = True elif len(set(values)) > 2: n_multiway += 1 confusing = True if make_list and confusing: print '\t%s\t%d\t%s\t' % (refname,position,change_type) + '\t'.join(values) + '\t' + '\t'.join(evidence) if not make_list: print print 'Ambiguities ignored: %d' % n_ambiguous print 'Multi-way changes ignored: %d' % n_multiway assert total_differences + n_ambiguous + n_multiway == len(interesting) return 0
def report_main(args):
    """Assemble an HTML report directory with a zip of supporting files.

    Options: --title (default 'Report'), --short (name of the file bundle,
    default 'files'), --show-refalign (default True).  The first positional
    argument is the output directory.  The rest is dispatched by
    grace.execute to the commands reference, clips, aligns, extra, file and
    count_log defined below.

    Writes <output_dir>/index.html, copies "file" items next to it, and packs
    references, reports and userplots into <output_dir>/<short>.zip.  The
    staging directory <output_dir>/<short> is emptied before use and removed
    at the end.

    NOTE: HEAD, TAIL and several strings are filled with `% locals()`, so
    local variable names in this function must not be renamed.

    NOTE(review): the zip command is built by string interpolation and run
    through the shell, so output_dir / short_name containing spaces or shell
    metacharacters will break it.
    """
    title, args = grace.get_option_value(args, '--title', str, 'Report')
    short_name, args = grace.get_option_value(args, '--short', str, 'files')
    show_refalign, args = grace.get_option_value(args, '--show-refalign', grace.as_bool, True)

    output_dir, args = args[0], args[1:]

    reference_filenames = [ ]
    clip_filenames = [ ]
    align_dirs = [ ]
    count_log_filenames = [ ]
    extra_items = [ ]
    extra_files = [ ]

    # Command handlers.  They are passed to grace.execute as a list, so their
    # names are kept exactly as they were (including `file`, which shadows
    # the builtin).
    def file(args):
        extra_files.append((args[0], ' '.join(args[1:])))

    def extra(args):
        extra_items.extend(args)

    def reference(args):
        reference_filenames.extend(args)

    def clips(args):
        clip_filenames.extend(args)

    def aligns(args):
        align_dirs.extend(args)

    def count_log(args):
        count_log_filenames.extend(args)

    grace.execute(args, [reference, clips, aligns, extra, file, count_log])

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    file_dir = join(output_dir, short_name)

    if not os.path.isdir(file_dir):
        os.mkdir(file_dir)

    # Start from an empty staging directory.
    for item in os.listdir(file_dir):
        os.unlink(join(file_dir, item))

    for filename in reference_filenames:
        io.copy_file(filename, join(file_dir, os.path.basename(filename)))

    for filename, desc in extra_files:
        io.copy_file(filename, join(output_dir, os.path.basename(filename)))

    pairs = False
    for directory in align_dirs:
        name = os.path.basename(directory)
        io.copy_file(join(directory,'report.txt'), join(file_dir, name + '-report.txt'))

        for extension in [
            '-depth.userplot',
            '-ambiguous-depth.userplot',
            '-pairspan-depth.userplot',
            '-ambiguous-pairspan-depth.userplot',
        ]:
            # Match this exact flavour only: a longer flavour also ends with
            # the shorter extension, so exclude those explicitly.
            filenames = [
                item for item in os.listdir(directory)
                if item.endswith(extension)
                and not item.endswith('-ambiguous'+extension)
                and not item.endswith('-pairspan'+extension)
            ]
            for filename in filenames:
                if len(filenames) == 1:
                    dest = name + extension
                else:
                    dest = name + '-' + filename
                io.copy_file(join(directory,filename), join(file_dir, dest))

                if 'pairspan' in extension:
                    pairs = True

    today = datetime.date.today().strftime('%e %B %Y')

    f = open(join(output_dir, 'index.html'),'wb')
    try:
        print >> f, HEAD % locals()

        section(f, 'Results')

        for item in extra_items:
            p(f, item)

        for filename, desc in extra_files:
            name = os.path.basename(filename)
            p(f, '<a href="%(name)s">%(name)s</a> - %(desc)s' % locals())

        p(f, '<a href="%(short_name)s.zip">%(short_name)s.zip</a>' % locals())
        for filename in reference_filenames:
            bullet(f, os.path.basename(filename) + ' - reference')
        bullet(f, '...-report.txt - report on SNPs and indels found')

        p(f,'Different kinds of userplot:')
        bullet(f,'...-depth.userplot - depth of coverage of unambiguously aligned reads')
        bullet(f,'...-ambiguous-depth.userplot - depth of coverage, including reads that hit multiple locations')
        if pairs:
            bullet(f,'...-pairspan-depth.userplot - depth, including the space between reads in read-pairs')
            bullet(f,'...-ambiguous-pairspan-depth.userplot - as above, but including reads that hit multiple locations')

        if clip_filenames:
            section(f, 'Read clipping')
            for filename in clip_filenames:
                assert filename.endswith('_log.txt')
                name = os.path.basename(filename[:-8])
                text = extract(filename, lambda line: (
                    line.startswith('Fragments:') or
                    line.startswith('Single reads') or
                    line.startswith('Pairs')
                ))
                subsection(f, name)
                pre(f, text)
                end_subsection(f)

        if count_log_filenames:
            section(f, 'Counting alignments to genes')
            for filename in count_log_filenames:
                # Read through an explicit handle so it is closed promptly.
                log_file = open(filename,'rb')
                try:
                    log_text = log_file.read()
                finally:
                    log_file.close()
                pre(f, log_text)

        if align_dirs and show_refalign:
            section(f, 'Reference alignment')
            for directory in align_dirs:
                name = os.path.basename(directory)
                text = extract(join(directory, 'consensus_log.txt'), lambda line: (
                    'reads/pairs' in line or 'unmapped' in line
                ))
                text = text.replace('(discarded)','')
                text = text.replace('reads/pairs kept', 'aligned unambiguously')
                subsection(f, name)
                pre(f, text)
                end_subsection(f)

        print >> f, TAIL % locals()
    finally:
        # Close index.html even if a section fails.
        f.close()

    zip_filename = join(output_dir, short_name + '.zip')
    if os.path.exists(zip_filename):
        os.unlink(zip_filename)

    # Run the command outside the assert: asserts are removed under
    # `python -O`, which previously meant the zip file was never created.
    zip_status = os.system('cd %(output_dir)s ; zip %(short_name)s.zip %(short_name)s/* ' % locals())
    assert zip_status == 0, 'zip exited with status %d' % zip_status

    # The staged copies now live in the zip; remove the staging directory.
    for item in os.listdir(file_dir):
        os.unlink(join(file_dir, item))
    os.rmdir(file_dir)
def main(args):
    """Align reads to reference sequences by running SHRiMP in batches.

    Flags/options: --solid (use rmapper-cs instead of rmapper-ls),
    --verbose (keep SHRiMP's stderr), --threshold (default '68%', passed to
    SHRiMP as -h), --stride (default 1), --cpus (maximum concurrent SHRiMP
    processes, default grace.how_many_cpus()), --batch-size (bases per batch,
    default 5000000).

    Leading positional arguments are the output directory followed by the
    reference files.  "reads"/"--reads", "pairs" and
    "shrimp-options"/"--shrimp-options" sections supply read files and extra
    SHRiMP options.

    Writes reference.fa, config.txt, shrimp_hits.txt.gz and unmapped.fa.gz
    into the output directory.  Returns 1 after printing USAGE when no output
    directory is given, 0 on success.  Any output or temporary file still
    registered as "dirty" when the function exits is deleted.

    NOTE(review): the SHRiMP command line is built by string concatenation
    and run via /bin/sh -c, so paths containing spaces or shell
    metacharacters will not work.
    """
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()

    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')

    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000)

    input_reference_filenames = []
    reads_filenames = []

    # SHRiMP receives the threshold text exactly as given; the numeric form
    # computed below is only recorded in config.txt.  A percentage is stored
    # as a negative fraction.
    shrimp_options = ['-h', threshold]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1]) / 100.0
    else:
        threshold = int(threshold)

    output_dir = []  #As list so can write to from function. Gah.

    # Handles the arguments before the first named section:
    # output directory, then reference files.
    def front_command(args):
        grace.expect_no_further_options(args)

        if len(args) < 1:
            return

        output_dir.append(args[0])
        input_reference_filenames.extend(
            [os.path.abspath(filename) for filename in args[1:]])

    # Each unpaired read file becomes its own one-element group.
    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend([[os.path.abspath(filename)] for filename in args])

    # A pair of files is kept together as one two-element group.
    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append(
            [os.path.abspath(filename) for filename in args])

    def shrimp_options_command(args):
        shrimp_options.extend(args)

    grace.execute(
        args, {
            'reads': reads_command,
            '--reads': reads_command,
            'pairs': pairs_command,
            'shrimp-options': shrimp_options_command,
            '--shrimp-options': shrimp_options_command,
        }, front_command)

    if not output_dir:
        print >> sys.stderr, USAGE % n_cpus
        return 1

    output_dir = output_dir[0]

    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'

    for filename in itertools.chain(input_reference_filenames, *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'

    # Concatenate all references into a single FASTA in the output directory.
    reference_filename = os.path.join(output_dir, 'reference.fa')
    reference_file = open(reference_filename, 'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            #Don't retain any comment
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)

            total_reference_sequences += 1
            total_reference_bases += len(sequence)

    reference_file.close()

    print '%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases),
        's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences),
        's' if total_reference_sequences != 1 else '')

    assert total_reference_bases, 'Reference sequence file is empty'

    # Record the run parameters; the same dict is handed to iter_reads below.
    config = {
        'references': input_reference_filenames,
        'reads': reads_filenames,
        'stride': stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()

    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')

    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')

    # Files to delete if we do not finish cleanly.  Entries are removed as
    # each file is completed or cleaned up.
    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)

    #warn_low_threshold = True

    try:  #Cleanup temporary files

        N = [0]  # batch counter, in a list so do_shrimp can update it

        def do_shrimp(read_set):
            # Start SHRiMP on one batch of reads without waiting for it.
            # Returns a function that waits for the child and collects
            # its output.
            my_number = N[0]
            N[0] += 1

            tempname = os.path.join(output_dir, 'temp%d-%d.fa' % (os.getpid(), my_number))
            tempname_out = os.path.join(
                output_dir, 'temp%d-%d.txt' % (os.getpid(), my_number))

            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)

            f = open(tempname, 'wb')
            for read_name, read_seq in read_set:
                print >> f, '>' + read_name
                print >> f, read_seq
            f.close()

            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      tempname + ' ' + reference_filename + ' >' + tempname_out
            if not verbose:
                command += ' 2>/dev/null'
            #f = os.popen(command, 'r')
            child_pid = os.spawnl(os.P_NOWAIT, '/bin/sh', '/bin/sh', '-c', command)
            #print 'SHRiMP %d running' % my_number

            def finalize():
                # Block until this batch's SHRiMP process has exited.
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'

                hits = {}  # read_name -> [ hit line ]

                f = open(tempname_out, 'rb')
                for line in f:
                    if line.startswith('>'):
                        read_name = line.split(None, 1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = []
                        hits[read_name].append(line)
                f.close()

                # Emit results in the original read order; reads without
                # any hit go to the unmapped file.
                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        print >> unmapped_file, '>' + read_name
                        print >> unmapped_file, read_seq

                output_file.flush()
                unmapped_file.flush()

                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)
                #print 'SHRiMP %d finished' % my_number

            return finalize

        # Pending finalize functions, oldest first.
        shrimps = []

        reader = iter_reads(config)
        read_count = 0

        while True:
            read_set = []
            read_set_bases = 0

            #Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs

            # Take reads from the shared iterator until the batch holds at
            # least batch_size bases or the reads run out.
            for read_name, read_seq in reader:
                read_name = read_name.split()[0]
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)

                #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match
                #    sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n')
                #    warn_low_threshold = False

                read_count += 1
                if read_set_bases >= batch_size:
                    break

            if not read_set:
                break

            # At the process limit: wait for the oldest batch before
            # launching another.
            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append(do_shrimp(read_set))

            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))

        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps))
            shrimps.pop(0)()

        grace.status('')

        # Success: close the outputs and unregister them so they are kept.
        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)

        return 0

    finally:
        # Remove whatever is still registered as incomplete.
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
def main(args):
    """Report how consensus changes affect the CDS features of a GenBank file.

    Options: --transl_table (default 11, used when a CDS has no
    /transl_table), --use-coverage, --coverage-cutoff (default 0.1),
    --tabular, --noheader, --verbose, --band (alignment bandwidth,
    default 20).

    Positional arguments: the GenBank file, then an alignment file or a
    directory containing alignment.maf.

    For every CDS whose protein, start codon, length or (optionally) coverage
    looks changed, one line is printed, followed by notes (joined onto the
    same line with --tabular).

    Returns 1 after printing USAGE on a wrong argument count, else 0.
    Raises grace.Error if a GenBank record matches no reference sequence.

    NOTE(review): block nesting in this function was reconstructed from a
    whitespace-collapsed copy; spots where the nesting was inferred are
    marked below.
    """
    default_transl_table, args = grace.get_option_value(
        args, '--transl_table', int, 11)
    use_coverage, args = grace.get_flag(args, '--use-coverage')
    coverage_cutoff, args = grace.get_option_value(args, '--coverage-cutoff', float, 0.1)
    tabular, args = grace.get_flag(args, '--tabular')
    noheader, args = grace.get_flag(args, '--noheader')
    verbose, args = grace.get_flag(args, '--verbose')
    bandwidth, args = grace.get_option_value(args, '--band', int, 20)
    grace.expect_no_further_options(args)

    if len(args) != 2:
        print USAGE
        return 1

    genbank_filename = args[0]
    alignment_filename = args[1]

    if os.path.isdir(alignment_filename):
        alignment_filename = os.path.join(alignment_filename, 'alignment.maf')

    # Depth graphs are looked up beside the alignment file.
    working_dir = os.path.split(alignment_filename)[0]

    alignments = load_alignments(alignment_filename)

    # Not used anywhere below.
    summaries = []
    details = []

    if not noheader:
        fields = 'Sequence\tLocus tag\tOld length (aa)\tNew length (aa)\tAmino acid changes\t'
        if use_coverage:
            fields += 'Unambiguous coverage vs expected\t\tAmbiguous coverage vs expected\t\tAmbiguous percent with any hits\t'
        fields += 'Gene\tProduct'
        if tabular:
            fields += '\tChanges of note'
        print fields

    for record in SeqIO.parse(
            io.open_possibly_compressed_file(genbank_filename), 'genbank'):
        sequence = record.seq.tostring()

        # Find the alignment whose first sequence is exactly this record.
        # The for/else raises when none matches.
        for name, seq1, seq2, alignment in alignments:
            if seq1 == sequence:
                break
        else:
            raise grace.Error(
                'Genbank record %s sequence not identical to any reference sequence'
                % record.id)

        if use_coverage:
            depth = get_graph(working_dir, name, 'depth')
            ambiguous_depth = get_graph(working_dir, name, 'ambiguous-depth')
            median_depth = numpy.median(depth)
            median_ambiguous_depth = numpy.median(ambiguous_depth)
            ambiguous_factor = float(median_ambiguous_depth) / median_depth
            depth_expect = expected_depth(name, sequence, depth, ambiguous_depth)

        for feature in record.features:
            if feature.type != 'CDS':
                continue

            # Fall back to the 1-based coordinates when there is no locus tag.
            if 'locus_tag' not in feature.qualifiers:
                locus_tag = '%d..%d' % (feature.location.nofuzzy_start + 1,
                                        feature.location.nofuzzy_end)
            else:
                locus_tag = feature.qualifiers['locus_tag'][0]

            if 'transl_table' in feature.qualifiers:
                transl_table_no = int(feature.qualifiers['transl_table'][0])
            else:
                assert default_transl_table is not None, 'No /transl_table for CDS, and default transl_table not given'
                transl_table_no = default_transl_table

            transl_table = CodonTable.ambiguous_dna_by_id[transl_table_no]
            start_codons = transl_table.start_codons

            try:
                feature_alignment = alignment_from_feature(sequence, feature)
            except Weird_alignment:
                warn('%s has a location I could not handle, skipping, sorry' % locus_tag)
                continue

            # Walk the feature base by base, collecting the old base and the
            # corresponding stretch of the new sequence.  Positions where the
            # two spans differ in length are recorded as shifts (indels).
            dna = []
            new_dna = []
            shifts = []
            for i in xrange(feature_alignment.end2):
                p1 = feature_alignment.back_project(i, left=False)
                p2 = feature_alignment.back_project(i + 1, left=True)
                assert abs(p2 - p1) < 2
                dna.append(sequence_slice(sequence, p1, p2))

                p1a = alignment.project(p1, left=False)
                p2a = alignment.project(p2, left=False)  #Hmm

                diff = (p2 - p1) - (p2a - p1a)
                #if diff:
                #    if diff%3:
                #        frame_shift = True
                #    else:
                #        frame_preserving_shift = True
                new_dna.append(sequence_slice(seq2, p1a, p2a))

                if diff:
                    shifts.append((i, dna[-1], new_dna[-1]))

            dna = ''.join(dna)
            new_dna = ''.join(new_dna)

            # This usually indicated a CDS truncated at the start?
            # in which case, will probably fail some way or other down the line.
            if 'codon_start' in feature.qualifiers:
                codon_start = int(feature.qualifiers['codon_start'][0]) - 1
            else:
                codon_start = 0
            dna = dna[codon_start:]
            new_dna = new_dna[codon_start:]

            if len(dna) % 3 != 0:
                warn(locus_tag + ' length not a multiple of 3')
            #assert len(new_dna) % 3 == 0

            protein = Seq.Seq(dna).translate(table=transl_table_no).tostring()
            # http://en.wikipedia.org/wiki/Start_codon is always translated to M
            protein = 'M' + protein[1:]

            if dna[:3] not in start_codons:
                warn(locus_tag + ' has unknown start codon: ' + dna[:3])

            original_lacks_stop_codon = not protein.endswith('*')
            if original_lacks_stop_codon:
                warn(locus_tag + ' lacks end codon')
            original_stops_before_end = '*' in protein[:-1]
            if original_stops_before_end:
                warn(locus_tag + ' contains stop codon before end')

            if 'translation' in feature.qualifiers:
                expect = feature.qualifiers['translation'][0]
                # protein[:-1] drops the final character (normally the stop).
                if protein[:-1] != expect:
                    warn(
                        locus_tag +
                        ' translation given in feature does not match translation from DNA'
                    )

            new_protein = Seq.Seq(new_dna).translate(
                table=transl_table_no).tostring()
            new_protein = 'M' + new_protein[1:]

            # If end codon changed, find new end
            # Don't bother if there are unknown amino acids or
            # the original protein lacks a stop codon
            if 'X' not in new_protein and '*' not in new_protein and not original_lacks_stop_codon:
                #This is very inefficient
                # Extend past the feature one base at a time, retranslating
                # each time, until a stop or unknown residue appears or the
                # projection leaves the new sequence.
                i = feature_alignment.end2
                while True:
                    p1 = feature_alignment.back_project(i, left=False)
                    p2 = feature_alignment.back_project(i + 1, left=True)
                    p1a = alignment.project(p1, left=False)
                    p2a = alignment.project(p2, left=False)  #Hmm
                    if p1a < 0 or p2a < 0 or p1a > len(seq2) or p2a > len(
                            seq2):
                        break

                    new_dna += sequence_slice(seq2, p1a, p2a)
                    new_protein = Seq.Seq(new_dna).translate(
                        table=transl_table_no).tostring()
                    new_protein = 'M' + new_protein[1:]
                    if 'X' in new_protein or '*' in new_protein:
                        break

                    i += 1

            # Is the protein shorter?
            # Don't bother checking if the original protein has extra stop codons
            if '*' in new_protein and not original_stops_before_end:
                new_protein = new_protein[:new_protein.index('*') + 1]

            # If indels occurred, do an alignment
            # Don't bother otherwise
            if shifts:
                # Penalize gaps with cost 2 (vs 1 for mismatch)
                # If lengths don't match, pad with spaces (won't match longer seq),
                # aligner prefers mismatch to gaps

                #result = pairwise2.align.globalxs(protein + ' '*max(0,len(new_protein)-len(protein)),
                #                                  new_protein + ' '*max(0,len(protein)-len(new_protein)),
                #                                  -2.001,-2.000)[0]
                # 2.001 : very slightly prefer contiguous gaps. Also much faster!

                result = band_limited_align(
                    protein + ' ' * max(0, len(new_protein) - len(protein)),
                    new_protein + ' ' * max(0, len(protein) - len(new_protein)),
                    bandwidth)

                protein_ali = result[0]
                new_protein_ali = result[1]
            else:
                protein_ali = protein
                new_protein_ali = new_protein

            # diffs holds (alignment column, old protein index, new protein
            # index) for each differing column.  Padding spaces never count;
            # gaps ('-') always do.
            diffs = []
            j = 0
            k = 0
            for i in xrange(min(len(new_protein_ali), len(protein_ali))):
                if protein_ali[i] != ' ' and new_protein_ali[i] != ' ' and (
                        protein_ali[i] == '-' or new_protein_ali[i] == '-' or
                        not bio.might_be_same_amino(protein_ali[i],
                                                    new_protein_ali[i])):
                    diffs.append((i, j, k))
                if protein_ali[i] != '-':
                    j += 1
                if new_protein_ali[i] != '-':
                    k += 1

            # The start codon is translated as M regardless, so compare its
            # three bases directly.
            diff_start = not bio.might_be_same_base(new_dna[0],dna[0]) or \
                         not bio.might_be_same_base(new_dna[1],dna[1]) or \
                         not bio.might_be_same_base(new_dna[2],dna[2])

            interesting_coverage = False
            if use_coverage:
                # Depth slices over the feature, reversed for features not on
                # the forward strand (forward1 false).
                cds_depth = depth[feature_alignment.start1:
                                  feature_alignment.end1]  #/ median_depth
                if not feature_alignment.forward1:
                    cds_depth = cds_depth[::-1]
                cds_ambiguous_depth = ambiguous_depth[
                    feature_alignment.start1:
                    feature_alignment.end1]  #/ median_ambiguous_depth
                if not feature_alignment.forward1:
                    cds_ambiguous_depth = cds_ambiguous_depth[::-1]

                cds_depth_expect = depth_expect[feature_alignment.
                                                start1:feature_alignment.end1]
                if not feature_alignment.forward1:
                    cds_depth_expect = cds_depth_expect[::-1]

                #cds_average_depth_ratio = numpy.average(depth[feature_alignment.start1:feature_alignment.end1]) / median_depth
                #cds_average_ambiguous_depth_ratio = numpy.average(ambiguous_depth[feature_alignment.start1:feature_alignment.end1]) / median_ambiguous_depth
                #line += '%.1f\t' % cds_average_depth_ratio
                #line += '%.1f\t' % cds_average_ambiguous_depth_ratio

                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_depth)/median_depth, numpy.maximum.reduce(cds_depth)/median_depth)
                #line += '%.1f+/-%.1f\t' % (numpy.average(cds_depth)/median_depth, numpy.var(cds_depth)**0.5/median_depth)
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_ambiguous_depth)/median_ambiguous_depth, numpy.maximum.reduce(cds_ambiguous_depth)/median_ambiguous_depth)

                avg_expect = numpy.average(cds_depth_expect)
                if avg_expect > 0.0:
                    cds_avg_depth = numpy.average(cds_depth) / avg_expect
                    cds_avg_ambiguous_depth = numpy.average(
                        cds_ambiguous_depth) / avg_expect / ambiguous_factor

                # NOTE(review): nesting inferred -- `strange` and
                # `interesting_coverage` are taken to be computed whenever
                # use_coverage is set, not only when avg_expect > 0.  Confirm
                # against the original layout.
                strange = ((cds_depth >= cds_depth_expect * 1.5) |
                           (cds_ambiguous_depth <= cds_depth_expect *
                            (0.5 * ambiguous_factor)))

                interesting_coverage = numpy.average(
                    strange) >= coverage_cutoff

            if interesting_coverage or diffs or diff_start or shifts or len(
                    new_protein) != len(protein):
                # Lengths exclude one character (len - 1).
                line = name + '\t' + locus_tag + '\t' + \
                       '%d\t' % (len(protein)-1) + \
                       '%d\t' % (len(new_protein)-1) + \
                       '%d\t' % len(diffs)

                if use_coverage:
                    if avg_expect <= 0.0:
                        line += '\t\t\t'
                    else:
                        line += '%.1f\t' % (cds_avg_depth) + graphlet(
                            cds_depth, cds_depth_expect) + '\t'
                        line += '%.1f\t' % (
                            cds_avg_ambiguous_depth) + graphlet(
                                cds_ambiguous_depth,
                                cds_depth_expect * ambiguous_factor) + '\t'
                        # NOTE(review): nesting inferred -- this percentage is
                        # placed inside the else branch.  The empty branch adds
                        # 3 tabs while this branch adds 5 (matching the
                        # header's 5 coverage columns); check alignment.
                        line += '%.1f%%\t' % (
                            numpy.average(cds_ambiguous_depth > 0.0) * 100.0)

                line += '%s\t' % feature.qualifiers.get('gene',[''])[0] + \
                        '%s' % feature.qualifiers.get('product',[''])[0]

                # NOTE(review): several note strings below begin with a
                # backslash followed by a space; as written that backslash is
                # part of the output.  Presumably unintended -- verify.
                notes = []

                if use_coverage and 'X' in new_protein:
                    xs = new_protein.count('X')
                    if xs == len(new_protein) - 1:  #First is M, so len-1
                        notes.append('\ No consensus')
                    else:
                        notes.append('\ No consensus for %d aa' %
                                     (new_protein.count('X')))

                if len(new_protein) < len(protein):
                    notes.append('\ Shorter by %d aa' %
                                 (len(protein) - len(new_protein)))

                if len(new_protein) > len(protein):
                    notes.append('\ Longer by %d aa' %
                                 (len(new_protein) - len(protein)))

                if diff_start:
                    notes.append('\ Start changed: %s -> %s' %
                                 (dna[:3], new_dna[:3]))
                    if new_dna[:3] not in start_codons:
                        notes.append(' No longer a start codon!')

                if shifts:
                    notes.append('\ Indels:')

                    for pos, old, new in shifts:
                        notes.append(' base %5d / codon %5d %s -> %s' %
                                     (pos + 1, (pos // 3) + 1, old, new
                                      or '-'))

                if diffs:
                    # NOTE(review): nesting inferred -- the per-change listing
                    # is taken to be part of the --verbose branch.
                    if verbose:
                        notes.append('\ Amino acid changes:')
                        for i, j, k in diffs:
                            notes.append(
                                ' codon %5d %s->%s (%s->%s)' %
                                (j + 1, protein_ali[i], new_protein_ali[i],
                                 dna[j * 3:j * 3 + 3]
                                 if protein_ali[i] != '-' else '-',
                                 new_dna[k * 3:k * 3 + 3]
                                 if new_protein_ali[i] != '-' else '-'))

                #if len(new_protein) > len(protein):
                #    print 'New protein is longer:', new_protein[len(protein):]
                #if len(new_protein) < len(protein):
                #    print 'New protein is shorter:', protein[len(new_protein):]
                #print protein
                #print new_protein

                if tabular:
                    # One line per CDS: whitespace inside each note is
                    # normalised and the notes are space-joined.
                    print line + '\t' + ' '.join(
                        [' '.join(note.strip().split()) for note in notes])
                else:
                    print line
                    for note in notes:
                        print '\t' + note

    return 0
def main(args): title1, args = grace.get_option_value(args, "--title1", str, None) title2, args = grace.get_option_value(args, "--title2", str, None) grace.expect_no_further_options(args) if len(args) != 3: print >> sys.stderr, USAGE return 1 working_dir1 = args[0] working_dir2 = args[1] cutoff = float(args[2]) sequence_names = [name for name, sequence in io.read_sequences(os.path.join(working_dir1, "reference.fa"))] if title1 is None: title1 = working_dir1 if title2 is None: title2 = working_dir2 n = 1 while significance([("A", n)], [("T", n)], 1.0) > cutoff: n += 1 print "%g\tsignificance cutoff" % cutoff print "%d\tdepth required to call substitution (greater if there are errors in the reads)" % n print "Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s" % ( title1, title2, title1, title2, ) for sequence_name in sequence_names: filename1 = os.path.join(working_dir1, grace.filesystem_friendly_name(sequence_name) + "-evidence.txt") filename2 = os.path.join(working_dir2, grace.filesystem_friendly_name(sequence_name) + "-evidence.txt") for (pos1, ins1, sub1, ref1, conins1, consub1), (pos2, ins2, sub2, ref2, conins2, consub2) in itertools.izip( read_file(filename1), read_file(filename2) ): assert pos1 == pos2 and ref1 == ref2 if pos1 % 1000 == 0: grace.status("Testing %s %d" % (sequence_name, pos1)) dec_ins1 = io.decode_evidence(ins1) dec_ins2 = io.decode_evidence(ins2) if dec_ins1 and dec_ins2: sig = significance(io.decode_evidence(ins1), io.decode_evidence(ins2), cutoff) if sig is not None and sig <= cutoff: grace.status("") print "%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s" % ( sequence_name, pos1, "insertion-before", ins1, ins2, sig, conins1, conins2, ) dec_sub1 = io.decode_evidence(sub1) dec_sub2 = io.decode_evidence(sub2) if dec_sub1 and dec_sub2: sig = significance(dec_sub1, dec_sub2, cutoff) if sig is not None and sig <= cutoff: if dec_sub1[0][0] == "-" or dec_sub2[0][0] == "-": what = "deletion" elif 
dec_sub1[0][0] != dec_sub2[0][0]: what = "substitution" else: what = "different mix" grace.status("") print "%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s" % ( sequence_name, pos1, what, ref1, sub1, sub2, sig, consub1, consub2, ) grace.status("") return 0
def pastiche(args): if len(args) < 4: print USAGE return 1 mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool, False) min_leftover, args = grace.get_option_value(args, '--min-leftover', int, 20) output_dir, args = args[0], args[1:] #, ref_filename, contig_filenames = args[0], args[1], args[2:] ref_filenames = [] contig_filenames = [] grace.execute(args, {'contigs': lambda args: contig_filenames.extend(args)}, lambda args: ref_filenames.extend(args)) assert ref_filenames, 'No reference sequences given' assert contig_filenames, 'No contig sequences given' contigs = dict([(name.split()[0], seq) for filename in contig_filenames for name, seq in io.read_sequences(filename)]) dir_contigs = {} for name in contigs: dir_contigs[name + '+'] = contigs[name] dir_contigs[name + '-'] = bio.reverse_complement(contigs[name]) dir_contigs_used = {} for name in dir_contigs: dir_contigs_used[name] = [False] * len(dir_contigs[name]) workspace = io.Workspace(output_dir) temp_prefix = workspace._object_filename('temp-pastiche') out_f = workspace.open('pastiche.fa', 'wb') for ref_filename in ref_filenames: for ref_name, ref_seq in io.read_sequences(ref_filename): ref_name = ref_name.split()[0] grace.status(ref_name) f = open(temp_prefix + '.fa', 'wb') io.write_fasta(f, 'ref', ref_seq) f.close() scores = [-1] * (len(ref_seq) * 2) strings = ['N', ''] * (len(ref_seq)) contexts = [None for i in xrange(len(ref_seq) * 2)] #MAXSCORE = len(ref_seq)+1 #for i in xrange(len(ref_seq)): # if ref_seq[i].upper() != 'N': # strings[i*2] = ref_seq[i] # scores[i*2] = MAXSCORE #for i in xrange(len(ref_seq)-1): # if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N': # scores[i*2+1] = MAXSCORE if mask_only: for i in xrange(len(ref_seq)): strings[i * 2] = ref_seq[i].lower() def put(position, dir_contig_name, start, end, score): if scores[position] < score: scores[position] = score strings[position] = dir_contigs[dir_contig_name][start:end] contexts[position] = (dir_contig_name, 
start, end, score) for contig_filename in contig_filenames: execute([ 'nucmer', '--prefix', temp_prefix, #'--maxmatch', #Very slow '--nosimplify', '--minmatch', '9', '--mincluster', '50', #'--maxgap', '1000', #'--breaklen', '1000', # Increasing this reduces Ns, but is slow #'--diagfactor', '1.0', temp_prefix + '.fa', contig_filename ]) for contig_name, contig_seq in io.read_sequences( contig_filename): contig_name = contig_name.split()[0] grace.status(ref_name + ' vs ' + contig_name) p = run([ 'show-aligns', temp_prefix + '.delta', 'ref', contig_name ], stderr=subprocess.PIPE) alignments = [] while True: line = p.stdout.readline() if not line: break if not line.startswith('-- BEGIN'): continue parts = line.split() ref_start = int(parts[5]) ref_end = int(parts[7]) query_start = int(parts[10]) query_end = int(parts[12]) #assert ref_start < ref_end #ref_start -= 1 #Zero based coordinates al_ref = [] al_query = [] while True: block = [] end = False while True: line = p.stdout.readline() if line.startswith('-- END'): end = True break if line == '\n': if block: break else: continue block.append(line) if end: break al_ref.append(block[0].split()[1]) al_query.append(block[1].split()[1]) al_ref = ''.join(al_ref) al_query = ''.join(al_query) if ref_start > ref_end: al_ref = bio.reverse_complement(al_ref) al_query = bio.reverse_complement(al_query) ref_start, ref_end = ref_end, ref_start query_start, query_end = query_end, query_start if query_start > query_end: dir_contig_name = contig_name + '-' query_start = len(contig_seq) + 1 - query_start query_end = len(contig_seq) + 1 - query_end else: dir_contig_name = contig_name + '+' ref_start -= 1 #Zero based coordinates query_start -= 1 #print al_ref #print al_query #Pretty dumb scoring scheme al_score = 0 for i in xrange(len(al_ref)): if al_ref[i] == al_query[i]: al_score += 1 #else: # al_score -= 1 #Pastiche alignment over reference ref_pos = ref_start query_pos = query_start al_pos = 0 while al_pos < len(al_ref): assert 
al_ref[al_pos] != '.' if al_query[al_pos] == '.': put(ref_pos * 2, dir_contig_name, query_pos, query_pos, al_score) else: assert al_query[al_pos].lower() == dir_contigs[ dir_contig_name][query_pos].lower() put(ref_pos * 2, dir_contig_name, query_pos, query_pos + 1, al_score) query_pos += 1 al_pos += 1 al_pos_end = al_pos query_pos_end = query_pos while al_pos_end < len( al_ref) and al_ref[al_pos_end] == '.': al_pos_end += 1 query_pos_end += 1 #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score) assert al_query[al_pos:al_pos_end].lower( ) == dir_contigs[dir_contig_name][ query_pos:query_pos_end].lower() put(ref_pos * 2 + 1, dir_contig_name, query_pos, query_pos_end, al_score) al_pos = al_pos_end query_pos = query_pos_end ref_pos += 1 p.wait() grace.status(ref_name) result = ''.join(strings) io.write_fasta(out_f, ref_name, result) for context in contexts: if context is None: continue name, start, end, score = context for i in xrange(start, end): dir_contigs_used[name][i] = True #Interpolation #result = [ ] #i = 0 #while i < len(ref_seq): # if strings[i*2].upper() != 'N': # result.append(strings[i*2]) # result.append(strings[i*2+1]) # i += 1 # continue # # j = i # while strings[j*2].upper() == 'N': # j += 1 # # grace.status('') # print >> sys.stderr, 'interpolating', i+1,'..',j # # window = 20 #!!!!!!!!!!! 
# left_contexts = collections.defaultdict(lambda:0) # for i1 in xrange(max(0,i-window),i): # for context_name, context_start, context_end, context_score in contexts[i1*2]: # key = (context_name, context_end + i - i1) # left_contexts[key] = max(left_contexts[key],context_score) # # right_contexts = collections.defaultdict(lambda:0) # for j1 in xrange(j,min(j+window,len(ref_seq))): # for context_name, context_start, context_end, context_score in contexts[j1*2]: # key = (context_name, context_start + j - j1) # right_contexts[key] = max(left_contexts[key],context_score) # # #print >> sys.stderr, left_contexts # #print >> sys.stderr, right_contexts # # options = [ ] # # for (left_name, left_pos), left_score in left_contexts.items(): # for (right_name, right_pos), right_score in right_contexts.items(): # if left_name != right_name: continue # if right_pos < left_pos: continue # # if right_pos-left_pos > (j-i) * 4.0 + 10: continue #!!!!!!!!!!!!!!!!!!!!!!1 # if right_pos-left_pos < (j-i) * 0.25 - 10: continue # # score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i) # score *= left_score + right_score # #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score # options.append( (score, left_name, left_pos, right_pos) ) # # if options: # best = max(options, key=lambda option: option[0]) # print >> sys.stderr, '->', best # result.append( dir_contigs[best[1]][best[2]:best[3]].lower() ) # else: # print >> sys.stderr, '-> no good interpolation' # result.append( ref_seq[i:j] ) # # i = j # #result = ''.join(result) #io.write_fasta(sys.stdout, ref_name, result) #print >> sys.stderr, len(result), result.count('N') #for pos, size in N_runs: # out_size = len(''.join( strings[pos*2:pos*2+2] )) # print >> sys.stderr, pos, size, '->', out_size out_f.close() grace.status('') #for name, seq in io.read_sequences(ref_filename): # result = pastiche(seq, contigs_filename) # io.write_fasta(sys.stdout, name, result) leftover_f = workspace.open('leftovers.fa', 'wb') for 
name in sorted(contigs): used = [ (a or b) for a, b in zip(dir_contigs_used[name + '+'], dir_contigs_used[name + '-'][::-1]) ] i = 0 while i < len(used): j = i while j < len(used) and not used[j]: j += 1 if j - i > min_leftover: if i == 0 and j == len(used): out_name = name else: out_name = name + ':%d..%d' % (i + 1, j) io.write_fasta(leftover_f, out_name, contigs[name][i:j]) i = j + 1 leftover_f.close() for suffix in ['.fa', '.delta']: os.unlink(temp_prefix + suffix)
def batch_main(args): options = Options() options.references = [ ] options.clip_options = [ ] options.shrimp_options = [ ] options.do_consensus = True options.consensus_options = [ ] options.samples = [ ] options.do_count = False options.count_options = [ ] options.tests = [ ] options.report_options = [ ] default_nesoni = sys.executable + ' ' + sys.argv[0] options.nesoni, args = grace.get_option_value(args, '--nesoni', str, default_nesoni) options.pypy_nesoni, args = grace.get_option_value(args, '--pypy-nesoni', str, options.nesoni) options.prefix, args = grace.get_option_value(args,'--input-prefix', str, None) options.submit, args = grace.get_option_value(args,'--submit', str, '%') assert '%' in options.submit, 'Bad submit pattern' options.damp, args = grace.get_option_value(args, '--damp-run', grace.as_bool, False) options.run, args = grace.get_option_value(args, '--run', int, None) def absolutize(filename): if options.prefix is not None: return options.prefix + filename else: return io.abspath(filename) def path_param(filenames, damp=False): if damp: filenames = [ item+'~~first:10000' for item in filenames ] return ' '.join(absolutize(filename) for filename in filenames) def default(args): grace.expect_no_further_options(args) if len(args) != 1: print >> sys.stderr, BATCH_HELP % default_nesoni raise grace.Help_shown() options.dirname = args[0] def reference(args): grace.expect_no_further_options(args) options.references.extend(args) def do_clip(args): options.clip_options.extend(args) def do_shrimp(args): options.shrimp_options.extend(args) def do_consensus(args): if args == ['no']: options.do_consensus = False else: options.consensus_options.extend(args) def sample(args): sample = Options() sample.imported = False sample.reads = [ ] sample.pairs = [ ] sample.interleaved = [ ] options.samples.append(sample) def default(args): assert len(args) == 1, 'Expected a sample name in "sample:"' sample.name = args[0] def reads(args): grace.expect_no_further_options(args) 
sample.reads.extend(args) def pairs(args): grace.expect_no_further_options(args) assert len(args) == 2, 'Expected exactly two files in "pairs:"' sample.pairs.append(args) def interleaved(args): grace.expect_no_further_options(args) sample.interleaved.extend(args) grace.execute(args, [reads,pairs,interleaved], default) assert sample.reads or sample.pairs or sample.interleaved, 'No reads for sample' def import_(args): grace.expect_no_further_options(args) for item in args: sample = Options() options.samples.append(sample) sample.imported = True sample.clip_dest = None sample.align_dest = absolutize(item) def do_count(args): options.do_count = True options.count_options.extend(args) def do_test_counts(args): assert len(args) > 1, 'Incorrect parameters for test-counts' test = Options() options.tests.append(test) test.args = args def do_report(args): options.report_options.extend(args) grace.execute(args, [ reference, do_clip, do_shrimp, do_consensus, sample, import_, do_count, do_test_counts, do_report, ], default) if options.damp: options.dirname += '-damp' if options.tests: options.do_count = True batch = Batch(options.dirname, options.submit) for sample in options.samples: if sample.imported: continue # CLIP =========================================== batch.require_dir('clip') sample.clip_dest = join('clip', sample.name) command = ( options.pypy_nesoni + ' clip: ' + sample.clip_dest ) if options.clip_options: command += ' ' + quote_param(options.clip_options) if sample.reads: command += ' reads: ' + path_param(sample.reads, options.damp) for pair in sample.pairs: command += ' pairs: ' + path_param(pair, options.damp) if sample.interleaved: command += ' interleaved: ' + path_param(sample.interleaved, options.damp) sample.has_pairs = bool(sample.pairs) or bool(sample.interleaved) sample.clip_state = batch.target( sample.clip_dest, [], command ) # ALIGN ========================================== batch.require_dir('align') sample.align_dest = join('align', sample.name) 
command = options.pypy_nesoni + ' samshrimp: ' + sample.align_dest command += ' ' + path_param(options.references) command += ' reads: ' + sample.clip_dest + '_single.fq.gz' if sample.has_pairs: command += ' interleaved: ' + sample.clip_dest + '_paired.fq.gz' command += ' ' + quote_param(options.shrimp_options) sample.align_state = batch.target( sample.align_dest, [ sample.clip_state ], command ) # CONSENSUS ======================================= if options.do_consensus: command = ( options.pypy_nesoni + ' samconsensus: ' + sample.align_dest + ' ' + quote_param(options.consensus_options) ) sample.consensus_state = batch.target( join(sample.align_dest, 'consensus'), [ sample.align_state ], command ) batch.virtual_target( 'clip', [ sample.clip_state for sample in options.samples if not sample.imported ] ) batch.virtual_target( 'align', [ sample.align_state for sample in options.samples if not sample.imported ] ) # COUNT ========================================== if options.do_count: command = options.pypy_nesoni + ' samcount: counts ' + quote_param(options.count_options) command += ' ' + ' '.join( sample.align_dest for sample in options.samples ) options.counts_state = batch.target( 'count', [ (sample.consensus_state if options.do_consensus else sample.align_state) for sample in options.samples if not sample.imported ], # count: --filter existing can depend on consensus command ) batch.virtual_target('count', [ options.counts_state ]) command = options.pypy_nesoni + ' plot-counts: scatter-plots counts.txt' options.plot_state = batch.target( 'plot', [ options.counts_state ], command ) # TEST ============================================ for test in options.tests: batch.require_dir('test') test.dest = join('test', test.args[0]) param = test.args[1:] command = options.pypy_nesoni + ' test-counts: ' + test.dest + ' counts.txt' command += ' ' + quote_param(param) if options.damp: command += ' --min-count 1' test.state = batch.target( test.dest, [ options.counts_state ], 
command ) if options.tests: command1 = 'rm -f differential-expression-tests.zip' command2 = ( 'zip -j differential-expression-tests.zip ' + ' '.join( test.dest + '*' for test in options.tests ) ) options.edger_zip_state = batch.target( 'differential-expression-tests', [ test.state for test in options.tests ], command1, command2, ) # REPORT =========================================== command = options.pypy_nesoni + ' report: report ' + quote_param(options.report_options) command += ' reference: ' + path_param(options.references) command += ' clips: ' + ' '.join( sample.clip_dest+'_log.txt' for sample in options.samples if sample.clip_dest is not None ) if options.do_consensus: command += ' aligns: ' + ' '.join( sample.align_dest for sample in options.samples ) if options.do_count: command += ' count-log: counts_log.txt' if options.do_count: command += ' file: counts.txt \'Table of raw counts, RPKMs, and statistics on alignments spanning multiple genes.\'' command += ' file: scatter-plots-count.png \'Pairwise scatter plots of number of reads aligning to each gene.\'' command += ' file: scatter-plots-RPKM.png \'Pairwise scatter plots of RPKM values.\'' if options.tests: command += ' file: differential-expression-tests.zip \'Differential gene expression analysis\'' options.report_state = batch.target( 'report', batch.all[:], #Meh command ) batch.virtual_target('report', [ options.report_state ]) batch.virtual_target( 'view', [ options.report_state ], 'firefox -no-remote report/index.html' ) batch.close() if options.run is None: print print 'Now type:' print print 'make -C %s' % pipes.quote(options.dirname) print else: command = 'make -C %s -j %d' % (pipes.quote(options.dirname), options.run) print print command print assert 0 == os.system(command)
def main(args): mincov, args = grace.get_option_value(args, '--mincov', int, 1) maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) minsize, args = grace.get_option_value(args, '--minsize', int, 200) what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core') is_core = (what == 'core') grace.expect_no_further_options(args) if len(args) < 2: print >> sys.stderr, HELP raise grace.Help_shown() output_dir, working_dirs = args[0], args[1:] assert not path.exists(path.join(output_dir, 'reference.fa')), \ 'Output directory not given' if not path.exists(output_dir): os.mkdir(output_dir) for name, seq in io.read_sequences( path.join(working_dirs[0], 'reference.fa')): print name friendly_name = grace.filesystem_friendly_name(name) good = [True] * len(seq) for working_dir in working_dirs: if is_core: suffix = '-depth.userplot' else: suffix = '-ambiguous-depth.userplot' data = trivia.read_unstranded_userplot( os.path.join(working_dir, friendly_name + suffix)) assert len(seq) == len(data) for i in xrange(len(seq)): if good[i]: if is_core: good[i] = data[i] >= mincov else: good[i] = data[i] < mincov #Close holes start = -maxdiff - 1 n_holes = 0 for i in xrange(len(seq)): if good[i]: if 0 < i - start <= maxdiff: for j in xrange(start, i): good[j] = True n_holes += 1 start = i + 1 print 'Closed', grace.pretty_number(n_holes), 'holes' f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name, what)), 'wb') io.write_fasta( f, name, ''.join([(seq[i] if good[i] else 'N') for i in xrange(len(seq))])) f.close() f = open( path.join(output_dir, '%s-%s_masked.fa' % (friendly_name, what)), 'wb') io.write_fasta( f, name, ''.join([(seq[i] if good[i] else seq[i].lower()) for i in xrange(len(seq))])) f.close() f_good = open( path.join(output_dir, '%s-%s_parts.fa' % (friendly_name, what)), 'wb') f_nongood = open( path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name, what)), 'wb') start = 0 n_good = [0] n_good_bases = [0] def emit(i): if i - start < 
minsize: return if good[start]: n_good[0] += 1 n_good_bases[0] += i - start io.write_fasta(f_good if good[start] else f_nongood, '%s:%d..%d' % (name, start + 1, i), seq[start:i]) for i in xrange(1, len(seq)): if good[i] != good[start]: emit(i) start = i emit(len(seq)) f_nongood.close() f_good.close() print grace.pretty_number( sum(good)), 'bases are ' + what + ', of', grace.pretty_number( len(seq)), 'in reference sequence' print grace.pretty_number( n_good[0]), 'parts at least', grace.pretty_number( minsize), 'bases long with', grace.pretty_number( n_good_bases[0]), 'total bases' print
def report_main(args):
    """Assemble an HTML report directory with a zip of result files.

    The first argument is the output directory.  Sub-commands collected
    through grace.execute supply reference files, clip logs, alignment
    directories, count logs, extra text items and extra files.  Result
    files are gathered in ``<output_dir>/<short_name>``, ``index.html``
    is written, the gathered files are zipped into ``<short_name>.zip``
    and the temporary directory is removed again.

    Options: ``--title``, ``--short`` (name of the zip / file
    directory) and ``--show-refalign``.

    Raises AssertionError when a clip log name does not end in
    ``_log.txt`` or when the zip command fails.
    """
    # Imported locally so this block does not depend on a file-level import.
    import pipes

    title, args = grace.get_option_value(args, '--title', str, 'Report')
    short_name, args = grace.get_option_value(args, '--short', str, 'files')
    show_refalign, args = grace.get_option_value(args, '--show-refalign', grace.as_bool, True)

    output_dir, args = args[0], args[1:]

    reference_filenames = []
    clip_filenames = []
    align_dirs = []
    count_log_filenames = []
    extra_items = []
    extra_files = []

    # NOTE(review): handler names are presumably used by grace.execute as
    # command keywords, and HEAD/TAIL are formatted with locals(), so
    # neither the handler names nor the local variable names may change.

    def file(args):
        extra_files.append((args[0], ' '.join(args[1:])))

    def extra(args):
        extra_items.extend(args)

    def reference(args):
        reference_filenames.extend(args)

    def clips(args):
        clip_filenames.extend(args)

    def aligns(args):
        align_dirs.extend(args)

    def count_log(args):
        count_log_filenames.extend(args)

    grace.execute(args, [reference, clips, aligns, extra, file, count_log])

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    file_dir = join(output_dir, short_name)

    if not os.path.isdir(file_dir):
        os.mkdir(file_dir)

    # Start from an empty staging directory.
    for item in os.listdir(file_dir):
        os.unlink(join(file_dir, item))

    for filename in reference_filenames:
        io.copy_file(filename, join(file_dir, os.path.basename(filename)))

    for filename, desc in extra_files:
        io.copy_file(filename, join(output_dir, os.path.basename(filename)))

    pairs = False
    for directory in align_dirs:
        name = os.path.basename(directory)
        io.copy_file(join(directory, 'report.txt'),
                     join(file_dir, name + '-report.txt'))

        for extension in [
                '-depth.userplot',
                '-ambiguous-depth.userplot',
                '-pairspan-depth.userplot',
                '-ambiguous-pairspan-depth.userplot',
        ]:
            # Exclude files that merely share this extension as a suffix
            # of a longer, more specific one.
            filenames = [
                item for item in os.listdir(directory)
                if item.endswith(extension)
                and not item.endswith('-ambiguous' + extension)
                and not item.endswith('-pairspan' + extension)
            ]
            for filename in filenames:
                if len(filenames) == 1:
                    dest = name + extension
                else:
                    dest = name + '-' + filename
                io.copy_file(join(directory, filename), join(file_dir, dest))
                if 'pairspan' in extension:
                    pairs = True

    today = datetime.date.today().strftime('%e %B %Y')

    f = open(join(output_dir, 'index.html'), 'wb')
    try:
        print >> f, HEAD % locals()

        section(f, 'Results')

        for item in extra_items:
            p(f, item)

        for filename, desc in extra_files:
            name = os.path.basename(filename)
            p(f, '<a href="%(name)s">%(name)s</a> - %(desc)s' % locals())

        p(f, '<a href="%(short_name)s.zip">%(short_name)s.zip</a>' % locals())
        for filename in reference_filenames:
            bullet(f, os.path.basename(filename) + ' - reference')
        bullet(f, '...-report.txt - report on SNPs and indels found')

        p(f, 'Different kinds of userplot:')
        bullet(
            f,
            '...-depth.userplot - depth of coverage of unambiguously aligned reads'
        )
        bullet(
            f,
            '...-ambiguous-depth.userplot - depth of coverage, including reads that hit multiple locations'
        )
        if pairs:
            bullet(
                f,
                '...-pairspan-depth.userplot - depth, including the space between reads in read-pairs'
            )
            bullet(
                f,
                '...-ambiguous-pairspan-depth.userplot - as above, but including reads that hit multiple locations'
            )

        if clip_filenames:
            section(f, 'Read clipping')
            for filename in clip_filenames:
                assert filename.endswith('_log.txt')
                name = os.path.basename(filename[:-8])
                text = extract(
                    filename,
                    lambda line: line.startswith('Fragments:')
                    or line.startswith('Single reads')
                    or line.startswith('Pairs'))
                subsection(f, name)
                pre(f, text)
                end_subsection(f)

        if count_log_filenames:
            section(f, 'Counting alignments to genes')
            for filename in count_log_filenames:
                # Close the log explicitly instead of leaking the handle.
                log_file = open(filename, 'rb')
                try:
                    log_text = log_file.read()
                finally:
                    log_file.close()
                pre(f, log_text)

        if align_dirs and show_refalign:
            section(f, 'Reference alignment')
            for directory in align_dirs:
                name = os.path.basename(directory)
                text = extract(
                    join(directory, 'consensus_log.txt'),
                    lambda line: 'reads/pairs' in line or 'unmapped' in line)
                text = text.replace('(discarded)', '')
                text = text.replace('reads/pairs kept', 'aligned unambiguously')
                subsection(f, name)
                pre(f, text)
                end_subsection(f)

        print >> f, TAIL % locals()
    finally:
        f.close()

    zip_filename = join(output_dir, short_name + '.zip')
    if os.path.exists(zip_filename):
        os.unlink(zip_filename)

    # Quote the paths for the shell, only zip if the cd succeeded, and run
    # the command outside of any assert statement (asserts are stripped
    # under "python -O", which would skip creating the zip altogether).
    zip_command = 'cd %s && zip %s.zip %s/* ' % (
        pipes.quote(output_dir),
        pipes.quote(short_name),
        pipes.quote(short_name))
    status = os.system(zip_command)
    if status != 0:
        raise AssertionError('Command failed with status %d: %s' % (status, zip_command))

    for item in os.listdir(file_dir):
        os.unlink(join(file_dir, item))
    os.rmdir(file_dir)
def main(args): title1, args = grace.get_option_value(args, '--title1', str, None) title2, args = grace.get_option_value(args, '--title2', str, None) grace.expect_no_further_options(args) if len(args) != 3: print >> sys.stderr, USAGE return 1 working_dir1 = args[0] working_dir2 = args[1] cutoff = float(args[2]) sequence_names = [ name for name, sequence in io.read_sequences( os.path.join(working_dir1, 'reference.fa')) ] if title1 is None: title1 = working_dir1 if title2 is None: title2 = working_dir2 n = 1 while significance([('A', n)], [('T', n)], 1.0) > cutoff: n += 1 print '%g\tsignificance cutoff' % cutoff print '%d\tdepth required to call substitution (greater if there are errors in the reads)' % n print 'Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s' % ( title1, title2, title1, title2) for sequence_name in sequence_names: filename1 = os.path.join( working_dir1, grace.filesystem_friendly_name(sequence_name) + '-evidence.txt') filename2 = os.path.join( working_dir2, grace.filesystem_friendly_name(sequence_name) + '-evidence.txt') for (pos1, ins1, sub1, ref1, conins1, consub1), (pos2, ins2, sub2, ref2, conins2, consub2) in itertools.izip(read_file(filename1), read_file(filename2)): assert pos1 == pos2 and ref1 == ref2 if pos1 % 1000 == 0: grace.status('Testing %s %d' % (sequence_name, pos1)) dec_ins1 = io.decode_evidence(ins1) dec_ins2 = io.decode_evidence(ins2) if dec_ins1 and dec_ins2: sig = significance(io.decode_evidence(ins1), io.decode_evidence(ins2), cutoff) if sig is not None and sig <= cutoff: grace.status('') print '%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s' % ( sequence_name, pos1, 'insertion-before', ins1, ins2, sig, conins1, conins2) dec_sub1 = io.decode_evidence(sub1) dec_sub2 = io.decode_evidence(sub2) if dec_sub1 and dec_sub2: sig = significance(dec_sub1, dec_sub2, cutoff) if sig is not None and sig <= cutoff: if dec_sub1[0][0] == '-' or dec_sub2[0][0] == '-': what = 'deletion' elif 
dec_sub1[0][0] != dec_sub2[0][0]: what = 'substitution' else: what = 'different mix' grace.status('') print '%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s' % ( sequence_name, pos1, what, ref1, sub1, sub2, sig, consub1, consub2) grace.status('') return 0
def main(args):
    """Compare consensus calls across several working directories.

    Positional arguments are working directories; the optional
    ``splitting`` and ``from`` keywords name two groups of samples that
    the reported differences must separate.  Calls produced by
    ``reader`` are passed through a chain of filters selected by the
    options and then printed in the format chosen with ``--as``
    ('table', 'compact', 'nexus' or 'counts').

    Returns 1 after writing USAGE to stderr when no arguments remain
    after option parsing.  Raises grace.Error for an unknown output
    format or when a sample named for splitting is not among the
    samples given.
    """
    genbank_filename, args = grace.get_option_value(args,'--gbk',str,None)
    use_indels, args = grace.get_option_value(args,'--indels',grace.as_bool,True)
    use_reference, args = grace.get_option_value(args,'--reference',grace.as_bool,True)
    give_evidence, args = grace.get_option_value(args,'--evidence',grace.as_bool,True)
    give_consequences, args = grace.get_option_value(args,'--consequences',grace.as_bool,True)
    require_all, args = grace.get_option_value(args,'--require-all',grace.as_bool,False)
    require_bisect, args = grace.get_option_value(args,'--require-bisect',grace.as_bool,False)
    full_output, args = grace.get_option_value(args,'--full',grace.as_bool,False)
    format, args = grace.get_option_value(args,'--as',str,'table')

    # Secret option!
    limit, args = grace.get_option_value(args,'--limit',int,None)

    grace.expect_no_further_options(args)

    if len(args) < 1:
        sys.stderr.write(USAGE)
        return 1

    working_dirs = [ ]
    split_a = [ ]
    split_b = [ ]

    # Handlers for grace.execute: plain arguments are working directories,
    # 'splitting' and 'from' collect the two sample groups.
    def default(args):
        working_dirs.extend(args)
    def splitting(args):
        split_a.extend(args)
    def splitting_from(args):
        split_b.extend(args)

    grace.execute(args, {
        'splitting' : splitting,
        'from' : splitting_from
    }, default
    )

    # When the reference is included it occupies column 0, so the
    # evidence and consequence columns start at index 1.
    if use_reference:
        names = ['reference']
        evidence_start = 1
    else:
        names = [ ]
        evidence_start = 0

    names.extend( norm_name(item) for item in working_dirs )

    # NOTE(review): raises IndexError if only 'splitting'/'from' arguments
    # were given and working_dirs is empty - confirm whether that can happen.
    references = io.read_sequences(os.path.join(working_dirs[0], 'reference.fa'))

    annotations = { }
    if genbank_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(genbank_filename),'genbank'):
            sequence = record.seq.tostring()
            features = [ item for item in record.features if item.type != 'source' ]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            # Features are keyed by the full sequence text, not by record name.
            annotations[sequence] = features

    iterator = reader(working_dirs, references, use_reference, annotations)

    # Build the filter chain; each stage lazily wraps the previous one.
    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == 'counts':
        iterator = itertools.ifilter(fully_unambiguous, iterator)

    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), 'Two samples with the same name'
        try:
            # Translate sample names into column indices.
            split_a = [ names.index(norm_name(item)) for item in split_a ]
            split_b = [ names.index(norm_name(item)) for item in split_b ]
        except ValueError:
            raise grace.Error('Sample to be split is not amongst samples given')
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    if limit:
        iterator = itertools.islice(iterator, limit)

    if format == 'table':
        # Header: consensus columns, then optional evidence and
        # consequence columns per sample, then annotations.
        line = 'Reference\tPosition\tChange type'
        line += '\t' + '\t'.join(names)
        if give_evidence:
            line += '\t' + '\t'.join(names[evidence_start:])
        if give_consequences:
            line += '\t' + '\t'.join(names[evidence_start:])
        if annotations:
            line += '\tAnnotations'
        print line
        for calls in iterator:
            line = '%s\t%d\t%s\t%s' % (
                calls.ref_name,
                calls.ref_pos+1,
                change_type(calls),
                '\t'.join(item.consensus for item in calls.calls))
            if give_evidence:
                line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += '\t' + describe_features(calls.features)
            print line

    elif format == 'compact':
        # Sample names are printed vertically, one column per sample.
        for line in transpose_strings(names):
            print line
        print

        for calls in iterator:
            # Insertions are shown at a ".5" position.
            if calls.is_insertion:
                footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name)
            else:
                footer = '%12d %s' % (calls.ref_pos+1, calls.ref_name)

            t = transpose_strings([ item.consensus for item in calls.calls ], '-', 1)
            top = t[0] + ' ' + footer
            if give_consequences:
                # Collect distinct consequences, each cut to its first
                # three whitespace separated words.
                consequences = [ ]
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(', '):
                            item = ' '.join(item.split()[:3])
                            if item not in consequences:
                                consequences.append(item)

                if consequences:
                    top += ' ' + ' / '.join(sorted(consequences))

            # NOTE(review): assumed to apply regardless of
            # give_consequences; the nesting level of this line should be
            # confirmed against the original layout.
            top += ' ' + describe_features(calls.features)
            print top
            for line in t[1:]:
                print line

    elif format == 'nexus':
        # One bucket of characters per sample, one character per call.
        buckets = [ [ ] for name in names ]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)

        print '#NEXUS'
        print 'begin taxa;'
        print 'dimensions ntax=%d;' % len(names)
        print 'taxlabels'
        for name in names:
            print name
        print ';'
        print 'end;'

        print 'begin characters;'
        print 'dimensions nchar=%d;' % len(buckets[0])
        print 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print 'matrix'
        for name, bucket in itertools.izip(names, buckets):
            print name, ''.join(bucket)
        print ';'
        print 'end;'

    elif format == 'counts':
        for line in transpose_strings(names):
            print line
        print

        # Tally how often each partition pattern occurs.
        counts = { }
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1

        # Most frequent pattern first; ties broken by pattern text, descending.
        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print '%s %d' % (transpose_strings(count_str)[0], counts[count_str])

    else:
        raise grace.Error('Unknown output format: ' + format)
def main(args):
    """Align reads to reference sequences by running SHRiMP's rmapper in batches.

    The reference files are merged into <output_dir>/reference.fa, the run
    parameters are written to <output_dir>/config.txt, and reads are fed to
    rmapper-cs (with --solid) or rmapper-ls in batches of roughly
    --batch-size bases.  At most --cpus batches are outstanding at a time;
    the oldest one is collected before a new one is started.  Hits are
    appended to shrimp_hits.txt.gz and reads without any hit to
    unmapped.fa.gz.

    args -- command line arguments: options, then the output directory and
            reference files, then 'reads' / 'pairs' / 'shrimp-options'
            sections as dispatched by grace.execute.

    Returns 1 (after printing USAGE to stderr) if no output directory was
    given, 0 on success.  Raises AssertionError for missing inputs or when a
    SHRiMP child exits with a non-zero status.  On any failure the partial
    output and temporary files are deleted.
    """
    # Paths are spliced into a /bin/sh command line below, so they must be
    # quoted; otherwise an output directory containing spaces or shell
    # metacharacters breaks (or subverts) the command.
    try:
        from shlex import quote as shell_quote
    except ImportError:
        from pipes import quote as shell_quote

    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()

    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')

    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000)

    input_reference_filenames = [ ]
    reads_filenames = [ ]

    # rmapper receives the threshold exactly as typed; config.txt stores a
    # percentage as a negative fraction and anything else as an int.
    shrimp_options = [ '-h', threshold ]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1])/100.0
    else:
        threshold = int(threshold)

    # A list, so that the nested command handler can write to it.
    output_dirs = [ ]

    def front_command(args):
        grace.expect_no_further_options(args)

        if len(args) < 1:
            return

        output_dirs.append(args[0])
        input_reference_filenames.extend(
            [ os.path.abspath(filename) for filename in args[1:] ])

    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend([ [ os.path.abspath(filename) ] for filename in args])

    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append([ os.path.abspath(filename) for filename in args ])

    def shrimp_options_command(args):
        shrimp_options.extend(args)

    grace.execute(args, {
        'reads': reads_command,
        '--reads': reads_command,
        'pairs': pairs_command,
        'shrimp-options': shrimp_options_command,
        '--shrimp-options': shrimp_options_command,
    }, front_command)

    if not output_dirs:
        sys.stderr.write(USAGE % n_cpus + '\n')
        return 1

    output_dir = output_dirs[0]

    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'

    for filename in itertools.chain(input_reference_filenames, *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'

    # Merge all references into a single FASTA file for rmapper.
    reference_filename = os.path.join(output_dir,'reference.fa')
    reference_file = open(reference_filename,'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            #Don't retain any comment
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)

            total_reference_sequences += 1
            total_reference_bases += len(sequence)

    reference_file.close()

    print('%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases), 's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences), 's' if total_reference_sequences != 1 else ''))

    assert total_reference_bases, 'Reference sequence file is empty'

    config = {
        'references' : input_reference_filenames,
        'reads' : reads_filenames,
        'stride' : stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()

    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')

    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')

    # Files to delete if we leave the try block before they are complete.
    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)

    try:
        N = [0]  # batch counter, in a list so do_shrimp can increment it

        def do_shrimp(read_set):
            """Start rmapper on one batch; return a function that collects its result."""
            my_number = N[0]
            N[0] += 1

            tempname = os.path.join(output_dir,'temp%d-%d.fa' % (os.getpid(),my_number))
            tempname_out = os.path.join(output_dir,'temp%d-%d.txt' % (os.getpid(),my_number))

            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)

            f = open(tempname,'wb')
            for read_name, read_seq in read_set:
                f.write('>' + read_name + '\n')
                f.write(read_seq + '\n')
            f.close()

            # shrimp_options are deliberately left unquoted, exactly as
            # before; only the paths are quoted.
            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      shell_quote(tempname) + ' ' + shell_quote(reference_filename) + \
                      ' >' + shell_quote(tempname_out)
            if not verbose:
                command += ' 2>/dev/null'

            child_pid = os.spawnl(os.P_NOWAIT,'/bin/sh','/bin/sh','-c',command)

            def finalize():
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'

                hits = { } # read_name -> [ hit line ]

                f = open(tempname_out,'rb')
                for line in f:
                    if line.startswith('>'):
                        read_name = line.split(None,1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = [ ]
                        hits[read_name].append(line)
                f.close()

                # Emit in the original read order: hits for mapped reads,
                # FASTA records for unmapped ones.
                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        unmapped_file.write('>' + read_name + '\n')
                        unmapped_file.write(read_seq + '\n')

                output_file.flush()
                unmapped_file.flush()

                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)

            return finalize

        shrimps = [ ]  # finalizers of running batches, oldest first

        reader = iter_reads(config)
        read_count = 0

        while True:
            read_set = [ ]
            read_set_bases = 0

            #Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs

            for read_name, read_seq in reader:
                read_name = read_name.split()[0]
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)

                read_count += 1
                if read_set_bases >= batch_size:
                    break

            if not read_set:
                break

            # Wait for the oldest batch before exceeding the process limit.
            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append( do_shrimp(read_set) )

            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))

        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps) )
            shrimps.pop(0)()

        grace.status('')

        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)

        return 0

    finally:
        # Clean up temporary files and any incomplete outputs.
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
def main(args):
    """Report how differences in an alignment change each CDS of a GenBank file.

    For every CDS feature the original DNA is taken from the GenBank record
    and the new DNA is obtained by projecting each base through the
    alignment of the matching reference sequence.  Both are translated and
    compared; a feature is reported when its protein differs, its start
    codon differs, it contains indels, its length changed, or (with
    --use-coverage) its coverage looks unusual.

    args -- command line arguments: options followed by the GenBank filename
            and the alignment filename (or a directory containing
            'alignment.maf').

    Returns 1 (after printing USAGE) if the two positional arguments are not
    given, 0 otherwise.  Raises grace.Error if a GenBank record's sequence
    matches none of the aligned reference sequences.
    """
    default_transl_table, args = grace.get_option_value(args, '--transl_table', int, 11)
    use_coverage, args = grace.get_flag(args, '--use-coverage')
    coverage_cutoff, args = grace.get_option_value(args, '--coverage-cutoff', float, 0.1)
    tabular, args = grace.get_flag(args, '--tabular')
    noheader, args = grace.get_flag(args, '--noheader')
    verbose, args = grace.get_flag(args, '--verbose')
    bandwidth, args = grace.get_option_value(args, '--band', int, 20)
    grace.expect_no_further_options(args)

    if len(args) != 2:
        print(USAGE)
        return 1

    genbank_filename = args[0]
    alignment_filename = args[1]

    if os.path.isdir(alignment_filename):
        alignment_filename = os.path.join(alignment_filename, 'alignment.maf')

    working_dir = os.path.split(alignment_filename)[0]

    alignments = load_alignments(alignment_filename)

    if not noheader:
        fields = 'Sequence\tLocus tag\tOld length (aa)\tNew length (aa)\tAmino acid changes\t'
        if use_coverage:
            # Five columns: ratio + graphlet, ratio + graphlet, percent.
            fields += 'Unambiguous coverage vs expected\t\tAmbiguous coverage vs expected\t\tAmbiguous percent with any hits\t'
        fields += 'Gene\tProduct'
        if tabular:
            fields += '\tChanges of note'
        print(fields)

    for record in SeqIO.parse(io.open_possibly_compressed_file(genbank_filename),'genbank'):
        sequence = record.seq.tostring()

        # Find the alignment whose reference side is this record's sequence.
        for name, seq1, seq2, alignment in alignments:
            if seq1 == sequence:
                break
        else:
            raise grace.Error('Genbank record %s sequence not identical to any reference sequence' % record.id)

        if use_coverage:
            depth = get_graph(working_dir, name, 'depth')
            ambiguous_depth = get_graph(working_dir, name, 'ambiguous-depth')
            median_depth = numpy.median(depth)
            median_ambiguous_depth = numpy.median(ambiguous_depth)
            ambiguous_factor = float(median_ambiguous_depth) / median_depth
            depth_expect = expected_depth(name, sequence, depth, ambiguous_depth)

        for feature in record.features:
            if feature.type != 'CDS':
                continue

            if 'locus_tag' not in feature.qualifiers:
                locus_tag = '%d..%d' % (feature.location.nofuzzy_start+1,feature.location.nofuzzy_end)
            else:
                locus_tag = feature.qualifiers['locus_tag'][0]

            if 'transl_table' in feature.qualifiers:
                transl_table_no = int(feature.qualifiers['transl_table'][0])
            else:
                assert default_transl_table is not None, 'No /transl_table for CDS, and default transl_table not given'
                transl_table_no = default_transl_table

            transl_table = CodonTable.ambiguous_dna_by_id[transl_table_no]
            start_codons = transl_table.start_codons

            try:
                feature_alignment = alignment_from_feature(sequence, feature)
            except Weird_alignment:
                warn('%s has a location I could not handle, skipping, sorry' % locus_tag)
                continue

            # Walk the feature base by base, collecting the original base and
            # whatever it maps to in the new sequence.  A length mismatch
            # between the two slices is an indel.
            dna = [ ]
            new_dna = [ ]
            shifts = [ ]
            for i in xrange(feature_alignment.end2):
                p1 = feature_alignment.back_project(i, left=False)
                p2 = feature_alignment.back_project(i+1, left=True)
                assert abs(p2-p1) < 2
                dna.append( sequence_slice(sequence,p1,p2) )

                p1a = alignment.project(p1, left=False)
                p2a = alignment.project(p2, left=False) #Hmm

                diff = (p2-p1)-(p2a-p1a)

                new_dna.append( sequence_slice(seq2,p1a,p2a) )

                if diff:
                    shifts.append((i,dna[-1],new_dna[-1]))

            dna = ''.join(dna)
            new_dna = ''.join(new_dna)

            # This usually indicated a CDS truncated at the start?
            # in which case, will probably fail some way or other down the line.
            if 'codon_start' in feature.qualifiers:
                codon_start = int(feature.qualifiers['codon_start'][0]) - 1
            else:
                codon_start = 0
            dna = dna[codon_start:]
            new_dna = new_dna[codon_start:]

            if len(dna) % 3 != 0:
                warn(locus_tag + ' length not a multiple of 3')

            protein = Seq.Seq(dna).translate(table=transl_table_no).tostring()
            # http://en.wikipedia.org/wiki/Start_codon is always translated to M
            protein = 'M' + protein[1:]

            if dna[:3] not in start_codons:
                warn(locus_tag + ' has unknown start codon: ' + dna[:3])

            original_lacks_stop_codon = not protein.endswith('*')
            if original_lacks_stop_codon:
                warn(locus_tag + ' lacks end codon')

            original_stops_before_end = '*' in protein[:-1]
            if original_stops_before_end:
                warn(locus_tag + ' contains stop codon before end')

            if 'translation' in feature.qualifiers:
                expect = feature.qualifiers['translation'][0]
                if protein[:-1] != expect:
                    warn(locus_tag + ' translation given in feature does not match translation from DNA')

            new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring()
            new_protein = 'M' + new_protein[1:]

            # If end codon changed, find new end
            # Don't bother if there are unknown amino acids or
            # the original protein lacks a stop codon
            if 'X' not in new_protein and '*' not in new_protein and not original_lacks_stop_codon:
                # This is very inefficient: the whole sequence is
                # re-translated for every base appended.
                i = feature_alignment.end2
                while True:
                    p1 = feature_alignment.back_project(i, left=False)
                    p2 = feature_alignment.back_project(i+1, left=True)
                    p1a = alignment.project(p1, left=False)
                    p2a = alignment.project(p2, left=False) #Hmm
                    if p1a < 0 or p2a < 0 or p1a > len(seq2) or p2a > len(seq2):
                        break

                    new_dna += sequence_slice(seq2,p1a,p2a)
                    new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring()
                    new_protein = 'M' + new_protein[1:]
                    if 'X' in new_protein or '*' in new_protein:
                        break

                    i += 1

            # Is the protein shorter?
            # Don't bother checking if the original protein has extra stop codons
            if '*' in new_protein and not original_stops_before_end:
                new_protein = new_protein[:new_protein.index('*')+1]

            # If indels occurred, do an alignment
            # Don't bother otherwise
            if shifts:
                # If lengths don't match, pad with spaces (won't match longer seq)
                result = band_limited_align(protein + ' '*max(0,len(new_protein)-len(protein)),
                                            new_protein + ' '*max(0,len(protein)-len(new_protein)),
                                            bandwidth)
                protein_ali = result[0]
                new_protein_ali = result[1]
            else:
                protein_ali = protein
                new_protein_ali = new_protein

            # diffs holds (alignment column, old codon index, new codon index)
            diffs = [ ]
            j = 0
            k = 0
            for i in xrange(min(len(new_protein_ali),len(protein_ali))):
                if protein_ali[i] != ' ' and new_protein_ali[i] != ' ' and (
                       protein_ali[i] == '-' or
                       new_protein_ali[i] == '-' or
                       not bio.might_be_same_amino(protein_ali[i], new_protein_ali[i]) ):
                    diffs.append((i,j,k))
                if protein_ali[i] != '-':
                    j += 1
                if new_protein_ali[i] != '-':
                    k += 1

            # Has the start codon changed?  When either sequence is shorter
            # than one codon there is nothing to index, so fall back to
            # comparing whatever is there instead of raising IndexError.
            if len(dna) >= 3 and len(new_dna) >= 3:
                diff_start = False
                for n in xrange(3):
                    if not bio.might_be_same_base(new_dna[n], dna[n]):
                        diff_start = True
                        break
            else:
                diff_start = dna[:3] != new_dna[:3]

            interesting_coverage = False
            if use_coverage:
                # Per-base depths over the CDS, oriented along the feature.
                cds_depth = depth[feature_alignment.start1:feature_alignment.end1]
                if not feature_alignment.forward1:
                    cds_depth = cds_depth[::-1]

                cds_ambiguous_depth = ambiguous_depth[feature_alignment.start1:feature_alignment.end1]
                if not feature_alignment.forward1:
                    cds_ambiguous_depth = cds_ambiguous_depth[::-1]

                cds_depth_expect = depth_expect[feature_alignment.start1:feature_alignment.end1]
                if not feature_alignment.forward1:
                    cds_depth_expect = cds_depth_expect[::-1]

                avg_expect = numpy.average(cds_depth_expect)
                if avg_expect > 0.0:
                    cds_avg_depth = numpy.average(cds_depth)/avg_expect
                    cds_avg_ambiguous_depth = numpy.average(cds_ambiguous_depth)/avg_expect/ambiguous_factor

                # NOTE(review): the original source had lost its indentation;
                # this test is assumed to sit outside the avg_expect guard
                # (it performs no division) -- confirm against history.
                strange = (
                    (cds_depth >= cds_depth_expect*1.5) |
                    (cds_ambiguous_depth <= cds_depth_expect*(0.5*ambiguous_factor))
                )

                interesting_coverage = numpy.average(strange) >= coverage_cutoff

            if interesting_coverage or diffs or diff_start or shifts or len(new_protein) != len(protein):
                line = name + '\t' + locus_tag + '\t' + \
                       '%d\t' % (len(protein)-1) + \
                       '%d\t' % (len(new_protein)-1) + \
                       '%d\t' % len(diffs)

                if use_coverage:
                    # Always emit the five coverage columns promised by the
                    # header.  The ratios only exist when avg_expect > 0
                    # (this also keeps a NaN average away from them), so
                    # otherwise leave those four cells empty.
                    if avg_expect > 0.0:
                        line += '%.1f\t' % (cds_avg_depth) + graphlet(cds_depth, cds_depth_expect)+'\t'
                        line += '%.1f\t' % (cds_avg_ambiguous_depth) + graphlet(cds_ambiguous_depth, cds_depth_expect*ambiguous_factor)+'\t'
                    else:
                        line += '\t\t\t\t'
                    line += '%.1f%%\t' % (numpy.average(cds_ambiguous_depth > 0.0)*100.0)

                line += '%s\t' % feature.qualifiers.get('gene',[''])[0] + \
                        '%s' % feature.qualifiers.get('product',[''])[0]

                # Note headings start with a literal backslash and a space.
                notes = [ ]

                if use_coverage and 'X' in new_protein:
                    xs = new_protein.count('X')
                    if xs == len(new_protein)-1: #First is M, so len-1
                        notes.append('\\ No consensus')
                    else:
                        notes.append('\\ No consensus for %d aa' % xs)

                if len(new_protein) < len(protein):
                    notes.append('\\ Shorter by %d aa' % (len(protein)-len(new_protein)))

                if len(new_protein) > len(protein):
                    notes.append('\\ Longer by %d aa' % (len(new_protein)-len(protein)))

                if diff_start:
                    notes.append('\\ Start changed: %s -> %s' % (dna[:3], new_dna[:3]))
                    if new_dna[:3] not in start_codons:
                        notes.append(' No longer a start codon!')

                if shifts:
                    notes.append('\\ Indels:')
                    for pos, old, new in shifts:
                        notes.append(' base %5d / codon %5d %s -> %s' % (pos+1,(pos//3)+1,old,new or '-'))

                if diffs:
                    # NOTE(review): the per-codon listing is assumed to be
                    # part of the --verbose output (the count is already in
                    # the line) -- confirm against history.
                    if verbose:
                        notes.append('\\ Amino acid changes:')
                        for i, j, k in diffs:
                            notes.append(' codon %5d %s->%s (%s->%s)' % (
                                j+1,
                                protein_ali[i],
                                new_protein_ali[i],
                                dna[j*3:j*3+3] if protein_ali[i] != '-' else '-',
                                new_dna[k*3:k*3+3] if new_protein_ali[i] != '-' else '-'
                            ))

                if tabular:
                    # One line per feature: notes are whitespace-normalised
                    # and appended as a final column.
                    print(line + '\t' + ' '.join([ ' '.join(note.strip().split()) for note in notes ]))
                else:
                    print(line)
                    for note in notes:
                        print('\t' + note)

    return 0
def fill_scaffolds(args):
    """Fill the gaps in scaffolds with contigs found by following the contig graph.

    Contigs are read from <graph_dir>/454AllContigs.fna and numbered from 1
    in sorted name order; a negative number denotes the reverse complement.
    Links ('C' lines) and, unless custom scaffolds were given on the command
    line, scaffolds ('S' lines) are read from <graph_dir>/454ContigGraph.txt.
    Links are then used, deepest first, to build a next-hop table between
    contigs, and each scaffold gap is filled by walking that table.

    Writes scaffolds.fa (filler in lower case, unfillable gaps as 'n'),
    scaffolds.gff (one 'gap' feature per gap) and leftovers.fa (contigs not
    used in any scaffold) into the output directory.

    args -- command line arguments: options, output directory, graph
            directory, then optional 'scaffold' sections listing signed
            contig numbers.

    Returns 1 (after printing USAGE) if fewer than two arguments are given.
    """
    max_filler_length, args = grace.get_option_value(args, '--max-filler', int, 4000)

    if len(args) < 2:
        print(USAGE)
        return 1

    (output_dir, graph_dir), args = args[:2], args[2:]

    scaffolds = [ ]  # [ (name, [ ('contig', signed id) | ('gap', size or None) ]) ]

    # NOTE: this handler is passed to grace.execute by itself, so its name
    # is kept as 'scaffold'.
    def scaffold(args):
        circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)

        members = [ ]
        for item in args:
            members.append( ('contig', int(item)) )
            members.append( ('gap', None) )

        # A circular scaffold keeps the trailing gap, which joins the last
        # contig back to the first.
        if not circular:
            members = members[:-1]

        name = 'custom_scaffold_%d' % (len(scaffolds)+1)
        scaffolds.append( (name, members) )

    grace.execute(args, [scaffold])

    custom_scaffolds = (len(scaffolds) != 0)

    sequences = dict(
        (a.split()[0], b.upper())
        for a,b in io.read_sequences(os.path.join(graph_dir, '454AllContigs.fna')))

    sequence_names = sorted(sequences)
    sequence_ids = dict(zip(sequence_names, xrange(1,len(sequence_names)+1)))

    # contexts: signed contig id -> sequence (negative = reverse complement)
    contexts = { }
    context_depths = { }
    for i in xrange(1,len(sequence_names)+1):
        seq = sequences[sequence_names[i-1]]
        contexts[ i ] = seq
        contexts[ -i ] = bio.reverse_complement(seq)

    # links: signed contig id -> [ (depth, signed id of following contig) ]
    links = collections.defaultdict(list)

    graph_file = open(os.path.join(graph_dir, '454ContigGraph.txt'), 'rU')
    try:
        for line in graph_file:
            parts = line.rstrip('\n').split('\t')

            if parts[0].isdigit():
                seq = sequence_ids[parts[1]]
                context_depths[ seq] = float(parts[3])
                context_depths[-seq] = float(parts[3])

            if parts[0] == 'C':
                name1 = 'contig%05d' % int(parts[1])
                dir1 = {"3'" : 1, "5'" : -1 }[parts[2]]
                name2 = 'contig%05d' % int(parts[3])
                dir2 = {"5'" : 1, "3'" : -1 }[parts[4]]
                depth = int(parts[5])

                # Record the link in both directions of travel.
                links[ sequence_ids[name1] * dir1 ].append( (depth, sequence_ids[name2] * dir2) )
                links[ sequence_ids[name2] * -dir2 ].append( (depth, sequence_ids[name1] * -dir1) )

            if parts[0] == 'S' and not custom_scaffolds:
                name = 'scaffold%05d' % int(parts[2])
                components = parts[3].split(';')
                members = [ ]
                for component in components:
                    a,b = component.split(':')
                    if a == 'gap':
                        members.append( ('gap',int(b)) )
                    else:
                        strand = { '+': +1, '-': -1 }[ b ]
                        members.append( ('contig', sequence_ids['contig%05d'%int(a)] * strand) )
                scaffolds.append( (name, members) )
    finally:
        graph_file.close()

    path_source_dest = collections.defaultdict(dict) # source -> dest -> next
    path_dest_source = collections.defaultdict(dict) # dest -> source -> next

    # Use links, in order to depth of coverage, to construct paths between contigs
    # Thus: paths have maximum minimum depth
    #       subsections of paths also have this property

    todo = [ ]
    for i in contexts:
        for depth_link, right in links[i]:
            todo.append( ( depth_link, i, right) )
    todo.sort(reverse=True)
    for score, left, right in todo:
        if right in path_source_dest[left]:
            continue

        # The new link may extend existing paths ending at 'left' and
        # starting at 'right', but only through contigs short enough to be
        # used as filler.
        sources = [(left,right)]
        if len(contexts[left]) <= max_filler_length:
            sources += path_dest_source[left].items()
        destinations = [right]
        if len(contexts[right]) <= max_filler_length:
            destinations += path_source_dest[right].keys()

        for source, next_hop in sources:
            for dest in destinations:
                if dest in path_source_dest[source]:
                    continue
                path_source_dest[source][dest] = next_hop
                path_dest_source[dest][source] = next_hop

    workspace = io.Workspace(output_dir)
    scaffold_f = workspace.open('scaffolds.fa','wb')

    features = [ ]  # GFF lines describing each gap

    used = set()    # unsigned ids of contigs placed in some scaffold

    for i, (name, members) in enumerate(scaffolds):
        result = '' # Inefficient. Meh.
        n_filled = 0
        n_failed = 0
        for j, item in enumerate(members):
            if item[0] == 'contig':
                result += contexts[item[1]]
                used.add(abs(item[1]))
            else:
                left = members[j-1]
                right = members[ (j+1) % len(members) ] #If gap at end, assume circular

                assert left[0] == 'contig'
                assert right[0] == 'contig'

                gap_start = len(result)

                can_fill = right[1] in path_source_dest[left[1]]
                if can_fill:
                    # Follow next hops from left to right, appending each
                    # intermediate contig in lower case.
                    n = 0
                    k = path_source_dest[left[1]][right[1]]
                    while k != right[1]:
                        n += len(contexts[k])
                        result += contexts[k].lower()
                        used.add(abs(k))

                        k = path_source_dest[k][right[1]]

                    n_filled += 1

                    if item[1] is not None and max(n,item[1]) > min(n,item[1])*4:
                        sys.stderr.write('Warning: gap size changed from %d to %d in scaffold %d' % (item[1],n,i+1) + '\n')
                else:
                    n_failed += 1
                    result += 'n' * (9 if item[1] is None else item[1])

                gap_end = len(result)

                features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                    name,
                    'fill-scaffolds',
                    'gap',
                    gap_start+1,
                    max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                    '.', #score
                    '+', #strand
                    '.', #frame
                    '' #properties
                ))

        io.write_fasta(scaffold_f, name, result)
        sys.stderr.write('Scaffold%05d: %d gaps filled, %d could not be filled' % (i+1, n_filled, n_failed) + '\n')

    scaffold_f.close()

    gff_f = workspace.open('scaffolds.gff', 'wb')
    for feature in features:
        gff_f.write(feature + '\n')
    gff_f.close()

    leftovers_f = workspace.open('leftovers.fa', 'wb')
    for name in sequence_names:
        if sequence_ids[name] not in used:
            io.write_fasta(leftovers_f, name, sequences[name])
    leftovers_f.close()

    # Report scaffold ends from which exactly one other end is reachable.
    ends = { }
    for name, members in scaffolds:
        # Skip empty scaffolds (nothing to index) and circular ones.
        if not members or members[-1][0] == 'gap':
            continue
        ends[ '%s start' % name ] = members[-1][1]
        ends[ '%s end ' % name ] = -members[0][1]

    for end1 in sorted(ends):
        options = [ end2 for end2 in ends if -ends[end2] in path_source_dest[ends[end1]] ]
        if len(options) == 1:
            sys.stderr.write('Note: from %s only %s is reachable' % (end1, options[0]) + '\n')
def main(args):
    """Write the 'core' or 'unique' parts of each reference sequence.

    A base starts out good and stays good only while, in every working
    directory, its value in the per-sequence userplot passes the test:
    depth >= --mincov in '-depth.userplot' for --what core, or
    depth < --mincov in '-ambiguous-depth.userplot' otherwise.  Runs of at
    most --maxdiff non-good bases lying between good bases are then marked
    good as well.

    For each sequence four FASTA files are written to the output directory:
    the sequence with non-good bases as N, the sequence with non-good bases
    in lower case, the good runs of at least --minsize bases, and the
    non-good runs of at least --minsize bases.

    args -- command line arguments: options, the output directory, then one
            or more working directories (reference.fa is read from the
            first).

    Raises grace.Help_shown (after printing HELP to stderr) if fewer than
    two arguments are given.
    """
    mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
    is_core = (what == 'core')

    grace.expect_no_further_options(args)

    if len(args) < 2:
        sys.stderr.write(HELP + '\n')
        raise grace.Help_shown()

    output_dir = args[0]
    working_dirs = args[1:]

    # A reference.fa here means the first argument is really a working
    # directory, i.e. the output directory was left out.
    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'

    if not path.exists(output_dir):
        os.mkdir(output_dir)

    # The plot consulted depends only on the mode, so choose it once.
    if is_core:
        suffix = '-depth.userplot'
    else:
        suffix = '-ambiguous-depth.userplot'

    for name, seq in io.read_sequences(path.join(working_dirs[0],'reference.fa')):
        print(name)
        friendly_name = grace.filesystem_friendly_name(name)
        length = len(seq)

        good = [ True ] * length

        for working_dir in working_dirs:
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name+suffix)
            )
            assert len(seq) == len(data)
            for pos in xrange(length):
                if not good[pos]:
                    continue
                if is_core:
                    good[pos] = data[pos] >= mincov
                else:
                    good[pos] = data[pos] < mincov

        #Close holes
        hole_start = -maxdiff-1   # far enough left that nothing closes before the first good base
        n_holes = 0
        for pos in xrange(length):
            if not good[pos]:
                continue
            hole_size = pos - hole_start
            if 0 < hole_size <= maxdiff:
                good[hole_start:pos] = [ True ] * hole_size
                n_holes += 1
            hole_start = pos+1
        print(' '.join([ 'Closed', str(grace.pretty_number(n_holes)), 'holes' ]))

        # Whole sequence, non-good bases replaced by N
        n_masked = ''.join([ (base if ok else 'N') for base, ok in zip(seq, good) ])
        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name, n_masked)
        f.close()

        # Whole sequence, non-good bases in lower case
        soft_masked = ''.join([ (base if ok else base.lower()) for base, ok in zip(seq, good) ])
        f = open(path.join(output_dir, '%s-%s_masked.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name, soft_masked)
        f.close()

        f_good = open(path.join(output_dir, '%s-%s_parts.fa' % (friendly_name,what)), 'wb')
        f_nongood = open(path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name,what)), 'wb')

        # Positions where a run of equal good/non-good status begins, plus
        # the end of the sequence.
        boundaries = [ 0 ]
        for pos in xrange(1,length):
            if good[pos] != good[pos-1]:
                boundaries.append(pos)
        boundaries.append(length)

        n_good = 0
        n_good_bases = 0
        for begin, end in zip(boundaries[:-1], boundaries[1:]):
            run_length = end - begin
            if run_length < minsize:
                continue
            if good[begin]:
                n_good += 1
                n_good_bases += run_length
                target = f_good
            else:
                target = f_nongood
            io.write_fasta(target, '%s:%d..%d' % (name, begin+1,end), seq[begin:end])

        f_nongood.close()
        f_good.close()

        print(' '.join([
            str(grace.pretty_number(sum(good))),
            'bases are '+what+', of',
            str(grace.pretty_number(length)),
            'in reference sequence',
        ]))
        print(' '.join([
            str(grace.pretty_number(n_good)),
            'parts at least',
            str(grace.pretty_number(minsize)),
            'bases long with',
            str(grace.pretty_number(n_good_bases)),
            'total bases',
        ]))
        print('')
def pastiche(args): if len(args) < 4: print USAGE return 1 mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool, False) min_leftover, args = grace.get_option_value(args, '--min-leftover', int, 20) output_dir, args = args[0], args[1:] #, ref_filename, contig_filenames = args[0], args[1], args[2:] ref_filenames = [ ] contig_filenames = [ ] grace.execute(args, { 'contigs' : lambda args: contig_filenames.extend(args) }, lambda args: ref_filenames.extend(args)) assert ref_filenames, 'No reference sequences given' assert contig_filenames, 'No contig sequences given' contigs = dict([ (name.split()[0], seq) for filename in contig_filenames for name, seq in io.read_sequences(filename) ]) dir_contigs = { } for name in contigs: dir_contigs[name + '+'] = contigs[name] dir_contigs[name + '-'] = bio.reverse_complement(contigs[name]) dir_contigs_used = { } for name in dir_contigs: dir_contigs_used[name] = [ False ] * len(dir_contigs[name]) workspace = io.Workspace(output_dir) temp_prefix = workspace._object_filename('temp-pastiche') out_f = workspace.open('pastiche.fa', 'wb') for ref_filename in ref_filenames: for ref_name, ref_seq in io.read_sequences(ref_filename): ref_name = ref_name.split()[0] grace.status(ref_name) f = open(temp_prefix + '.fa','wb') io.write_fasta(f, 'ref', ref_seq) f.close() scores = [ -1 ] * (len(ref_seq)*2) strings = [ 'N', '' ] * (len(ref_seq)) contexts = [ None for i in xrange(len(ref_seq)*2) ] #MAXSCORE = len(ref_seq)+1 #for i in xrange(len(ref_seq)): # if ref_seq[i].upper() != 'N': # strings[i*2] = ref_seq[i] # scores[i*2] = MAXSCORE #for i in xrange(len(ref_seq)-1): # if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N': # scores[i*2+1] = MAXSCORE if mask_only: for i in xrange(len(ref_seq)): strings[i*2] = ref_seq[i].lower() def put(position, dir_contig_name, start, end, score): if scores[position] < score: scores[position] = score strings[position] = dir_contigs[dir_contig_name][start:end] contexts[position] = 
(dir_contig_name, start, end, score) for contig_filename in contig_filenames: execute(['nucmer', '--prefix', temp_prefix, #'--maxmatch', #Very slow '--nosimplify', '--minmatch', '9', '--mincluster', '50', #'--maxgap', '1000', #'--breaklen', '1000', # Increasing this reduces Ns, but is slow #'--diagfactor', '1.0', temp_prefix+'.fa', contig_filename]) for contig_name, contig_seq in io.read_sequences(contig_filename): contig_name = contig_name.split()[0] grace.status(ref_name + ' vs ' + contig_name) p = run(['show-aligns', temp_prefix+'.delta', 'ref', contig_name], stderr=subprocess.PIPE) alignments = [ ] while True: line = p.stdout.readline() if not line: break if not line.startswith('-- BEGIN'): continue parts = line.split() ref_start = int(parts[5]) ref_end = int(parts[7]) query_start = int(parts[10]) query_end = int(parts[12]) #assert ref_start < ref_end #ref_start -= 1 #Zero based coordinates al_ref = [ ] al_query = [ ] while True: block = [ ] end = False while True: line = p.stdout.readline() if line.startswith('-- END'): end = True break if line == '\n': if block: break else: continue block.append(line) if end: break al_ref.append(block[0].split()[1]) al_query.append(block[1].split()[1]) al_ref = ''.join(al_ref) al_query = ''.join(al_query) if ref_start > ref_end: al_ref = bio.reverse_complement(al_ref) al_query = bio.reverse_complement(al_query) ref_start, ref_end = ref_end, ref_start query_start, query_end = query_end, query_start if query_start > query_end: dir_contig_name = contig_name + '-' query_start = len(contig_seq)+1-query_start query_end = len(contig_seq)+1-query_end else: dir_contig_name = contig_name + '+' ref_start -= 1 #Zero based coordinates query_start -= 1 #print al_ref #print al_query #Pretty dumb scoring scheme al_score = 0 for i in xrange(len(al_ref)): if al_ref[i] == al_query[i]: al_score += 1 #else: # al_score -= 1 #Pastiche alignment over reference ref_pos = ref_start query_pos = query_start al_pos = 0 while al_pos < len(al_ref): assert 
al_ref[al_pos] != '.' if al_query[al_pos] == '.': put(ref_pos*2, dir_contig_name, query_pos, query_pos, al_score) else: assert al_query[al_pos].lower() == dir_contigs[dir_contig_name][query_pos].lower() put(ref_pos*2, dir_contig_name, query_pos, query_pos+1, al_score) query_pos += 1 al_pos += 1 al_pos_end = al_pos query_pos_end = query_pos while al_pos_end < len(al_ref) and al_ref[al_pos_end] == '.': al_pos_end += 1 query_pos_end += 1 #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score) assert al_query[al_pos:al_pos_end].lower() == dir_contigs[dir_contig_name][query_pos:query_pos_end].lower() put(ref_pos*2+1, dir_contig_name, query_pos,query_pos_end, al_score) al_pos = al_pos_end query_pos = query_pos_end ref_pos += 1 p.wait() grace.status(ref_name) result = ''.join(strings) io.write_fasta(out_f, ref_name, result) for context in contexts: if context is None: continue name,start,end,score = context for i in xrange(start,end): dir_contigs_used[name][i] = True #Interpolation #result = [ ] #i = 0 #while i < len(ref_seq): # if strings[i*2].upper() != 'N': # result.append(strings[i*2]) # result.append(strings[i*2+1]) # i += 1 # continue # # j = i # while strings[j*2].upper() == 'N': # j += 1 # # grace.status('') # print >> sys.stderr, 'interpolating', i+1,'..',j # # window = 20 #!!!!!!!!!!! 
# left_contexts = collections.defaultdict(lambda:0) # for i1 in xrange(max(0,i-window),i): # for context_name, context_start, context_end, context_score in contexts[i1*2]: # key = (context_name, context_end + i - i1) # left_contexts[key] = max(left_contexts[key],context_score) # # right_contexts = collections.defaultdict(lambda:0) # for j1 in xrange(j,min(j+window,len(ref_seq))): # for context_name, context_start, context_end, context_score in contexts[j1*2]: # key = (context_name, context_start + j - j1) # right_contexts[key] = max(left_contexts[key],context_score) # # #print >> sys.stderr, left_contexts # #print >> sys.stderr, right_contexts # # options = [ ] # # for (left_name, left_pos), left_score in left_contexts.items(): # for (right_name, right_pos), right_score in right_contexts.items(): # if left_name != right_name: continue # if right_pos < left_pos: continue # # if right_pos-left_pos > (j-i) * 4.0 + 10: continue #!!!!!!!!!!!!!!!!!!!!!!1 # if right_pos-left_pos < (j-i) * 0.25 - 10: continue # # score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i) # score *= left_score + right_score # #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score # options.append( (score, left_name, left_pos, right_pos) ) # # if options: # best = max(options, key=lambda option: option[0]) # print >> sys.stderr, '->', best # result.append( dir_contigs[best[1]][best[2]:best[3]].lower() ) # else: # print >> sys.stderr, '-> no good interpolation' # result.append( ref_seq[i:j] ) # # i = j # #result = ''.join(result) #io.write_fasta(sys.stdout, ref_name, result) #print >> sys.stderr, len(result), result.count('N') #for pos, size in N_runs: # out_size = len(''.join( strings[pos*2:pos*2+2] )) # print >> sys.stderr, pos, size, '->', out_size out_f.close() grace.status('') #for name, seq in io.read_sequences(ref_filename): # result = pastiche(seq, contigs_filename) # io.write_fasta(sys.stdout, name, result) leftover_f = workspace.open('leftovers.fa','wb') for 
name in sorted(contigs): used = [ (a or b) for a,b in zip(dir_contigs_used[name+'+'],dir_contigs_used[name+'-'][::-1]) ] i = 0 while i < len(used): j = i while j < len(used) and not used[j]: j += 1 if j-i > min_leftover: if i == 0 and j == len(used): out_name = name else: out_name = name + ':%d..%d' % (i+1,j) io.write_fasta(leftover_f, out_name, contigs[name][i:j]) i = j+1 leftover_f.close() for suffix in ['.fa', '.delta']: os.unlink(temp_prefix + suffix)