def _get_timestamp(action):
    """
    Return the timestamp pickled for an equal, earlier instance of action.

    The state file is looked up in the .state subdirectory of the current
    directory (which is created if it does not exist), under a name derived
    from action.ident(). The timestamp is returned only if the unpickled
    object compares equal to action and has a timestamp attribute.

    In every other case None is returned. Any exception is caught, its
    traceback printed, and an 'Error making ..., re-running' message is
    written to stderr.
    """
    try:
        if not os.path.exists('.state'):
            os.mkdir('.state')

        state_path = os.path.join(
            '.state', grace.filesystem_friendly_name(action.ident()))
        if not os.path.exists(state_path):
            return None

        with open(state_path, 'rb') as state_file:
            previous = pickle.load(state_file)

        if action == previous and hasattr(previous, 'timestamp'):
            return previous.timestamp
        return None
    except Exception as error:
        # Best effort: report the problem and fall through to "no timestamp".
        import traceback
        traceback.print_exc()
        print >> sys.stderr, 'Error making %s, re-running: %s' % (action.ident(), error)
    return None
def _get_timestamp(action):
    """
    Return the timestamp pickled for an equal, earlier instance of action.

    None is returned straight away when the action's shell name matches
    LOCAL.do_selection. Otherwise two candidate state files are considered,
    in order: action.state_filename() and the old location under .state
    named after action.ident(). Only the first candidate that exists is
    consulted. Its timestamp is returned if the unpickled object equals
    action, has a timestamp attribute, and (when it has a timestamp_for
    attribute) that attribute equals the file name it was loaded from.

    In every other case None is returned. Any exception is caught, its
    traceback printed, and an 'Error making ..., re-running' message is
    written to stderr.
    """
    if selection.matches(LOCAL.do_selection, [action.shell_name()]):
        return None

    try:
        candidates = (
            action.state_filename(),
            # Old location of state files
            os.path.join('.state', grace.filesystem_friendly_name(action.ident())),
        )
        for state_path in candidates:
            if not os.path.exists(state_path):
                continue

            with open(state_path, 'rb') as state_file:
                previous = pickle.load(state_file)

            if action != previous:
                return None
            if not hasattr(previous, 'timestamp'):
                return None
            if hasattr(previous, 'timestamp_for') and previous.timestamp_for != state_path:
                return None
            return previous.timestamp
    except Exception as error:
        # Best effort: report the problem and fall through to "no timestamp".
        import traceback
        traceback.print_exc()
        print >> sys.stderr, 'Error making %s, re-running: %s' % (action.ident(), error)
    return None
def get_graph(path, name, suffix):
    """
    Read '<friendly name>-<suffix>.userplot' from path as a numpy array.

    Every line of the file is stripped and converted with float(), so a
    blank or non-numeric line raises ValueError.
    """
    filename = os.path.join(
        path, grace.filesystem_friendly_name(name) + '-' + suffix + '.userplot')
    # Use a with-block so the file is closed even if a line fails to parse.
    with open(filename, 'rb') as f:
        values = [float(line.strip()) for line in f]
    return numpy.array(values)
def target(self, path, dep, *commands):
    """
    Add a make rule for path and return the name of its state file.

    path is a directory or prefix or directory/prefix
    dep is an iterable of prerequisite names (joined with spaces)
    commands are the commands to execute to produce it

    The state file name is the friendly form of path, an underscore, and the
    SHA-1 hex digest of the stripped commands joined by newlines. It is
    appended to self.all, and the rule lines are appended to self.lines.
    """
    state_prefix = join('state', grace.filesystem_friendly_name(path) + '_')
    digest = hashlib.sha1('\n'.join(item.strip() for item in commands)).hexdigest()
    state_name = state_prefix + digest
    self.all.append(state_name)

    rule = ['', '%s : %s' % (state_name, ' '.join(dep))]
    # Forty '?' wildcards: the length of a SHA-1 hex digest, so this removes
    # any state file with the same prefix before the commands run.
    rule.append('\t@rm -f %s%s' % (state_prefix, '?' * 40))
    for command in commands:
        rule.append('\t%s' % make_quote(self.submit.replace('%', command)))
    rule.append('\t@touch %s' % state_name)

    self.lines.extend(rule)
    return state_name
def get_graph(path, name, suffix):
    """
    Read '<friendly name>-<suffix>.userplot' from path as a numpy array.

    Every line of the file is stripped and converted with float(), so a
    blank or non-numeric line raises ValueError.
    """
    filename = os.path.join(
        path, grace.filesystem_friendly_name(name) + '-' + suffix + '.userplot')
    # Use a with-block so the file is closed even if a line fails to parse.
    with open(filename, 'rb') as f:
        values = [float(line.strip()) for line in f]
    return numpy.array(values)
def _run_and_save_state(action, timestamp):
    """
    Run action (unless LOCAL.do_nothing is set) and pickle it as its state.

    Any existing state file in .state is removed before running. After the
    run, LOCAL.time is raised to at least timestamp, the timestamp is stored
    on the action, and the action is pickled to a 'temp-' file which is then
    renamed onto the real state file name.

    Returns the result of action.run(), or None when LOCAL.do_nothing is set.
    """
    friendly = grace.filesystem_friendly_name(action.ident())
    state_path = os.path.join('.state', friendly)
    scratch_path = os.path.join('.state', 'temp-' + friendly)

    # Drop the old state first, so no state file exists while the action runs.
    if os.path.exists(state_path):
        os.unlink(state_path)

    result = None if LOCAL.do_nothing else action.run()

    LOCAL.time = max(LOCAL.time, timestamp)
    action.timestamp = timestamp

    # Write to a temporary name and rename it into place afterwards.
    with open(scratch_path, 'wb') as state_file:
        pickle.dump(action, state_file)
    os.rename(scratch_path, state_path)

    return result
def genbank_callback(name, record):
    """
    Make a copy of any genbank files passed in.

    The record is written to reference_genbank_file and to
    '<friendly name>.gbk' under self, then any_genbank[0] is set to True.
    self, reference_genbank_file and any_genbank come from the enclosing
    scope.
    """
    from Bio import SeqIO

    SeqIO.write([record], reference_genbank_file, 'genbank')

    # with-block: the per-record copy is closed even if SeqIO.write fails.
    with open(self / (grace.filesystem_friendly_name(name) + '.gbk'), 'wb') as f:
        SeqIO.write([record], f, 'genbank')

    any_genbank[0] = True
def evidence_reader(working_dir, name):
    """
    Yield Call objects read from '<friendly name>-evidence.txt' in working_dir.

    Each data line is split on tabs and produces two calls, in this order:
    Call(fields[4], fields[1], fields[6]) and
    Call(fields[5], fields[2], fields[7]).

    Raises grace.Error if the header line does not contain exactly 7 tabs.
    """
    filename = os.path.join(
        working_dir, grace.filesystem_friendly_name(name) + '-evidence.txt')

    # The with-block closes the file when the generator is exhausted, when
    # the header check fails, and when the generator is closed early.
    with open(filename, 'rb') as f:
        header = f.readline()
        if header.count('\t') != 7:
            raise grace.Error('Old style evidence file. Please re-run nesoni consensus.')

        for line in f:
            fields = line.rstrip('\n').split('\t')
            yield Call(fields[4], fields[1], fields[6])
            yield Call(fields[5], fields[2], fields[7])
def evidence_reader(working_dir, name):
    """
    Yield Call objects read from '<friendly name>-evidence.txt' in working_dir.

    Each data line is split on tabs and produces two calls, in this order:
    Call(fields[4], fields[1], fields[6]) and
    Call(fields[5], fields[2], fields[7]).

    Raises grace.Error if the header line does not contain exactly 7 tabs.
    """
    filename = os.path.join(
        working_dir, grace.filesystem_friendly_name(name) + "-evidence.txt")

    # The with-block closes the file when the generator is exhausted, when
    # the header check fails, and when the generator is closed early.
    with open(filename, "rb") as f:
        header = f.readline()
        if header.count("\t") != 7:
            raise grace.Error("Old style evidence file. Please re-run nesoni consensus.")

        for line in f:
            fields = line.rstrip("\n").split("\t")
            yield Call(fields[4], fields[1], fields[6])
            yield Call(fields[5], fields[2], fields[7])
def debias(args):
    """
    Write expected and bias-corrected depth userplots for working directories.

    args is a command-line list: an optional '--radius N' (default 2)
    followed by directory names. For every sequence in each directory's
    reference.fa, and for both the plain and pair-span depth plots, the
    expected depth is computed with expected_depth() and three userplots are
    written: '-expected', '-corrected' and the ambiguous '-corrected' plot.
    A plot pair is skipped if either input file is missing.
    """
    import numpy

    radius, args = grace.get_option_value(args, '--radius', int, 2)

    plot_kinds = [
        ('-depth', '-ambiguous-depth'),
        ('-pairspan-depth', '-ambiguous-pairspan-depth'),
    ]

    for dir_name in args:
        reference = os.path.join(dir_name, 'reference.fa')
        for name, seq in io.read_sequences(reference):
            for suffix, ambig_suffix in plot_kinds:
                root = grace.filesystem_friendly_name(name)
                depth_path = os.path.join(dir_name, root + suffix + '.userplot')
                ambig_path = os.path.join(dir_name, root + ambig_suffix + '.userplot')
                if not (os.path.exists(depth_path) and os.path.exists(ambig_path)):
                    continue

                output_suffix = '-%d.userplot' % radius

                print(' '.join((dir_name, root, output_suffix)))

                depths = numpy.array(read_unstranded_userplot(depth_path))
                ambig_depths = numpy.array(read_unstranded_userplot(ambig_path))
                expect = expected_depth(root, seq, depths, ambig_depths, radius)

                def out_path(stem, label):
                    return os.path.join(dir_name, root + stem + label + output_suffix)

                write_unstranded_userplot(out_path(suffix, '-expected'), expect)

                # Scale observed/expected back up by the median expected depth.
                corrected = depths / expect * numpy.median(expect)
                corrected[expect <= 5.0] = 0.0
                write_unstranded_userplot(out_path(suffix, '-corrected'), corrected)

                # NOTE(review): the ambiguous plot is zeroed at expect <= 0.0,
                # the unambiguous one at expect <= 5.0 - confirm this is intended.
                ambig_corrected = ambig_depths / expect * numpy.median(expect)
                ambig_corrected[expect <= 0.0] = 0.0
                write_unstranded_userplot(
                    out_path(ambig_suffix, '-corrected'), ambig_corrected)
def run(self):
    """
    Split each table in self.files into per-chromosome userplot files.

    For every input table, one output file is created per chromosome and per
    column from the fifth heading onward. It is named
    '<table name>-<chromosome>-<heading>.userplot' inside the workspace at
    self.output_dir. Each record contributes one line to each of its
    column files.

    Records of a chromosome must be consecutive single positions starting at
    0 (Start == position, End == position + 1); otherwise an AssertionError
    is raised.
    """
    working = io.Workspace(self.output_dir, must_exist=False)

    for filename in self.files:
        reader = io.Table_reader(filename)
        name = os.path.splitext(os.path.split(filename)[1])[0]

        rname = None
        files = []
        try:
            for record in reader:
                if record['Chromosome'] != rname:
                    # New chromosome: finish the previous set of plots.
                    for item in files:
                        item.close()
                    files = []

                    rname = record['Chromosome']
                    grace.status('Convert ' + name + ' ' + rname)
                    # Append one file at a time so that files already opened
                    # are still closed if a later open() fails.
                    for heading in reader.headings[4:]:
                        files.append(open(working / (
                            name + '-' +
                            grace.filesystem_friendly_name(rname) + '-' +
                            grace.filesystem_friendly_name(heading) + '.userplot'
                        ), 'wb'))
                    pos = 0

                assert int(record['Start']) == pos and int(record['End']) == pos + 1

                for val, f in zip(record.values()[4:], files):
                    print >> f, val

                pos += 1
        finally:
            # Runs on success and on failure, so output files never leak.
            for item in files:
                item.close()

    grace.status('')
def debias(args):
    """
    Write expected and bias-corrected depth userplots for working directories.

    args is a command-line list: an optional '--radius N' (default 2)
    followed by directory names. For every sequence in each directory's
    reference.fa, and for both the plain and pair-span depth plots, the
    expected depth is computed with expected_depth() and three userplots are
    written: '-expected', '-corrected' and the ambiguous '-corrected' plot.
    A plot pair is skipped if either input file is missing.
    """
    import numpy

    radius, args = grace.get_option_value(args, '--radius', int, 2)

    plot_kinds = [
        ('-depth', '-ambiguous-depth'),
        ('-pairspan-depth', '-ambiguous-pairspan-depth'),
    ]

    for dir_name in args:
        reference = os.path.join(dir_name, 'reference.fa')
        for name, seq in io.read_sequences(reference):
            for suffix, ambig_suffix in plot_kinds:
                root = grace.filesystem_friendly_name(name)
                depth_path = os.path.join(dir_name, root + suffix + '.userplot')
                ambig_path = os.path.join(dir_name, root + ambig_suffix + '.userplot')
                if not (os.path.exists(depth_path) and os.path.exists(ambig_path)):
                    continue

                output_suffix = '-%d.userplot' % radius

                print(' '.join((dir_name, root, output_suffix)))

                depths = numpy.array(read_unstranded_userplot(depth_path))
                ambig_depths = numpy.array(read_unstranded_userplot(ambig_path))
                expect = expected_depth(root, seq, depths, ambig_depths, radius)

                def out_path(stem, label):
                    return os.path.join(dir_name, root + stem + label + output_suffix)

                write_unstranded_userplot(out_path(suffix, '-expected'), expect)

                # Scale observed/expected back up by the median expected depth.
                corrected = depths / expect * numpy.median(expect)
                corrected[expect <= 5.0] = 0.0
                write_unstranded_userplot(out_path(suffix, '-corrected'), corrected)

                # NOTE(review): the ambiguous plot is zeroed at expect <= 0.0,
                # the unambiguous one at expect <= 5.0 - confirm this is intended.
                ambig_corrected = ambig_depths / expect * numpy.median(expect)
                ambig_corrected[expect <= 0.0] = 0.0
                write_unstranded_userplot(
                    out_path(ambig_suffix, '-corrected'), ambig_corrected)
def _get_timestamp(action):
    """
    Return the timestamp pickled for an equal, earlier instance of action.

    None is returned straight away when the action's shell name matches
    LOCAL.do_selection. Otherwise the state file is looked for first at
    action.state_filename() and then at the old location under .state named
    after action.ident(). Only the first of these that exists is read.

    The stored timestamp is returned when the unpickled object equals
    action, has a timestamp attribute, and has no timestamp_for attribute
    that disagrees with the file it was read from. In all other cases the
    result is None. Exceptions are caught and reported on stderr with a
    traceback.
    """
    if selection.matches(LOCAL.do_selection, [action.shell_name()]):
        return None

    try:
        locations = [
            action.state_filename(),
            # Old location of state files
            os.path.join('.state', grace.filesystem_friendly_name(action.ident())),
        ]
        for location in locations:
            if not os.path.exists(location):
                continue

            with open(location, 'rb') as handle:
                stored = pickle.load(handle)

            if action != stored or not hasattr(stored, 'timestamp'):
                return None

            # A state file copied or moved from elsewhere does not count.
            moved = hasattr(stored, 'timestamp_for') and stored.timestamp_for != location
            if moved:
                return None
            return stored.timestamp
    except Exception as error:
        # Best effort: report the problem and fall through to "no timestamp".
        import traceback
        traceback.print_exc()
        print >> sys.stderr, 'Error making %s, re-running: %s' % (action.ident(), error)
    return None
def main(args):
    """
    Command-line entry point: write the core or unique parts of a reference.

    args holds the options --mincov (default 1), --maxdiff (default 16),
    --minsize (default 200) and --what (default 'core'), followed by an
    output directory and one or more working directories. If fewer than two
    positional arguments remain, HELP is printed to stderr and
    grace.Help_shown is raised.

    For each sequence in the first working directory's reference.fa, a
    position is "good" if, in every working directory, its depth is at least
    mincov (core, read from '-depth.userplot') or below mincov (unique, read
    from '-ambiguous-depth.userplot'). Runs of at most maxdiff non-good
    positions between two good positions are then marked good as well.

    Four FASTA files are written per sequence to the output directory:
    non-good bases replaced by N, non-good bases lower-cased, and the good
    and non-good runs of at least minsize bases as separate parts.
    """
    mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
    is_core = (what == 'core')

    grace.expect_no_further_options(args)

    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()

    output_dir, working_dirs = args[0], args[1:]

    # A reference.fa in the first argument means it is a working directory.
    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'

    if not path.exists(output_dir):
        os.mkdir(output_dir)

    for name, seq in io.read_sequences(path.join(working_dirs[0], 'reference.fa')):
        print(name)
        friendly_name = grace.filesystem_friendly_name(name)

        good = [True] * len(seq)

        for working_dir in working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix))
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= mincov
                    else:
                        good[i] = data[i] < mincov

        # Close holes. start is the position just after the last good base;
        # its initial value prevents a leading hole from being closed.
        start = -maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        print('Closed ' + grace.pretty_number(n_holes) + ' holes')

        # with-blocks make sure each output file is closed even on failure.
        with open(path.join(output_dir, '%s-%s.fa' % (friendly_name, what)), 'wb') as f:
            io.write_fasta(
                f, name,
                ''.join([(seq[i] if good[i] else 'N')
                         for i in xrange(len(seq))]))

        with open(path.join(output_dir, '%s-%s_masked.fa' % (friendly_name, what)), 'wb') as f:
            io.write_fasta(
                f, name,
                ''.join([(seq[i] if good[i] else seq[i].lower())
                         for i in xrange(len(seq))]))

        good_filename = path.join(output_dir, '%s-%s_parts.fa' % (friendly_name, what))
        nongood_filename = path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name, what))
        with open(good_filename, 'wb') as f_good:
            with open(nongood_filename, 'wb') as f_nongood:
                start = 0
                # One-element lists so that emit() can update the counters.
                n_good = [0]
                n_good_bases = [0]

                def emit(i):
                    # Write the run seq[start:i], reading the enclosing
                    # start at call time; short runs are skipped.
                    if i - start < minsize:
                        return
                    if good[start]:
                        n_good[0] += 1
                        n_good_bases[0] += i - start
                    io.write_fasta(
                        f_good if good[start] else f_nongood,
                        '%s:%d..%d' % (name, start + 1, i),
                        seq[start:i])

                for i in xrange(1, len(seq)):
                    if good[i] != good[start]:
                        emit(i)
                        start = i
                emit(len(seq))

        print(grace.pretty_number(sum(good)) + ' bases are ' + what + ', of ' +
              grace.pretty_number(len(seq)) + ' in reference sequence')
        print(grace.pretty_number(n_good[0]) + ' parts at least ' +
              grace.pretty_number(minsize) + ' bases long with ' +
              grace.pretty_number(n_good_bases[0]) + ' total bases')
        print('')
def main(args):
    """
    Command-line entry point: write the core or unique parts of a reference.

    args holds the options --mincov (default 1), --maxdiff (default 16),
    --minsize (default 200) and --what (default 'core'), followed by an
    output directory and one or more working directories. If fewer than two
    positional arguments remain, HELP is printed to stderr and
    grace.Help_shown is raised.

    For each sequence in the first working directory's reference.fa, a
    position is "good" if, in every working directory, its depth is at least
    mincov (core, read from '-depth.userplot') or below mincov (unique, read
    from '-ambiguous-depth.userplot'). Runs of at most maxdiff non-good
    positions between two good positions are then marked good as well.

    Four FASTA files are written per sequence to the output directory:
    non-good bases replaced by N, non-good bases lower-cased, and the good
    and non-good runs of at least minsize bases as separate parts.
    """
    mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
    is_core = (what == 'core')

    grace.expect_no_further_options(args)

    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()

    output_dir, working_dirs = args[0], args[1:]

    # A reference.fa in the first argument means it is a working directory.
    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'

    if not path.exists(output_dir):
        os.mkdir(output_dir)

    for name, seq in io.read_sequences(path.join(working_dirs[0], 'reference.fa')):
        print(name)
        friendly_name = grace.filesystem_friendly_name(name)

        good = [True] * len(seq)

        for working_dir in working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix))
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= mincov
                    else:
                        good[i] = data[i] < mincov

        # Close holes. start is the position just after the last good base;
        # its initial value prevents a leading hole from being closed.
        start = -maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        print('Closed ' + grace.pretty_number(n_holes) + ' holes')

        # with-blocks make sure each output file is closed even on failure.
        with open(path.join(output_dir, '%s-%s.fa' % (friendly_name, what)), 'wb') as f:
            io.write_fasta(
                f, name,
                ''.join([(seq[i] if good[i] else 'N')
                         for i in xrange(len(seq))]))

        with open(path.join(output_dir, '%s-%s_masked.fa' % (friendly_name, what)), 'wb') as f:
            io.write_fasta(
                f, name,
                ''.join([(seq[i] if good[i] else seq[i].lower())
                         for i in xrange(len(seq))]))

        good_filename = path.join(output_dir, '%s-%s_parts.fa' % (friendly_name, what))
        nongood_filename = path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name, what))
        with open(good_filename, 'wb') as f_good:
            with open(nongood_filename, 'wb') as f_nongood:
                start = 0
                # One-element lists so that emit() can update the counters.
                n_good = [0]
                n_good_bases = [0]

                def emit(i):
                    # Write the run seq[start:i], reading the enclosing
                    # start at call time; short runs are skipped.
                    if i - start < minsize:
                        return
                    if good[start]:
                        n_good[0] += 1
                        n_good_bases[0] += i - start
                    io.write_fasta(
                        f_good if good[start] else f_nongood,
                        '%s:%d..%d' % (name, start + 1, i),
                        seq[start:i])

                for i in xrange(1, len(seq)):
                    if good[i] != good[start]:
                        emit(i)
                        start = i
                emit(len(seq))

        print(grace.pretty_number(sum(good)) + ' bases are ' + what + ', of ' +
              grace.pretty_number(len(seq)) + ' in reference sequence')
        print(grace.pretty_number(n_good[0]) + ' parts at least ' +
              grace.pretty_number(minsize) + ' bases long with ' +
              grace.pretty_number(n_good_bases[0]) + ' total bases')
        print('')
def main(args):
    """
    Command-line entry point: report significant differences in evidence.

    args holds the optional --title1 and --title2 options followed by
    exactly three positional arguments: two working directories and a
    significance cutoff. With any other number of positional arguments,
    USAGE is printed to stderr and 1 is returned.

    The evidence files of both working directories are read in lockstep for
    every sequence in the first directory's reference.fa. A tab-separated
    row is printed to stdout for each position whose insertion or
    substitution evidence differs with significance <= cutoff.

    Returns 0 on completion.
    """
    title1, args = grace.get_option_value(args, '--title1', str, None)
    title2, args = grace.get_option_value(args, '--title2', str, None)
    grace.expect_no_further_options(args)

    if len(args) != 3:
        print >> sys.stderr, USAGE
        return 1

    working_dir1 = args[0]
    working_dir2 = args[1]
    cutoff = float(args[2])

    sequence_names = [
        name
        for name, sequence
        in io.read_sequences(os.path.join(working_dir1, 'reference.fa'))
    ]

    if title1 is None:
        title1 = working_dir1
    if title2 is None:
        title2 = working_dir2

    # Smallest depth at which n reads of A against n reads of T reaches the
    # cutoff.
    n = 1
    while significance([('A', n)], [('T', n)], 1.0) > cutoff:
        n += 1

    print('%g\tsignificance cutoff' % cutoff)
    print('%d\tdepth required to call substitution (greater if there are errors in the reads)' % n)

    print('Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s' % (title1, title2, title1, title2))

    for sequence_name in sequence_names:
        friendly_name = grace.filesystem_friendly_name(sequence_name)
        filename1 = os.path.join(working_dir1, friendly_name + '-evidence.txt')
        filename2 = os.path.join(working_dir2, friendly_name + '-evidence.txt')

        for (pos1, ins1, sub1, ref1, conins1, consub1), (pos2, ins2, sub2, ref2, conins2, consub2) in itertools.izip(read_file(filename1), read_file(filename2)):
            assert pos1 == pos2 and ref1 == ref2

            if pos1 % 1000 == 0:
                grace.status('Testing %s %d' % (sequence_name, pos1))

            dec_ins1 = io.decode_evidence(ins1)
            dec_ins2 = io.decode_evidence(ins2)
            if dec_ins1 and dec_ins2:
                # Reuse the decoded evidence rather than decoding it again,
                # as the substitution branch below already does.
                sig = significance(dec_ins1, dec_ins2, cutoff)
                if sig is not None and sig <= cutoff:
                    grace.status('')
                    print('%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s' % (sequence_name, pos1, 'insertion-before', ins1, ins2, sig, conins1, conins2))

            dec_sub1 = io.decode_evidence(sub1)
            dec_sub2 = io.decode_evidence(sub2)
            if dec_sub1 and dec_sub2:
                sig = significance(dec_sub1, dec_sub2, cutoff)
                if sig is not None and sig <= cutoff:
                    # Classify by the first entry of each decoded evidence list.
                    if dec_sub1[0][0] == '-' or dec_sub2[0][0] == '-':
                        what = 'deletion'
                    elif dec_sub1[0][0] != dec_sub2[0][0]:
                        what = 'substitution'
                    else:
                        what = 'different mix'
                    grace.status('')
                    print('%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s' % (sequence_name, pos1, what, ref1, sub1, sub2, sig, consub1, consub2))

    grace.status('')
    return 0
def run(self):
    """
    Write the core or unique parts of a reference into the workspace.

    Settings come from self.what ('core' or 'unique'), self.mincov,
    self.maxdiff, self.minsize and self.working_dirs; an AssertionError is
    raised for any other value of self.what.

    For each sequence of the first working directory's reference, a position
    is "good" if, in every working directory, its depth is at least mincov
    (core, read from '-depth.userplot') or below mincov (unique, read from
    '-ambiguous-depth.userplot'). Runs of at most maxdiff non-good positions
    between two good positions are then marked good as well.

    Four FASTA files are written per sequence: non-good bases replaced by N,
    non-good bases lower-cased, and the good and non-good runs of at least
    minsize bases as separate parts. Progress and totals go to self.log.
    """
    assert self.what in ('core', 'unique'), 'Expected --what to be either "core" or "unique".'
    is_core = (self.what == 'core')

    workspace = self.get_workspace()

    reference = working_directory.Working(self.working_dirs[0]).get_reference()
    for name, seq in io.read_sequences(reference.reference_fasta_filename()):
        self.log.log(name + '\n')
        friendly_name = grace.filesystem_friendly_name(name)

        good = [True] * len(seq)

        for working_dir in self.working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix))
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= self.mincov
                    else:
                        good[i] = data[i] < self.mincov

        # Close holes. start is the position just after the last good base;
        # its initial value prevents a leading hole from being closed.
        start = -self.maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= self.maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        self.log.log('Closed ' + grace.pretty_number(n_holes) + ' holes\n')

        # with-blocks make sure each output file is closed even on failure.
        with open(workspace / ('%s-%s.fa' % (friendly_name, self.what)), 'wb') as f:
            io.write_fasta(
                f, name,
                ''.join([(seq[i] if good[i] else 'N')
                         for i in xrange(len(seq))]))

        with open(workspace / ('%s-%s_masked.fa' % (friendly_name, self.what)), 'wb') as f:
            io.write_fasta(
                f, name,
                ''.join([(seq[i] if good[i] else seq[i].lower())
                         for i in xrange(len(seq))]))

        good_filename = workspace / ('%s-%s_parts.fa' % (friendly_name, self.what))
        nongood_filename = workspace / ('%s-non%s_parts.fa' % (friendly_name, self.what))
        with open(good_filename, 'wb') as f_good:
            with open(nongood_filename, 'wb') as f_nongood:
                start = 0
                # One-element lists so that emit() can update the counters.
                n_good = [0]
                n_good_bases = [0]

                def emit(i):
                    # Write the run seq[start:i], reading the enclosing
                    # start at call time; short runs are skipped.
                    if i - start < self.minsize:
                        return
                    if good[start]:
                        n_good[0] += 1
                        n_good_bases[0] += i - start
                    io.write_fasta(
                        f_good if good[start] else f_nongood,
                        '%s:%d..%d' % (name, start + 1, i),
                        seq[start:i])

                for i in xrange(1, len(seq)):
                    if good[i] != good[start]:
                        emit(i)
                        start = i
                emit(len(seq))

        self.log.log(grace.pretty_number(sum(good)) + ' bases are ' + self.what + ', of ' + grace.pretty_number(len(seq)) + ' in reference sequence\n')
        self.log.log(grace.pretty_number(n_good[0]) + ' parts at least ' + grace.pretty_number(self.minsize) + ' bases long with ' + grace.pretty_number(n_good_bases[0]) + ' total bases\n')
        self.log.log('\n')
def run(self):
    """
    Write a report of significant evidence differences to '<prefix>.txt'.

    Settings come from self.title1, self.title2, self.working_dir1,
    self.working_dir2, self.cutoff and self.prefix. A missing title defaults
    to the corresponding working directory's name.

    The evidence files of both working directories are read in lockstep for
    every sequence of the first directory's reference. A tab-separated row
    is written (and flushed) for each position whose insertion or
    substitution evidence differs with significance <= cutoff.

    Returns 0 on completion.
    """
    title1 = self.title1
    title2 = self.title2

    working1 = working_directory.Working(self.working_dir1)
    working2 = working_directory.Working(self.working_dir2)

    cutoff = self.cutoff

    sequence_names = [name for name, length in working1.get_reference().get_lengths()]

    if title1 is None:
        title1 = working1.name
    if title2 is None:
        title2 = working2.name

    # Smallest depth at which n reads of A against n reads of T reaches the
    # cutoff.
    n = 1
    while significance([('A', n)], [('T', n)], 1.0) > cutoff:
        n += 1

    # with-block: the report is closed even if reading the evidence fails.
    with open(self.prefix + '.txt', 'wb') as f:
        print >> f, '%g\tsignificance cutoff' % cutoff
        print >> f, '%d\tdepth required to call substitution (greater if there are errors in the reads)' % n

        print >> f, 'Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s' % (title1, title2, title1, title2)

        for sequence_name in sequence_names:
            friendly_name = grace.filesystem_friendly_name(sequence_name)
            filename1 = working1 / (friendly_name + '-evidence.txt')
            filename2 = working2 / (friendly_name + '-evidence.txt')

            for (pos1, ins1, sub1, ref1, conins1, consub1), (pos2, ins2, sub2, ref2, conins2, consub2) in itertools.izip(read_file(filename1), read_file(filename2)):
                assert pos1 == pos2 and ref1 == ref2

                if pos1 % 1000 == 0:
                    grace.status('Testing %s %d' % (sequence_name, pos1))

                dec_ins1 = io.decode_evidence(ins1)
                dec_ins2 = io.decode_evidence(ins2)
                if dec_ins1 and dec_ins2:
                    # Reuse the decoded evidence rather than decoding it
                    # again, as the substitution branch below already does.
                    sig = significance(dec_ins1, dec_ins2, cutoff)
                    if sig is not None and sig <= cutoff:
                        print >> f, '%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s' % (sequence_name, pos1, 'insertion-before', ins1, ins2, sig, conins1, conins2)
                        f.flush()

                dec_sub1 = io.decode_evidence(sub1)
                dec_sub2 = io.decode_evidence(sub2)
                if dec_sub1 and dec_sub2:
                    sig = significance(dec_sub1, dec_sub2, cutoff)
                    if sig is not None and sig <= cutoff:
                        # Classify by the first entry of each decoded list.
                        if dec_sub1[0][0] == '-' or dec_sub2[0][0] == '-':
                            what = 'deletion'
                        elif dec_sub1[0][0] != dec_sub2[0][0]:
                            what = 'substitution'
                        else:
                            what = 'different mix'
                        print >> f, '%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s' % (sequence_name, pos1, what, ref1, sub1, sub2, sig, consub1, consub2)
                        f.flush()

    grace.status('')
    return 0
def run(self):
    """
    Write a report of significant evidence differences to '<prefix>.txt'.

    Settings come from self.title1, self.title2, self.working_dir1,
    self.working_dir2, self.cutoff and self.prefix. A missing title defaults
    to the corresponding working directory's name.

    The evidence files of both working directories are read in lockstep for
    every sequence of the first directory's reference. A tab-separated row
    is written (and flushed) for each position whose insertion or
    substitution evidence differs with significance <= cutoff.

    Returns 0 on completion.
    """
    title1 = self.title1
    title2 = self.title2

    working1 = working_directory.Working(self.working_dir1)
    working2 = working_directory.Working(self.working_dir2)

    cutoff = self.cutoff

    sequence_names = [name for name, length in working1.get_reference().get_lengths()]

    if title1 is None:
        title1 = working1.name
    if title2 is None:
        title2 = working2.name

    # Smallest depth at which n reads of A against n reads of T reaches the
    # cutoff.
    n = 1
    while significance([('A', n)], [('T', n)], 1.0) > cutoff:
        n += 1

    # with-block: the report is closed even if reading the evidence fails.
    with open(self.prefix + '.txt', 'wb') as f:
        print >> f, '%g\tsignificance cutoff' % cutoff
        print >> f, '%d\tdepth required to call substitution (greater if there are errors in the reads)' % n

        print >> f, 'Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s' % (title1, title2, title1, title2)

        for sequence_name in sequence_names:
            friendly_name = grace.filesystem_friendly_name(sequence_name)
            filename1 = working1 / (friendly_name + '-evidence.txt')
            filename2 = working2 / (friendly_name + '-evidence.txt')

            for (pos1, ins1, sub1, ref1, conins1, consub1), (pos2, ins2, sub2, ref2, conins2, consub2) in itertools.izip(read_file(filename1), read_file(filename2)):
                assert pos1 == pos2 and ref1 == ref2

                if pos1 % 1000 == 0:
                    grace.status('Testing %s %d' % (sequence_name, pos1))

                dec_ins1 = io.decode_evidence(ins1)
                dec_ins2 = io.decode_evidence(ins2)
                if dec_ins1 and dec_ins2:
                    # Reuse the decoded evidence rather than decoding it
                    # again, as the substitution branch below already does.
                    sig = significance(dec_ins1, dec_ins2, cutoff)
                    if sig is not None and sig <= cutoff:
                        print >> f, '%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s' % (sequence_name, pos1, 'insertion-before', ins1, ins2, sig, conins1, conins2)
                        f.flush()

                dec_sub1 = io.decode_evidence(sub1)
                dec_sub2 = io.decode_evidence(sub2)
                if dec_sub1 and dec_sub2:
                    sig = significance(dec_sub1, dec_sub2, cutoff)
                    if sig is not None and sig <= cutoff:
                        # Classify by the first entry of each decoded list.
                        if dec_sub1[0][0] == '-' or dec_sub2[0][0] == '-':
                            what = 'deletion'
                        elif dec_sub1[0][0] != dec_sub2[0][0]:
                            what = 'substitution'
                        else:
                            what = 'different mix'
                        print >> f, '%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s' % (sequence_name, pos1, what, ref1, sub1, sub2, sig, consub1, consub2)
                        f.flush()

    grace.status('')
    return 0
def old_main(args):
    """
    Command-line entry point: count consensus differences between strains.

    args holds the integer options --indels (default 1), --reference
    (default 1), --list (default 0) and --fasta (default 0), followed by one
    or more working directories. With no working directory, USAGE is written
    to stderr and 1 is returned. 1 is also returned when an evidence file
    header does not contain exactly 5 tabs.

    Calls and evidence are loaded from each working directory's evidence
    files, with the reference itself as an extra first "strain" unless
    --reference is 0. Interesting positions come from find_interesting().
    With --fasta they are passed to do_fasta_output() and 0 is returned.

    Otherwise every split of the names into two groups is reported with the
    number of unambiguous positions at which each group agrees internally.
    Ambiguous positions and positions with more than two distinct values are
    counted as ignored. With --list the individual positions are printed as
    well.

    Returns 0 on completion.
    """
    use_indels, args = grace.get_option_value(args, '--indels', int, 1)
    use_reference, args = grace.get_option_value(args, '--reference', int, 1)
    make_list, args = grace.get_option_value(args, '--list', int, 0)
    fasta_output, args = grace.get_option_value(args, '--fasta', int, 0)
    grace.expect_no_further_options(args)

    if len(args) < 1:
        sys.stderr.write(USAGE)
        return 1

    if fasta_output and use_indels:
        print >> sys.stderr, 'Indels will not be included in FASTA output'
        use_indels = 0

    working_dirs = args

    names = ['reference'] + working_dirs

    substitution_calls = {}  # ref_name -> [ [ call ] ]
    insertion_calls = {}  # ref_name -> [ [ call ] ]
    substitution_evidence = {}
    insertion_evidence = {}
    for name, sequence in io.read_sequences(os.path.join(working_dirs[0], 'reference.fa')):
        # Entry 0 of each list describes the reference itself.
        substitution_calls[name] = [list(sequence.upper())]
        insertion_calls[name] = [['-'] * len(sequence)]
        substitution_evidence[name] = [[''] * len(sequence)]
        insertion_evidence[name] = [[''] * len(sequence)]

    for working_dir in working_dirs:
        for name in substitution_calls:
            filename = os.path.join(working_dir, grace.filesystem_friendly_name(name) + '-evidence.txt')

            this_substitution_calls = []
            this_insertion_calls = []
            this_substitution_evidence = []
            this_insertion_evidence = []

            # with-block: the evidence file is closed on every path,
            # including the early return for an old-style file.
            with open(filename, 'rb') as f:
                header = f.readline()
                if header.count('\t') != 5:
                    print >> sys.stderr, 'Old style evidence file. Please re-run nesoni consensus.'
                    return 1

                for line in f:
                    fields = line.rstrip('\n').split('\t')
                    this_substitution_calls.append(fields[5])
                    this_insertion_calls.append(fields[4])
                    this_substitution_evidence.append(fields[2])
                    this_insertion_evidence.append(fields[1])

            substitution_calls[name].append(this_substitution_calls)
            insertion_calls[name].append(this_insertion_calls)
            substitution_evidence[name].append(this_substitution_evidence)
            insertion_evidence[name].append(this_insertion_evidence)

    if not use_reference:
        names.pop(0)
        for name in substitution_calls:
            substitution_calls[name].pop(0)
            insertion_calls[name].pop(0)
            substitution_evidence[name].pop(0)
            insertion_evidence[name].pop(0)

    interesting = find_interesting('substitution', substitution_calls, substitution_evidence)
    if use_indels:
        interesting.extend(find_interesting('insertion-before', insertion_calls, insertion_evidence))

    if not use_indels:
        interesting = [item for item in interesting if '-' not in item[3]]

    interesting.sort()

    if fasta_output:
        do_fasta_output(names, interesting)
        return 0

    total_differences = 0

    if make_list:
        print('\t'.join(['Partition', 'Sequence', 'Position in reference', 'Change type'] + names + names))

    # Each odd i below the all-ones mask is a bit set that contains names[0]
    # and leaves at least one name out, so every two-way split of the names
    # is visited exactly once.
    for i in xrange(1, (1 << len(names)) - 1, 2):
        set1 = []
        set2 = []
        for j in xrange(len(names)):
            if i & (1 << j):
                set1.append(j)
            else:
                set2.append(j)

        partition_title = (', '.join(names[k] for k in set1) + ' vs ' +
                           ', '.join(names[k] for k in set2))

        if make_list:
            print('')
            print(partition_title)
            print('')

        n = 0
        for refname, position, change_type, values, has_ambiguous, evidence in interesting:
            # Skip if *any* ambiguity
            if has_ambiguous:
                continue

            # Every member of a group must agree with that group's first member.
            if any(values[k] != values[set1[0]] for k in set1[1:]) or \
               any(values[k] != values[set2[0]] for k in set2[1:]):
                continue

            if make_list:
                if change_type == 'substitution' and '-' in values:
                    change_type = 'deletion'
                print('\t%s\t%d\t%s\t' % (refname, position, change_type) + '\t'.join(values) + '\t' + '\t'.join(evidence))

            n += 1

        total_differences += n

        if not make_list:
            print(partition_title + ': %d differences' % n)

    if not make_list:
        print('')
        print('Total: %d' % total_differences)

    if make_list:
        print('')
        print('Ignored')
        print('')

    n_multiway = 0
    n_ambiguous = 0
    for refname, position, change_type, values, has_ambiguous, evidence in interesting:
        confusing = False
        if has_ambiguous:
            n_ambiguous += 1
            confusing = True
        elif len(set(values)) > 2:
            n_multiway += 1
            confusing = True

        if make_list and confusing:
            print('\t%s\t%d\t%s\t' % (refname, position, change_type) + '\t'.join(values) + '\t' + '\t'.join(evidence))

    if not make_list:
        print('')
        print('Ambiguities ignored: %d' % n_ambiguous)
        print('Multi-way changes ignored: %d' % n_multiway)

    # Every interesting position is either counted once or ignored.
    assert total_differences + n_ambiguous + n_multiway == len(interesting)

    return 0
def main(args):
    """
    Command-line entry point: report significant differences in evidence.

    args holds the optional --title1 and --title2 options followed by
    exactly three positional arguments: two working directories and a
    significance cutoff. With any other number of positional arguments,
    USAGE is printed to stderr and 1 is returned.

    The evidence files of both working directories are read in lockstep for
    every sequence in the first directory's reference.fa. A tab-separated
    row is printed to stdout for each position whose insertion or
    substitution evidence differs with significance <= cutoff.

    Returns 0 on completion.
    """
    title1, args = grace.get_option_value(args, "--title1", str, None)
    title2, args = grace.get_option_value(args, "--title2", str, None)
    grace.expect_no_further_options(args)

    if len(args) != 3:
        print >> sys.stderr, USAGE
        return 1

    working_dir1 = args[0]
    working_dir2 = args[1]
    cutoff = float(args[2])

    sequence_names = [
        name
        for name, sequence
        in io.read_sequences(os.path.join(working_dir1, "reference.fa"))
    ]

    if title1 is None:
        title1 = working_dir1
    if title2 is None:
        title2 = working_dir2

    # Smallest depth at which n reads of A against n reads of T reaches the
    # cutoff.
    n = 1
    while significance([("A", n)], [("T", n)], 1.0) > cutoff:
        n += 1

    print("%g\tsignificance cutoff" % cutoff)
    print("%d\tdepth required to call substitution (greater if there are errors in the reads)" % n)

    print("Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s" % (title1, title2, title1, title2))

    for sequence_name in sequence_names:
        friendly_name = grace.filesystem_friendly_name(sequence_name)
        filename1 = os.path.join(working_dir1, friendly_name + "-evidence.txt")
        filename2 = os.path.join(working_dir2, friendly_name + "-evidence.txt")

        for (pos1, ins1, sub1, ref1, conins1, consub1), (pos2, ins2, sub2, ref2, conins2, consub2) in itertools.izip(read_file(filename1), read_file(filename2)):
            assert pos1 == pos2 and ref1 == ref2

            if pos1 % 1000 == 0:
                grace.status("Testing %s %d" % (sequence_name, pos1))

            dec_ins1 = io.decode_evidence(ins1)
            dec_ins2 = io.decode_evidence(ins2)
            if dec_ins1 and dec_ins2:
                # Reuse the decoded evidence rather than decoding it again,
                # as the substitution branch below already does.
                sig = significance(dec_ins1, dec_ins2, cutoff)
                if sig is not None and sig <= cutoff:
                    grace.status("")
                    print("%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s" % (sequence_name, pos1, "insertion-before", ins1, ins2, sig, conins1, conins2))

            dec_sub1 = io.decode_evidence(sub1)
            dec_sub2 = io.decode_evidence(sub2)
            if dec_sub1 and dec_sub2:
                sig = significance(dec_sub1, dec_sub2, cutoff)
                if sig is not None and sig <= cutoff:
                    # Classify by the first entry of each decoded evidence list.
                    if dec_sub1[0][0] == "-" or dec_sub2[0][0] == "-":
                        what = "deletion"
                    elif dec_sub1[0][0] != dec_sub2[0][0]:
                        what = "substitution"
                    else:
                        what = "different mix"
                    grace.status("")
                    print("%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s" % (sequence_name, pos1, what, ref1, sub1, sub2, sig, consub1, consub2))

    grace.status("")
    return 0
def run(self):
    """
    Write the core or unique parts of a reference into the workspace.

    Settings come from self.what ('core' or 'unique'), self.mincov,
    self.maxdiff, self.minsize and self.working_dirs; an AssertionError is
    raised for any other value of self.what.

    For each sequence of the first working directory's reference, a position
    is "good" if, in every working directory, its depth is at least mincov
    (core, read from '-depth.userplot') or below mincov (unique, read from
    '-ambiguous-depth.userplot'). Runs of at most maxdiff non-good positions
    between two good positions are then marked good as well.

    Four FASTA files are written per sequence: non-good bases replaced by N,
    non-good bases lower-cased, and the good and non-good runs of at least
    minsize bases as separate parts. Progress and totals go to self.log.
    """
    assert self.what in ('core', 'unique'), 'Expected --what to be either "core" or "unique".'
    is_core = (self.what == 'core')

    workspace = self.get_workspace()

    reference = working_directory.Working(self.working_dirs[0]).get_reference()
    for name, seq in io.read_sequences(reference.reference_fasta_filename()):
        self.log.log(name + '\n')
        friendly_name = grace.filesystem_friendly_name(name)

        good = [True] * len(seq)

        for working_dir in self.working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix))
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= self.mincov
                    else:
                        good[i] = data[i] < self.mincov

        # Close holes. start is the position just after the last good base;
        # its initial value prevents a leading hole from being closed.
        start = -self.maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= self.maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        self.log.log('Closed ' + grace.pretty_number(n_holes) + ' holes\n')

        # with-blocks make sure each output file is closed even on failure.
        with open(workspace / ('%s-%s.fa' % (friendly_name, self.what)), 'wb') as f:
            io.write_fasta(
                f, name,
                ''.join([(seq[i] if good[i] else 'N')
                         for i in xrange(len(seq))]))

        with open(workspace / ('%s-%s_masked.fa' % (friendly_name, self.what)), 'wb') as f:
            io.write_fasta(
                f, name,
                ''.join([(seq[i] if good[i] else seq[i].lower())
                         for i in xrange(len(seq))]))

        good_filename = workspace / ('%s-%s_parts.fa' % (friendly_name, self.what))
        nongood_filename = workspace / ('%s-non%s_parts.fa' % (friendly_name, self.what))
        with open(good_filename, 'wb') as f_good:
            with open(nongood_filename, 'wb') as f_nongood:
                start = 0
                # One-element lists so that emit() can update the counters.
                n_good = [0]
                n_good_bases = [0]

                def emit(i):
                    # Write the run seq[start:i], reading the enclosing
                    # start at call time; short runs are skipped.
                    if i - start < self.minsize:
                        return
                    if good[start]:
                        n_good[0] += 1
                        n_good_bases[0] += i - start
                    io.write_fasta(
                        f_good if good[start] else f_nongood,
                        '%s:%d..%d' % (name, start + 1, i),
                        seq[start:i])

                for i in xrange(1, len(seq)):
                    if good[i] != good[start]:
                        emit(i)
                        start = i
                emit(len(seq))

        self.log.log(grace.pretty_number(sum(good)) + ' bases are ' + self.what + ', of ' + grace.pretty_number(len(seq)) + ' in reference sequence\n')
        self.log.log(grace.pretty_number(n_good[0]) + ' parts at least ' + grace.pretty_number(self.minsize) + ' bases long with ' + grace.pretty_number(n_good_bases[0]) + ' total bases\n')
        self.log.log('\n')