def main(args):
    size, args = grace.get_option_value(args, '--size', int, 200)
    stride, args = grace.get_option_value(args, '--stride', int, 50)
    grace.expect_no_further_options(args)

    if not args:
        print USAGE
        return 1

    for filename in args:
        for name, seq in io.read_sequences(filename):
            name_parts = name.split(None, 1)
            name = name_parts[0]
            if len(name_parts) > 1:
                desc = ' ' + name_parts[1]
            else:
                desc = ''
            for i in xrange(-size + stride, len(seq), stride):
                start = max(0, min(len(seq), i))
                end = max(0, min(len(seq), i + size))
                io.write_fasta(sys.stdout,
                               '%s:%d..%d' % (name, start + 1, end) + desc,
                               seq[start:end])

    return 0
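# All of these snippets lean on nesoni's io.read_sequences and io.write_fasta
# helpers. Going only by how they are called here (a writable file, a record
# name, a sequence string; an iterator of (name, seq) pairs), a minimal
# stand-in could look roughly like the sketch below. This is an illustration
# of the assumed interface, not nesoni's actual implementation; the 70-column
# line wrapping is an assumption.

def write_fasta(f, name, seq, width=70):
    # One '>' header line, then the sequence wrapped to a fixed width (assumed).
    print >> f, '>' + name
    for i in xrange(0, len(seq), width):
        print >> f, seq[i:i+width]

def read_sequences(filename):
    # Yield (name, sequence) pairs from a plain FASTA file.
    name = None
    parts = [ ]
    for line in open(filename, 'rU'):
        line = line.rstrip()
        if line.startswith('>'):
            if name is not None:
                yield name, ''.join(parts)
            name = line[1:]
            parts = [ ]
        else:
            parts.append(line)
    if name is not None:
        yield name, ''.join(parts)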
def emit(i):
    if i - start < minsize:
        return
    if good[start]:
        n_good[0] += 1
        n_good_bases[0] += i - start
    io.write_fasta(f_good if good[start] else f_nongood,
                   '%s:%d..%d' % (name, start + 1, i),
                   seq[start:i])
def run(self):
    extractions = [ ]
    for item in self.genes.split(','):
        extraction = item.split('/')
        assert len(extraction) == 4
        extractions.append(extraction)

    rename = { }
    if self.rename:
        for item in self.rename.split(','):
            old, new = item.split('=')
            rename[old] = new

    work = self.get_workspace()

    with workspace.tempspace() as temp:
        items = list(annotation.read_annotations(self.annotation))
        for item in items:
            item.seqid = rename.get(item.seqid, item.seqid)
        annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
        del items

        with open(temp/'temp.fa', 'wb') as f:
            for name, seq in io.read_sequences(self.genome):
                name = name.split()[0]
                name = rename.get(name, name)
                io.write_fasta(f, name, seq)

        reference_directory.Make_tt_reference(
            self.output_dir,
            filenames = [ temp/'temp.fa', temp/'temp.gff' ],
            index = self.index,
            ).run()
def run(self):
    extractions = [ ]
    for item in self.genes.split(','):
        extraction = item.split('/')
        assert len(extraction) == 4
        extractions.append(extraction)

    rename = { }
    if self.rename:
        for item in self.rename.split(','):
            old, new = item.split('=')
            rename[old] = new

    work = self.get_workspace()

    with workspace.tempspace() as temp:
        items = list(annotation.read_annotations(self.annotation))
        for item in items:
            item.seqid = rename.get(item.seqid, item.seqid)
        annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
        del items

        with open(temp/'temp.fa', 'wb') as f:
            for name, seq in io.read_sequences(self.genome):
                name = name.split()[0]
                name = rename.get(name, name)
                io.write_fasta(f, name, seq)

        reference_directory.Make_tt_reference(
            self.output_dir,
            filenames = [ temp/'temp.fa', temp/'temp.gff' ] + self.extra,
            index = self.index,
            shrimp = self.shrimp,
            bowtie = self.bowtie,
            star = self.star,
            ).run()
def build_snpeff(self):
    jar = io.find_jar('snpEff.jar')

    with open(self/'snpeff.config', 'wb') as f:
        print >> f, 'data_dir = snpeff'
        print >> f, 'genomes : ' + self.name
        print >> f, self.name + '.genome : ' + self.name

    snpwork = io.Workspace(self/'snpeff', must_exist=False)
    snpwork_genome = io.Workspace(snpwork/self.name, must_exist=False)
    snpwork_genomes = io.Workspace(snpwork/'genomes', must_exist=False)

    annotations = self.annotations_filename()
    assert annotations
    with open(snpwork_genome/'genes.gff', 'wb') as f:
        for record in annotation.read_annotations(annotations):
            if record.end <= record.start:
                continue
            if not record.attr:
                record.attr['attributes'] = 'none'
            print >> f, record.as_gff()

    with open(snpwork_genomes/(self.name + '.fa'), 'wb') as f:
        for name, seq in io.read_sequences(self.reference_fasta_filename()):
            io.write_fasta(f, name, seq)

    io.execute('java -jar JAR build NAME -gff3 -c CONFIG',
               JAR=jar, NAME=self.name, CONFIG=self/'snpeff.config')
def run(self):
    workspace = self.get_workspace()

    reference = reference_directory.Reference(self.reference, must_exist=True)

    reader_f = io.open_possibly_compressed_file(self.vcf)
    reader = vcf.Reader(reader_f)
    variants = collections.defaultdict(list)
    for record in reader:
        variants[record.CHROM].append(record)
    reader_f.close()

    for chrom in variants:
        variants[chrom].sort(key=lambda item: item.POS)

    filenames = [ workspace/(item+'.fa') for item in reader.samples ]
    for filename in filenames:
        with open(filename, 'wb'):
            pass

    for name, seq in io.read_sequences(reference.reference_fasta_filename()):
        for i, sample in enumerate(reader.samples):
            revised = [ ]
            pos = 0
            for variant in variants[name]:
                gt = variant.samples[i].data.GT
                if gt is None:
                    continue
                assert gt.isdigit(), 'Unsupported genotype (can only use haploid genotypes): ' + gt
                gt_number = int(gt)
                if gt_number == 0:
                    var_seq = variant.REF
                else:
                    var_seq = str(variant.ALT[gt_number-1])
                    assert re.match('[ACGTN]*$', var_seq), 'Unsupported variant type: ' + var_seq
                new_pos = variant.POS - 1
                assert new_pos >= pos, 'Variants overlap.'
                revised.append(seq[pos:new_pos])
                pos = new_pos
                revised.append(var_seq)
                assert seq[pos:pos+len(variant.REF)].upper() == variant.REF, 'REF column in VCF does not match reference sequence'
                pos += len(variant.REF)
            revised.append(seq[pos:])

            with open(filenames[i], 'ab') as f:
                io.write_fasta(f, name, ''.join(revised))

        del variants[name]

    assert not variants, 'Chromosome names in VCF not in reference: ' + ' '.join(variants)
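# A note on the genotype check above: gt.isdigit() only accepts haploid calls,
# so diploid-style genotypes trip the assertion. A small, self-contained
# illustration of which GT strings pass (plain string logic, no VCF parsing):
for gt in ['0', '1', '2']:
    assert gt.isdigit()       # accepted: used as an index into REF/ALT
for gt in ['0/1', '1|1', '.']:
    assert not gt.isdigit()   # rejected by the assertion in run()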
def run(self):
    assert self.release
    assert self.species
    assert self.assembly
    assert self.dna

    extractions = [ ]
    for item in self.genes.split(','):
        extraction = item.split('/')
        assert len(extraction) == 4
        extractions.append(extraction)

    rename = { }
    if self.rename:
        for item in self.rename.split(','):
            old, new = item.split('=')
            rename[old] = new

    work = self.get_workspace()
    ensembl = workspace.Workspace(work/'ensembl')

    genome_filename = self.species+"."+self.assembly+"."+self.dna+".fa.gz"
    genome_url = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release+"/fasta/"+self.species.lower()+"/dna/"+genome_filename

    gff_filename = self.species+"."+self.assembly+"."+self.release+".gff3.gz"
    gff_url = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release+"/gff3/"+self.species.lower()+"/"+gff_filename

    if self.download:
        self.log.log("Fetching "+genome_url+"\n")
        io.execute(['rsync','-aP',genome_url, ensembl/genome_filename])
        self.log.log("Fetching "+gff_url+"\n")
        io.execute(['rsync','-aP',gff_url, ensembl/gff_filename])

    with workspace.tempspace() as temp:
        items = list(annotation.read_annotations(ensembl/gff_filename))
        for item in items:
            item.seqid = rename.get(item.seqid, item.seqid)
        annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
        del items

        with open(temp/'temp.fa', 'wb') as f:
            for name, seq in io.read_sequences(ensembl/genome_filename):
                name = name.split()[0]
                name = rename.get(name, name)
                io.write_fasta(f, name, seq)

        reference_directory.Make_tt_reference(
            self.output_dir,
            filenames = [ temp/'temp.fa', temp/'temp.gff' ],
            index = self.index,
            ).run()
def do_fasta_output(names, interesting):
    for i, name in enumerate(names):
        sequence = [ ]
        #for refname in substitution_calls:
        #    sequence.extend(substitution_calls[refname][i])
        #for i in xrange(len(sequence)):
        #    if sequence[i] not in 'ACGT':
        #        sequence[i] = 'N'
        for refname, position, change_type, values, has_ambiguous, evidence in interesting:
            if not has_ambiguous and change_type == 'substitution':
                sequence.append(values[i])
        io.write_fasta(sys.stdout,
                       name + ' variable sites with consensus in all',
                       ''.join(sequence))
def run(self):
    f = self.begin_output()

    for filename in self.filenames:
        for name, seq in io.read_sequences(filename):
            name_parts = name.split(None, 1)
            name = name_parts[0]
            for i in xrange(-self.size + self.stride, len(seq), self.stride):
                start = max(0, min(len(seq), i))
                end = max(0, min(len(seq), i + self.size))
                shred_name = '%s:%d..%d' % (name, start + 1, end)
                shred_seq = seq
                if self.quality:
                    io.write_fastq(f, shred_name, seq[start:end], chr(33 + self.quality) * (end - start))
                else:
                    io.write_fasta(f, shred_name, seq[start:end])

    self.end_output(f)
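# The constant quality string written by the FASTQ branch above uses Phred+33
# encoding: a per-base quality Q is written as the character chr(33 + Q),
# repeated once per shredded base. A tiny check of that arithmetic:
quality = 30
assert chr(33 + quality) == '?'          # Phred 30 encodes as '?'
assert chr(33 + quality) * 5 == '?????'  # one character per base in the shred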
def set_sequences(self, filenames):
    reference_genbank_filename = self / 'reference.gbk'
    reference_filename = self / 'reference.fa'

    reference_genbank_file = open(reference_genbank_filename, 'wb')
    any_genbank = [ False ]

    def genbank_callback(name, record):
        """ Make a copy of any genbank files passed in. """
        from Bio import SeqIO

        SeqIO.write([record], reference_genbank_file, 'genbank')

        f = open(self / (grace.filesystem_friendly_name(name) + '.gbk'), 'wb')
        SeqIO.write([record], f, 'genbank')
        f.close()

        any_genbank[0] = True

    lengths = [ ]
    seen = set()
    f = open(reference_filename, 'wb')
    for filename in filenames:
        for name, seq in io.read_sequences(filename, genbank_callback=genbank_callback):
            name = name.split()[0]
            assert name not in seen, 'Duplicate chromosome name: ' + name
            seen.add(name)
            lengths.append( (name, len(seq)) )
            io.write_fasta(f, name, seq)
    f.close()

    self.set_object(lengths, 'reference-lengths.pickle.gz')

    reference_genbank_file.close()
    if not any_genbank[0]:
        os.unlink(reference_genbank_filename)

    # Create an index of the reference sequences for samtools
    io.execute([ 'samtools', 'faidx', reference_filename ])
def main(args):
    mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
    is_core = (what == 'core')

    grace.expect_no_further_options(args)

    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()

    output_dir, working_dirs = args[0], args[1:]

    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'

    if not path.exists(output_dir):
        os.mkdir(output_dir)

    for name, seq in io.read_sequences(path.join(working_dirs[0], 'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)

        good = [ True ] * len(seq)

        for working_dir in working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix))
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= mincov
                    else:
                        good[i] = data[i] < mincov

        #Close holes
        start = -maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        print 'Closed', grace.pretty_number(n_holes), 'holes'

        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name, what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else 'N') for i in xrange(len(seq)) ]))
        f.close()

        f = open(path.join(output_dir, '%s-%s_masked.fa' % (friendly_name, what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else seq[i].lower()) for i in xrange(len(seq)) ]))
        f.close()

        f_good = open(path.join(output_dir, '%s-%s_parts.fa' % (friendly_name, what)), 'wb')
        f_nongood = open(path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name, what)), 'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]

        def emit(i):
            if i - start < minsize:
                return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i - start
            io.write_fasta(f_good if good[start] else f_nongood,
                           '%s:%d..%d' % (name, start + 1, i),
                           seq[start:i])

        for i in xrange(1, len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()

        print grace.pretty_number(sum(good)), 'bases are ' + what + ', of', grace.pretty_number(len(seq)), 'in reference sequence'
        print grace.pretty_number(n_good[0]), 'parts at least', grace.pretty_number(minsize), 'bases long with', grace.pretty_number(n_good_bases[0]), 'total bases'
        print
# python2.6 modify_sequence.py data/velvet_test_reference.fa >data/velvet_test_reference_modified.fa

import sys, random

from nesoni import io

for name, seq in io.read_sequences(sys.argv[1]):
    j = 0
    for i in xrange(0, len(seq) - 100, 100):
        original = seq[i]
        if j % 3 == 0:
            while True:
                new = random.choice('ACGT')
                if new != original:
                    break
            seq = seq[:i] + new + seq[i+1:]
        elif j % 3 == 1:
            n = (j // 3) % 9 + 1
            seq = seq[:i] + seq[i+n:]
        else:
            n = (j // 3) % 9 + 1
            seq = seq[:i] + ''.join( random.choice('ACGT') for k in xrange(n) ) + seq[i:]
        j += 1

    io.write_fasta(sys.stdout, name, seq)
def run(self):
    #mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    #minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
    #is_core = (what == 'core')
    #
    #grace.expect_no_further_options(args)
    #
    #if len(args) < 2:
    #    print >> sys.stderr, HELP
    #    raise grace.Help_shown()
    #
    #output_dir, working_dirs = args[0], args[1:]
    #
    ##assert not path.exists(path.join(output_dir, 'reference.fa')), \
    #assert not path.exists(path.join(output_dir, 'parameters')), \
    #    'Output directory not given'
    #
    #if not path.exists(output_dir):
    #    os.mkdir(output_dir)

    assert self.what in ('core', 'unique'), 'Expected --what to be either "core" or "unique".'
    is_core = (self.what == 'core')

    workspace = self.get_workspace()

    for name, seq in io.read_sequences(
            working_directory.Working(self.working_dirs[0]).get_reference().reference_fasta_filename()):
        self.log.log(name + '\n')
        friendly_name = grace.filesystem_friendly_name(name)

        good = [ True ] * len(seq)

        for working_dir in self.working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix))
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= self.mincov
                    else:
                        good[i] = data[i] < self.mincov

        #Close holes
        start = -self.maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= self.maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        self.log.log('Closed ' + grace.pretty_number(n_holes) + ' holes\n')

        f = open(workspace / ('%s-%s.fa' % (friendly_name, self.what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else 'N') for i in xrange(len(seq)) ]))
        f.close()

        f = open(workspace / ('%s-%s_masked.fa' % (friendly_name, self.what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else seq[i].lower()) for i in xrange(len(seq)) ]))
        f.close()

        f_good = open(workspace / ('%s-%s_parts.fa' % (friendly_name, self.what)), 'wb')
        f_nongood = open(workspace / ('%s-non%s_parts.fa' % (friendly_name, self.what)), 'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]

        def emit(i):
            if i - start < self.minsize:
                return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i - start
            io.write_fasta(f_good if good[start] else f_nongood,
                           '%s:%d..%d' % (name, start + 1, i),
                           seq[start:i])

        for i in xrange(1, len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()

        self.log.log(grace.pretty_number(sum(good)) + ' bases are ' + self.what + ', of ' + grace.pretty_number(len(seq)) + ' in reference sequence\n')
        self.log.log(grace.pretty_number(n_good[0]) + ' parts at least ' + grace.pretty_number(self.minsize) + ' bases long with ' + grace.pretty_number(n_good_bases[0]) + ' total bases\n')
        self.log.log('\n')
def pastiche(args):
    if len(args) < 4:
        print USAGE
        return 1

    mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool, False)
    min_leftover, args = grace.get_option_value(args, '--min-leftover', int, 20)

    output_dir, args = args[0], args[1:]

    #, ref_filename, contig_filenames = args[0], args[1], args[2:]

    ref_filenames = [ ]
    contig_filenames = [ ]
    grace.execute(args,
                  {'contigs': lambda args: contig_filenames.extend(args)},
                  lambda args: ref_filenames.extend(args))

    assert ref_filenames, 'No reference sequences given'
    assert contig_filenames, 'No contig sequences given'

    contigs = dict([ (name.split()[0], seq)
                     for filename in contig_filenames
                     for name, seq in io.read_sequences(filename) ])
    dir_contigs = { }
    for name in contigs:
        dir_contigs[name + '+'] = contigs[name]
        dir_contigs[name + '-'] = bio.reverse_complement(contigs[name])

    dir_contigs_used = { }
    for name in dir_contigs:
        dir_contigs_used[name] = [ False ] * len(dir_contigs[name])

    workspace = io.Workspace(output_dir)
    temp_prefix = workspace._object_filename('temp-pastiche')

    out_f = workspace.open('pastiche.fa', 'wb')

    for ref_filename in ref_filenames:
        for ref_name, ref_seq in io.read_sequences(ref_filename):
            ref_name = ref_name.split()[0]

            grace.status(ref_name)

            f = open(temp_prefix + '.fa', 'wb')
            io.write_fasta(f, 'ref', ref_seq)
            f.close()

            scores = [ -1 ] * (len(ref_seq) * 2)
            strings = [ 'N', '' ] * (len(ref_seq))
            contexts = [ None for i in xrange(len(ref_seq) * 2) ]

            #MAXSCORE = len(ref_seq)+1
            #for i in xrange(len(ref_seq)):
            #    if ref_seq[i].upper() != 'N':
            #        strings[i*2] = ref_seq[i]
            #        scores[i*2] = MAXSCORE
            #for i in xrange(len(ref_seq)-1):
            #    if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N':
            #        scores[i*2+1] = MAXSCORE

            if mask_only:
                for i in xrange(len(ref_seq)):
                    strings[i * 2] = ref_seq[i].lower()

            def put(position, dir_contig_name, start, end, score):
                if scores[position] < score:
                    scores[position] = score
                    strings[position] = dir_contigs[dir_contig_name][start:end]
                    contexts[position] = (dir_contig_name, start, end, score)

            for contig_filename in contig_filenames:
                execute(['nucmer',
                         '--prefix', temp_prefix,
                         #'--maxmatch', #Very slow
                         '--nosimplify',
                         '--minmatch', '9',
                         '--mincluster', '50',
                         #'--maxgap', '1000',
                         #'--breaklen', '1000', # Increasing this reduces Ns, but is slow
                         #'--diagfactor', '1.0',
                         temp_prefix + '.fa',
                         contig_filename])

                for contig_name, contig_seq in io.read_sequences(contig_filename):
                    contig_name = contig_name.split()[0]
                    grace.status(ref_name + ' vs ' + contig_name)
                    p = run(['show-aligns', temp_prefix + '.delta', 'ref', contig_name],
                            stderr=subprocess.PIPE)

                    alignments = [ ]

                    while True:
                        line = p.stdout.readline()
                        if not line:
                            break
                        if not line.startswith('-- BEGIN'):
                            continue

                        parts = line.split()

                        ref_start = int(parts[5])
                        ref_end = int(parts[7])
                        query_start = int(parts[10])
                        query_end = int(parts[12])

                        #assert ref_start < ref_end
                        #ref_start -= 1 #Zero based coordinates

                        al_ref = [ ]
                        al_query = [ ]

                        while True:
                            block = [ ]
                            end = False
                            while True:
                                line = p.stdout.readline()
                                if line.startswith('-- END'):
                                    end = True
                                    break
                                if line == '\n':
                                    if block:
                                        break
                                    else:
                                        continue
                                block.append(line)

                            if end:
                                break

                            al_ref.append(block[0].split()[1])
                            al_query.append(block[1].split()[1])

                        al_ref = ''.join(al_ref)
                        al_query = ''.join(al_query)

                        if ref_start > ref_end:
                            al_ref = bio.reverse_complement(al_ref)
                            al_query = bio.reverse_complement(al_query)
                            ref_start, ref_end = ref_end, ref_start
                            query_start, query_end = query_end, query_start

                        if query_start > query_end:
                            dir_contig_name = contig_name + '-'
                            query_start = len(contig_seq) + 1 - query_start
                            query_end = len(contig_seq) + 1 - query_end
                        else:
                            dir_contig_name = contig_name + '+'

                        ref_start -= 1 #Zero based coordinates
                        query_start -= 1

                        #print al_ref
                        #print al_query

                        #Pretty dumb scoring scheme
                        al_score = 0
                        for i in xrange(len(al_ref)):
                            if al_ref[i] == al_query[i]:
                                al_score += 1
                            #else:
                            #    al_score -= 1

                        #Pastiche alignment over reference
                        ref_pos = ref_start
                        query_pos = query_start
                        al_pos = 0
                        while al_pos < len(al_ref):
                            assert al_ref[al_pos] != '.'
                            if al_query[al_pos] == '.':
                                put(ref_pos * 2, dir_contig_name, query_pos, query_pos, al_score)
                            else:
                                assert al_query[al_pos].lower() == dir_contigs[dir_contig_name][query_pos].lower()
                                put(ref_pos * 2, dir_contig_name, query_pos, query_pos + 1, al_score)
                                query_pos += 1
                            al_pos += 1

                            al_pos_end = al_pos
                            query_pos_end = query_pos
                            while al_pos_end < len(al_ref) and al_ref[al_pos_end] == '.':
                                al_pos_end += 1
                                query_pos_end += 1
                            #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score)
                            assert al_query[al_pos:al_pos_end].lower() == dir_contigs[dir_contig_name][query_pos:query_pos_end].lower()
                            put(ref_pos * 2 + 1, dir_contig_name, query_pos, query_pos_end, al_score)
                            al_pos = al_pos_end
                            query_pos = query_pos_end

                            ref_pos += 1

                    p.wait()

            grace.status(ref_name)

            result = ''.join(strings)
            io.write_fasta(out_f, ref_name, result)

            for context in contexts:
                if context is None:
                    continue
                name, start, end, score = context
                for i in xrange(start, end):
                    dir_contigs_used[name][i] = True

            #Interpolation
            #result = [ ]
            #i = 0
            #while i < len(ref_seq):
            #    if strings[i*2].upper() != 'N':
            #        result.append(strings[i*2])
            #        result.append(strings[i*2+1])
            #        i += 1
            #        continue
            #
            #    j = i
            #    while strings[j*2].upper() == 'N':
            #        j += 1
            #
            #    grace.status('')
            #    print >> sys.stderr, 'interpolating', i+1,'..',j
            #
            #    window = 20 #!!!!!!!!!!!
            #    left_contexts = collections.defaultdict(lambda:0)
            #    for i1 in xrange(max(0,i-window),i):
            #        for context_name, context_start, context_end, context_score in contexts[i1*2]:
            #            key = (context_name, context_end + i - i1)
            #            left_contexts[key] = max(left_contexts[key],context_score)
            #
            #    right_contexts = collections.defaultdict(lambda:0)
            #    for j1 in xrange(j,min(j+window,len(ref_seq))):
            #        for context_name, context_start, context_end, context_score in contexts[j1*2]:
            #            key = (context_name, context_start + j - j1)
            #            right_contexts[key] = max(left_contexts[key],context_score)
            #
            #    #print >> sys.stderr, left_contexts
            #    #print >> sys.stderr, right_contexts
            #
            #    options = [ ]
            #
            #    for (left_name, left_pos), left_score in left_contexts.items():
            #        for (right_name, right_pos), right_score in right_contexts.items():
            #            if left_name != right_name: continue
            #            if right_pos < left_pos: continue
            #
            #            if right_pos-left_pos > (j-i) * 4.0 + 10: continue #!!!!!!!!!!!!!!!!!!!!!!1
            #            if right_pos-left_pos < (j-i) * 0.25 - 10: continue
            #
            #            score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i)
            #            score *= left_score + right_score
            #            #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score
            #            options.append( (score, left_name, left_pos, right_pos) )
            #
            #    if options:
            #        best = max(options, key=lambda option: option[0])
            #        print >> sys.stderr, '->', best
            #        result.append( dir_contigs[best[1]][best[2]:best[3]].lower() )
            #    else:
            #        print >> sys.stderr, '-> no good interpolation'
            #        result.append( ref_seq[i:j] )
            #
            #    i = j
            #
            #result = ''.join(result)
            #io.write_fasta(sys.stdout, ref_name, result)

            #print >> sys.stderr, len(result), result.count('N')
            #for pos, size in N_runs:
            #    out_size = len(''.join( strings[pos*2:pos*2+2] ))
            #    print >> sys.stderr, pos, size, '->', out_size

    out_f.close()

    grace.status('')

    #for name, seq in io.read_sequences(ref_filename):
    #    result = pastiche(seq, contigs_filename)
    #    io.write_fasta(sys.stdout, name, result)

    leftover_f = workspace.open('leftovers.fa', 'wb')

    for name in sorted(contigs):
        used = [ (a or b) for a, b in zip(dir_contigs_used[name + '+'], dir_contigs_used[name + '-'][::-1]) ]

        i = 0
        while i < len(used):
            j = i
            while j < len(used) and not used[j]:
                j += 1
            if j - i > min_leftover:
                if i == 0 and j == len(used):
                    out_name = name
                else:
                    out_name = name + ':%d..%d' % (i + 1, j)
                io.write_fasta(leftover_f, out_name, contigs[name][i:j])
            i = j + 1

    leftover_f.close()

    for suffix in ['.fa', '.delta']:
        os.unlink(temp_prefix + suffix)
def fill_scaffolds(args):
    max_filler_length, args = grace.get_option_value(args, '--max-filler', int, 4000)

    if len(args) < 2:
        print USAGE
        return 1

    (output_dir, graph_dir), args = args[:2], args[2:]

    scaffolds = [ ]

    def scaffold(args):
        circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)

        scaffold = [ ]
        for item in args:
            scaffold.append( ('contig', int(item)) )
            scaffold.append( ('gap', None) )

        if not circular:
            scaffold = scaffold[:-1]

        name = 'custom_scaffold_%d' % (len(scaffolds)+1)
        scaffolds.append( (name, scaffold) )

    grace.execute(args, [scaffold])

    custom_scaffolds = (len(scaffolds) != 0)

    sequences = dict(
        (a.split()[0], b.upper())
        for a, b in io.read_sequences(os.path.join(graph_dir, '454AllContigs.fna')))

    sequence_names = sorted(sequences)
    sequence_ids = dict(zip(sequence_names, xrange(1, len(sequence_names)+1)))

    contexts = { }
    context_names = { }
    context_depths = { }
    for i in xrange(1, len(sequence_names)+1):
        seq = sequences[sequence_names[i-1]]
        contexts[ i ] = seq
        context_names[ i ] = sequence_names[i-1] + '-fwd'
        contexts[ -i ] = bio.reverse_complement(seq)
        context_names[ -i ] = sequence_names[i-1] + '-rev'

    links = collections.defaultdict(list)

    for line in open(os.path.join(graph_dir, '454ContigGraph.txt'), 'rU'):
        parts = line.rstrip('\n').split('\t')

        if parts[0].isdigit():
            seq = sequence_ids[parts[1]]
            context_depths[ seq] = float(parts[3])
            context_depths[-seq] = float(parts[3])

        if parts[0] == 'C':
            name1 = 'contig%05d' % int(parts[1])
            dir1 = {"3'" : 1, "5'" : -1 }[parts[2]]
            name2 = 'contig%05d' % int(parts[3])
            dir2 = {"5'" : 1, "3'" : -1 }[parts[4]]
            depth = int(parts[5])
            #print name1, dir1, name2, dir2, depth

            links[ sequence_ids[name1] * dir1 ].append( (depth, sequence_ids[name2] * dir2) )
            links[ sequence_ids[name2] * -dir2 ].append( (depth, sequence_ids[name1] * -dir1) )

        if parts[0] == 'S' and not custom_scaffolds:
            name = 'scaffold%05d' % int(parts[2])
            components = parts[3].split(';')
            scaffold = [ ]
            for component in components:
                a, b = component.split(':')
                if a == 'gap':
                    scaffold.append( ('gap', int(b)) )
                else:
                    strand = { '+': +1, '-': -1 }[ b ]
                    scaffold.append( ('contig', sequence_ids['contig%05d' % int(a)] * strand) )
            scaffolds.append( (name, scaffold) )

    #paths = { }
    #
    #todo = [ ]
    #for i in contexts:
    #    for depth_left, neg_left in links[-i]:
    #        left = -neg_left
    #        for depth_right, right in links[i]:
    #            todo.append( ( max(-depth_left,-depth_right,-context_depths[i]), left, right, (i,)) )
    #
    #heapq.heapify(todo)
    #while todo:
    #    score, source, dest, path = heapq.heappop(todo)
    #    if (source,dest) in paths: continue
    #
    #    paths[(source,dest)] = path
    #
    #    if len(contexts[dest]) > max_filler_length: continue
    #
    #    for depth, next in links[dest]:
    #        heapq.heappush(todo,
    #            ( max(score,-depth,-context_depths[dest]), source, next, path+(dest,))
    #        )

    path_source_dest = collections.defaultdict(dict) # source -> dest -> next
    path_dest_source = collections.defaultdict(dict) # dest -> source -> next

    # Use links, in order of depth of coverage, to construct paths between contigs
    # Thus: paths have maximum minimum depth
    #       subsections of paths also have this property

    todo = [ ]
    for i in contexts:
        for depth_link, right in links[i]:
            todo.append( (depth_link, i, right) )
    todo.sort(reverse=True)
    for score, left, right in todo:
        if right in path_source_dest[left]:
            continue

        sources = [ (left, right) ]
        if len(contexts[left]) <= max_filler_length:
            sources += path_dest_source[left].items()

        destinations = [ right ]
        if len(contexts[right]) <= max_filler_length:
            destinations += path_source_dest[right].keys()

        for source, next in sources:
            for dest in destinations:
                if dest in path_source_dest[source]:
                    continue
                path_source_dest[source][dest] = next
                path_dest_source[dest][source] = next

    workspace = io.Workspace(output_dir)
    scaffold_f = workspace.open('scaffolds.fa', 'wb')

    #comments = [ ]
    features = [ ]

    used = set()
    previous_total = 0

    for i, (name, scaffold) in enumerate(scaffolds):
        result = '' # Inefficient. Meh.
        n_filled = 0
        n_failed = 0
        for j, item in enumerate(scaffold):
            if item[0] == 'contig':
                result += contexts[item[1]]
                used.add(abs(item[1]))
            else:
                left = scaffold[j-1]
                right = scaffold[ (j+1) % len(scaffold) ] #If gap at end, assume circular
                assert left[0] == 'contig'
                assert right[0] == 'contig'

                gap_start = len(result)

                can_fill = right[1] in path_source_dest[left[1]]
                if can_fill:
                    n = 0
                    k = path_source_dest[left[1]][right[1]]
                    while k != right[1]:
                        n += len(contexts[k])
                        result += contexts[k].lower()
                        used.add(abs(k))
                        k = path_source_dest[k][right[1]]

                    n_filled += 1

                    if item[1] is not None and max(n, item[1]) > min(n, item[1]) * 4:
                        print >> sys.stderr, 'Warning: gap size changed from %d to %d in scaffold %d' % (item[1], n, i+1)
                else:
                    n_failed += 1

                    #print >> sys.stderr, 'Warning: No path to fill a gap in scaffold %d' % (i+1)
                    result += 'n' * (9 if item[1] is None else item[1])

                gap_end = len(result)

                #features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                #    'all-scaffolds',
                #    'fill-scaffolds',
                #    'gap',
                #    previous_total + gap_start+1,
                #    previous_total + max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                #    '.', #score
                #    '+', #strand
                #    '.', #frame
                #    '' #properties
                #))
                features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                    name,
                    'fill-scaffolds',
                    'gap',
                    gap_start+1,
                    max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                    '.', #score
                    '+', #strand
                    '.', #frame
                    '' #properties
                ))

        io.write_fasta(scaffold_f, name, result)
        previous_total += len(result)
        #comments.append('##sequence-region %s %d %d' % (name, 1, len(result)))
        print >> sys.stderr, 'Scaffold%05d: %d gaps filled, %d could not be filled' % (i+1, n_filled, n_failed)

    scaffold_f.close()

    gff_f = workspace.open('scaffolds.gff', 'wb')
    #print >>gff_f, '##gff-version 3'
    #for comment in comments:
    #    print >>gff_f, comment
    for feature in features:
        print >>gff_f, feature
    gff_f.close()

    leftovers_f = workspace.open('leftovers.fa', 'wb')
    for name in sequence_names:
        if sequence_ids[name] not in used:
            io.write_fasta(leftovers_f, name, sequences[name])
    leftovers_f.close()

    ends = { }
    for i, (name, scaffold) in enumerate(scaffolds):
        if scaffold[-1][0] == 'gap':
            continue
        ends[ '%s start' % name ] = scaffold[-1][1]
        ends[ '%s end ' % name ] = -scaffold[0][1]

    for end1 in sorted(ends):
        options = [ end2 for end2 in ends if -ends[end2] in path_source_dest[ends[end1]] ]
        if len(options) == 1:
            print >> sys.stderr, 'Note: from', end1, 'only', options[0], 'is reachable'
def run(self):
    for filename in self.filenames:
        for name, seq in io.read_sequences(filename):
            if self.only and name.split()[0] not in self.only:
                continue
            io.write_fasta(sys.stdout, name, seq)
def run(self):
    workspace = self.get_workspace()

    read_length = 100
    left = rand_seq(read_length - 1)
    while True:
        flank = rand_seq(1)
        if flank != self.ref[:1]:
            break
    left += flank

    right = rand_seq(read_length - 1)
    while True:
        flank = rand_seq(1)
        if flank != self.ref[-1:]:
            break
    right = flank + right

    i = 0
    variants_used = [ ]

    with open(workspace/'reads.fq', 'wb') as f:
        for i, variant in enumerate(self.variants):
            if 'x' in variant:
                variant, count = variant.split('x')
                count = int(count)
            else:
                count = 10
            variants_used.append( (variant, count) )
            seq = left + variant + right
            for j in xrange(count):
                pos = len(variant) + random.randrange(read_length - len(variant))
                read = seq[pos:pos+read_length]
                if random.randrange(2):
                    read = bio.reverse_complement(read)
                i += 1
                io.write_fastq(f, 'read_%s_%d' % (variant, i), read, chr(64+30)*len(read))

    reference = left + self.ref + right
    primary_variant = left + variants_used[0][0] + right

    with open(workspace/'reference.fa', 'wb') as f:
        io.write_fasta(f, 'chr1', reference)

    legion.remake_needed()

    self.analysis(
        workspace/'sample',
        workspace/'reference.fa',
        reads = [ workspace/'reads.fq' ],
        ).run()

    self.freebayes(
        workspace/'freebayes',
        workspace/'sample',
        ).run()

    self.vcf_filter(
        workspace/'filtered',
        workspace/'freebayes.vcf',
        ).run()

    Vcf_patch(
        workspace/'patch',
        workspace/('sample','reference'),
        workspace/'filtered.vcf'
        ).run()

    patched = io.read_sequences(workspace/('patch','sample.fa')).next()[1]

    masked = io.read_sequences(workspace/('sample','consensus_masked.fa')).next()[1].upper()

    with open(workspace/'freebayes.vcf', 'rU') as f:
        reader = vcf.Reader(f)
        raw_count = len(list(reader))

    with open(workspace/'filtered.vcf', 'rU') as f:
        reader = vcf.Reader(f)
        filtered_count = len(list(vcf.Reader(open(workspace/'filtered.vcf','rU'))))

    with open(workspace/('sample','report.txt'), 'rb') as f:
        nesoni_count = len(f.readlines()) - 1

    self.log.log('\n')
    self.log.datum(workspace.name, 'changes found by "nesoni consensus:"', nesoni_count)
    self.log.datum(workspace.name, 'is correctly patched by "nesoni consensus:"', masked == primary_variant)
    self.log.log('\n')
    self.log.datum(workspace.name, 'raw variants', raw_count)
    self.log.datum(workspace.name, 'variants after filtering', filtered_count)
    self.log.datum(workspace.name, 'is correctly patched by VCF pipeline', patched == primary_variant)
    self.log.log('\n')
def main(args):
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()

    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')

    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000)

    input_reference_filenames = [ ]
    reads_filenames = [ ]

    shrimp_options = [ '-h', threshold ]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1])/100.0
    else:
        threshold = int(threshold)

    output_dir = [ ] #As list so can write to from function. Gah.

    def front_command(args):
        grace.expect_no_further_options(args)

        if len(args) < 1:
            return

        output_dir.append(args[0])
        input_reference_filenames.extend(
            [ os.path.abspath(filename) for filename in args[1:] ])

    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend([ [ os.path.abspath(filename) ] for filename in args ])

    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append([ os.path.abspath(filename) for filename in args ])

    def shrimp_options_command(args):
        shrimp_options.extend(args)

    grace.execute(args, {
        'reads': reads_command,
        '--reads': reads_command,
        'pairs': pairs_command,
        'shrimp-options': shrimp_options_command,
        '--shrimp-options': shrimp_options_command,
    }, front_command)

    if not output_dir:
        print >> sys.stderr, USAGE % n_cpus
        return 1

    output_dir = output_dir[0]

    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'

    for filename in itertools.chain(input_reference_filenames, *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'

    reference_filename = os.path.join(output_dir, 'reference.fa')
    reference_file = open(reference_filename, 'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            #Don't retain any comment
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)

            total_reference_sequences += 1
            total_reference_bases += len(sequence)
    reference_file.close()

    print '%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases),
        's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences),
        's' if total_reference_sequences != 1 else '')

    assert total_reference_bases, 'Reference sequence file is empty'

    config = {
        'references': input_reference_filenames,
        'reads': reads_filenames,
        'stride': stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()

    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')

    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')

    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)

    #warn_low_threshold = True

    try: #Cleanup temporary files

        N = [0]
        def do_shrimp(read_set):
            my_number = N[0]
            N[0] += 1

            tempname = os.path.join(output_dir, 'temp%d-%d.fa' % (os.getpid(), my_number))
            tempname_out = os.path.join(output_dir, 'temp%d-%d.txt' % (os.getpid(), my_number))

            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)

            f = open(tempname, 'wb')
            for read_name, read_seq in read_set:
                print >> f, '>' + read_name
                print >> f, read_seq
            f.close()

            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      tempname + ' ' + reference_filename + ' >' + tempname_out
            if not verbose:
                command += ' 2>/dev/null'
            #f = os.popen(command, 'r')
            child_pid = os.spawnl(os.P_NOWAIT, '/bin/sh', '/bin/sh', '-c', command)
            #print 'SHRiMP %d running' % my_number

            def finalize():
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'

                hits = { } # read_name -> [ hit line ]

                f = open(tempname_out, 'rb')
                for line in f:
                    if line.startswith('>'):
                        read_name = line.split(None, 1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = [ ]
                        hits[read_name].append(line)
                f.close()

                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        print >> unmapped_file, '>' + read_name
                        print >> unmapped_file, read_seq

                output_file.flush()
                unmapped_file.flush()

                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)
                #print 'SHRiMP %d finished' % my_number
            return finalize

        shrimps = [ ]

        reader = iter_reads(config)
        read_count = 0

        while True:
            read_set = [ ]
            read_set_bases = 0

            #Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs

            for read_name, read_seq in reader:
                read_name = read_name.split()[0]
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)

                #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match
                #    sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n')
                #    warn_low_threshold = False

                read_count += 1
                if read_set_bases >= batch_size:
                    break

            if not read_set:
                break

            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append( do_shrimp(read_set) )

            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))

        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps))
            shrimps.pop(0)()

        grace.status('')

        output_file.close()
        dirty_filenames.remove(output_filename)

        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)

        return 0

    finally:
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)