def run(self):
    f = self.begin_output()
    for filename in self.filenames:
        any = False
        name = os.path.splitext(os.path.split(filename)[1])[0]

        try:
            iterator = io.read_sequences(filename, qualities=True)
        except grace.Error:
            iterator = None

        if iterator is not None:
            total = 0
            total_length = 0
            for seq in io.read_sequences(filename, qualities=True):
                total += 1
                total_length += len(seq[1])
            print >> f, grace.datum(name, 'sequences', total)
            print >> f, grace.datum(name, 'average length', float(total_length) / total)
            print >> f
            any = True

        try:
            iterator = annotation.read_annotations(filename)
        except grace.Error:
            iterator = None

        if iterator:
            total = 0
            counts = {}
            for item in iterator:
                total += 1
                counts[item.type] = counts.get(item.type, 0) + 1
            print >> f, grace.datum(name, 'features', total)
            for key in sorted(counts):
                print >> f, grace.datum(name, key + ' features', counts[key])
            print >> f
            any = True

        if not any:
            raise grace.Error(
                filename + ' is neither a sequence file nor an annotation file that nesoni can read.')

    self.end_output(f)
def main(args):
    size, args = grace.get_option_value(args, '--size', int, 200)
    stride, args = grace.get_option_value(args, '--stride', int, 50)
    grace.expect_no_further_options(args)

    if not args:
        print USAGE
        return 1

    for filename in args:
        for name, seq in io.read_sequences(filename):
            name_parts = name.split(None, 1)
            name = name_parts[0]
            if len(name_parts) > 1:
                desc = ' ' + name_parts[1]
            else:
                desc = ''
            for i in xrange(-size + stride, len(seq), stride):
                start = max(0, min(len(seq), i))
                end = max(0, min(len(seq), i + size))
                io.write_fasta(sys.stdout, '%s:%d..%d' % (name, start + 1, end) + desc, seq[start:end])

    return 0
def run(self):
    extractions = [ ]
    for item in self.genes.split(','):
        extraction = item.split('/')
        assert len(extraction) == 4
        extractions.append(extraction)

    rename = { }
    if self.rename:
        for item in self.rename.split(','):
            old, new = item.split('=')
            rename[old] = new

    work = self.get_workspace()

    with workspace.tempspace() as temp:
        items = list(annotation.read_annotations(self.annotation))
        for item in items:
            item.seqid = rename.get(item.seqid, item.seqid)
        annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
        del items

        with open(temp/'temp.fa', 'wb') as f:
            for name, seq in io.read_sequences(self.genome):
                name = name.split()[0]
                name = rename.get(name, name)
                io.write_fasta(f, name, seq)

        reference_directory.Make_tt_reference(
            self.output_dir,
            filenames = [ temp/'temp.fa', temp/'temp.gff' ],
            index = self.index,
            ).run()
def make_file_for_primer_3(gff_file, ref_file, names_file, output_file):
    # Check for a tmp directory
    if len(glob.glob("./tmp")) == 0:
        call(["mkdir", "tmp"])

    gff_file = list(annotation.read_annotations(gff_file))
    print "\n Reading in the reference file"
    seq_dict = dict(io.read_sequences(ref_file))
    names_file = open(names_file).readlines()
    config = open("primer_config.txt").readlines()

    with open("tmp/regions_" + output_file, 'w') as out_f:
        for name in names_file:
            sname = name.strip("\n")
            found = False
            for line in gff_file:
                gff_name = line.attr.get("Name", "No_name")
                peak = line.attr.get("id", "No_id")
                if sname in gff_name.split("/"):
                    out_f.write("SEQUENCE_ID=" + gff_name.replace("/", "_") + "_" + peak + "\n")
                    out_f.write("SEQUENCE_TEMPLATE=" + line.shifted(-100, 0).get_seq(seq_dict) + "\n")
                    found = True
                    for cline in config:
                        out_f.write(cline.strip("\n") + "\n")
                    out_f.write("=" + "\n")
            if found == False:
                print "Could not find the gene " + sname + " in the gff file"
def build_snpeff(self):
    jar = io.find_jar('snpEff.jar')

    with open(self/'snpeff.config', 'wb') as f:
        print >> f, 'data_dir = snpeff'
        print >> f, 'genomes : ' + self.name
        print >> f, self.name + '.genome : ' + self.name

    snpwork = io.Workspace(self/'snpeff', must_exist=False)
    snpwork_genome = io.Workspace(snpwork/self.name, must_exist=False)
    snpwork_genomes = io.Workspace(snpwork/'genomes', must_exist=False)

    annotations = self.annotations_filename()
    assert annotations
    with open(snpwork_genome/'genes.gff', 'wb') as f:
        for record in annotation.read_annotations(annotations):
            if record.end <= record.start:
                continue
            if not record.attr:
                record.attr['attributes'] = 'none'
            print >> f, record.as_gff()

    with open(snpwork_genomes/(self.name + '.fa'), 'wb') as f:
        for name, seq in io.read_sequences(self.reference_fasta_filename()):
            io.write_fasta(f, name, seq)

    io.execute('java -jar JAR build NAME -gff3 -c CONFIG',
        JAR=jar, NAME=self.name, CONFIG=self/'snpeff.config')
def make_file_for_primer_3(gff_file, ref_file, names_file, output_file, start, end):
    # Check for a tmp directory
    if len(glob.glob("./tmp")) == 0:
        call(["mkdir", "tmp"])

    gff_file = list(annotation.read_annotations(gff_file))
    print "\nReading in the reference file\n"
    seq_dict = dict(io.read_sequences(ref_file))
    names_file = open(names_file).readlines()
    config = open("Software/primer_config.txt").readlines()

    with open("tmp/regions_" + output_file, 'w') as out_f:
        for name in names_file:
            sname = name.strip("\n ")
            found = False
            for line in gff_file:
                gff_name = line.attr.get("Name", "No_name")
                peak = line.attr.get("id", "No_id")
                if sname in gff_name.split("/"):
                    out_f.write("SEQUENCE_ID=" + gff_name.replace("/", "_") + "_" + peak + "\n")
                    # Move the peaks 30 bases proximal
                    out_f.write("SEQUENCE_TEMPLATE=" + line.shifted(start, end).get_seq(seq_dict) + "\n")
                    found = True
                    for cline in config:
                        out_f.write(cline.strip("\n") + "\n")
                    out_f.write("=" + "\n")
            if found == False:
                print "Could not find the gene " + sname + " in the reference gff file\n"
def run(self):
    extractions = [ ]
    for item in self.genes.split(','):
        extraction = item.split('/')
        assert len(extraction) == 4
        extractions.append(extraction)

    rename = { }
    if self.rename:
        for item in self.rename.split(','):
            old, new = item.split('=')
            rename[old] = new

    work = self.get_workspace()

    with workspace.tempspace() as temp:
        items = list(annotation.read_annotations(self.annotation))
        for item in items:
            item.seqid = rename.get(item.seqid, item.seqid)
        annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
        del items

        with open(temp/'temp.fa', 'wb') as f:
            for name, seq in io.read_sequences(self.genome):
                name = name.split()[0]
                name = rename.get(name, name)
                io.write_fasta(f, name, seq)

        reference_directory.Make_tt_reference(
            self.output_dir,
            filenames = [ temp/'temp.fa', temp/'temp.gff' ] + self.extra,
            index = self.index,
            shrimp = self.shrimp,
            bowtie = self.bowtie,
            star = self.star
            ).run()
def run(self):
    seqs = []
    seen = 0
    for filename in self.filenames:
        for seq in io.read_sequences(filename, qualities=True):
            seen += 1
            if seen % 100000 == 0:
                grace.status('Scanned ' + grace.pretty_number(seen))

            if len(seqs) < self.n:
                seqs.append(seq)
            elif self.n <= random.random() * seen:
                seqs[random.randrange(self.n)] = seq

    grace.status('')

    print >> sys.stderr, 'Sampled', grace.pretty_number(len(seqs)), 'of', grace.pretty_number(seen), 'sequences'

    if not seqs:
        return

    # FASTQ input yields (name, seq, qual) triples; FASTA input yields (name, seq) pairs
    qualities = len(seqs[0]) == 3
    if qualities:
        for name, seq, qual in seqs:
            io.write_fastq(sys.stdout, name, seq, qual)
    else:
        for name, seq in seqs:
            io.write_fasta(sys.stdout, name, seq)
def run(self):
    base = os.path.split(self.prefix)[1]

    annotations = [ ]
    sequences = [ ]
    for filename in self.filenames:
        any = False
        if io.is_sequence_file(filename):
            sequences.append(filename)
            any = True
        if annotation.is_annotation_file(filename):
            annotations.append(filename)
            any = True
        assert any, 'File is neither a recognized sequence or annotation file'

    cytoband_filename = os.path.join(self.prefix, base + '_cytoband.txt')
    property_filename = os.path.join(self.prefix, 'property.txt')
    gff_filename = os.path.join(self.prefix, base + '.gff')
    output_filenames = [ cytoband_filename, property_filename, gff_filename ]

    if not os.path.exists(self.prefix):
        os.mkdir(self.prefix)

    f = open(property_filename, 'wb')
    print >> f, 'ordered=true'
    print >> f, 'id=%s' % base
    print >> f, 'name=%s' % (self.name or base)
    print >> f, 'cytobandFile=%s_cytoband.txt' % base
    print >> f, 'geneFile=%s.gff' % base
    print >> f, 'sequenceLocation=%s' % base
    f.close()

    trivia.As_gff(
        output=gff_filename,
        filenames=annotations,
        exclude=[ 'gene', 'source' ]
        ).run()

    f_cyt = open(cytoband_filename, 'wb')
    for filename in sequences:
        for name, seq in io.read_sequences(filename):
            assert '/' not in name
            f = open(os.path.join(self.prefix, name + '.txt'), 'wb')
            f.write(seq)
            f.close()
            print >> f_cyt, '%s\t0\t%d' % (name, len(seq))
    f_cyt.close()

    genome_filename = self.prefix + '.genome'
    if os.path.exists(genome_filename):
        os.unlink(genome_filename)
    io.execute(
        ['zip', '-j', io.abspath(genome_filename)] +
        [ io.abspath(item) for item in output_filenames ]
        )
    for filename in output_filenames:
        if os.path.exists(filename):
            os.unlink(filename)
def get_lengths(self):
    # Legacy working directory
    if not self.object_exists('reference-lengths.pickle.gz'):
        lengths = [ ]
        for name, seq in io.read_sequences(self.reference_fasta_filename()):
            name = name.split()[0]
            lengths.append( (name, len(seq)) )
        self.set_object(lengths, 'reference-lengths.pickle.gz')
    return self.get_object('reference-lengths.pickle.gz')
def convert(filename):
    info = io.get_file_info(filename)
    ok = selection.matches('type-fastq:[compression-none/compression-gzip/compression-bzip2]', info)
    if ok:
        return filename
    result_name = tempname()
    with open(result_name, 'wb') as f:
        for name, seq, qual in io.read_sequences(filename, qualities='required'):
            io.write_fastq(f, name, seq, qual)
    return result_name
def run(self):
    workspace = self.get_workspace()

    reference = reference_directory.Reference(self.reference, must_exist=True)

    reader_f = io.open_possibly_compressed_file(self.vcf)
    reader = vcf.Reader(reader_f)
    variants = collections.defaultdict(list)
    for record in reader:
        variants[record.CHROM].append(record)
    reader_f.close()

    for chrom in variants:
        variants[chrom].sort(key=lambda item: item.POS)

    filenames = [ workspace / (item + '.fa') for item in reader.samples ]
    for filename in filenames:
        with open(filename, 'wb'):
            pass

    for name, seq in io.read_sequences(reference.reference_fasta_filename()):
        for i, sample in enumerate(reader.samples):
            revised = [ ]
            pos = 0
            for variant in variants[name]:
                gt = variant.samples[i].data.GT
                if gt is None:
                    continue
                assert gt.isdigit(), 'Unsupported genotype (can only use haploid genotypes): ' + gt
                gt_number = int(gt)
                if gt_number == 0:
                    var_seq = variant.REF
                else:
                    var_seq = str(variant.ALT[gt_number - 1])
                    assert re.match('[ACGTN]*$', var_seq), 'Unsupported variant type: ' + var_seq
                new_pos = variant.POS - 1
                assert new_pos >= pos, 'Variants overlap.'
                revised.append(seq[pos:new_pos])
                pos = new_pos
                revised.append(var_seq)
                assert seq[pos:pos + len(variant.REF)].upper() == variant.REF, 'REF column in VCF does not match reference sequence'
                pos += len(variant.REF)
            revised.append(seq[pos:])

            with open(filenames[i], 'ab') as f:
                io.write_fasta(f, name, ''.join(revised))

        del variants[name]

    assert not variants, 'Chromosome names in VCF not in reference: ' + ' '.join(variants)
def run(self):
    assert self.release
    assert self.species
    assert self.assembly
    assert self.dna

    extractions = [ ]
    for item in self.genes.split(','):
        extraction = item.split('/')
        assert len(extraction) == 4
        extractions.append(extraction)

    rename = { }
    if self.rename:
        for item in self.rename.split(','):
            old, new = item.split('=')
            rename[old] = new

    work = self.get_workspace()
    ensembl = workspace.Workspace(work/'ensembl')

    genome_filename = self.species + "." + self.assembly + "." + self.dna + ".fa.gz"
    genome_url = ("rsync://ftp.ensembl.org/ensembl/pub/release-" + self.release +
                  "/fasta/" + self.species.lower() + "/dna/" + genome_filename)

    gff_filename = self.species + "." + self.assembly + "." + self.release + ".gff3.gz"
    gff_url = ("rsync://ftp.ensembl.org/ensembl/pub/release-" + self.release +
               "/gff3/" + self.species.lower() + "/" + gff_filename)

    if self.download:
        self.log.log("Fetching " + genome_url + "\n")
        io.execute(['rsync', '-aP', genome_url, ensembl/genome_filename])
        self.log.log("Fetching " + gff_url + "\n")
        io.execute(['rsync', '-aP', gff_url, ensembl/gff_filename])

    with workspace.tempspace() as temp:
        items = list(annotation.read_annotations(ensembl/gff_filename))
        for item in items:
            item.seqid = rename.get(item.seqid, item.seqid)
        annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
        del items

        with open(temp/'temp.fa', 'wb') as f:
            for name, seq in io.read_sequences(ensembl/genome_filename):
                name = name.split()[0]
                name = rename.get(name, name)
                io.write_fasta(f, name, seq)

        reference_directory.Make_tt_reference(
            self.output_dir,
            filenames = [ temp/'temp.fa', temp/'temp.gff' ],
            index = self.index,
            ).run()
def run(self):
    f = self.begin_output()
    for filename in self.filenames:
        info = io.get_file_info(filename)

        any = False

        name = os.path.splitext(os.path.split(filename)[1])[0]

        if info.matches('sequences'):
            total = 0
            total_length = 0
            for seq in io.read_sequences(filename, qualities=True):
                total += 1
                total_length += len(seq[1])
            print >> f, grace.datum(name, 'sequences', total)
            print >> f, grace.datum(name, 'total bases', total_length)
            if total:
                print >> f, grace.datum(name, 'average length', float(total_length) / total)
            print >> f
            any = True

        if info.matches('annotations'):
            total = 0
            counts = {}
            for item in annotation.read_annotations(filename, "/"):
                total += 1
                counts[item.type] = counts.get(item.type, 0) + 1
            print >> f, grace.datum(name, 'features', total)
            for key in sorted(counts):
                print >> f, grace.datum(name, key + ' features', counts[key])
            print >> f
            any = True

        if info.matches('type-vcf'):
            reader_f = io.open_possibly_compressed_file(filename)
            reader = vcf.Reader(reader_f)
            n = 0
            for item in reader:
                n += 1
            print >> f, grace.datum(name, 'variants', n)
            any = True

        if not any:
            raise grace.Error('Don\'t know what to do with ' + filename)

    self.end_output(f)
def debias(args):
    import numpy

    radius, args = grace.get_option_value(args, '--radius', int, 2)

    dirs = args

    for dir_name in dirs:
        for name, seq in io.read_sequences(os.path.join(dir_name, 'reference.fa')):
            for suffix, ambig_suffix in [
                    ('-depth', '-ambiguous-depth'),
                    ('-pairspan-depth', '-ambiguous-pairspan-depth'),
                    ]:
                root = grace.filesystem_friendly_name(name)
                full_name = os.path.join(dir_name, root + suffix + '.userplot')
                full_ambig_name = os.path.join(dir_name, root + ambig_suffix + '.userplot')
                if not os.path.exists(full_name):
                    continue
                if not os.path.exists(full_ambig_name):
                    continue

                output_suffix = '-%d.userplot' % radius

                print dir_name, root, output_suffix

                depths = numpy.array(read_unstranded_userplot(full_name))
                ambig_depths = numpy.array(read_unstranded_userplot(full_ambig_name))
                expect = expected_depth(root, seq, depths, ambig_depths, radius)

                write_unstranded_userplot(
                    os.path.join(dir_name, root + suffix + '-expected' + output_suffix),
                    expect)

                corrected = depths / expect * numpy.median(expect)
                corrected[expect <= 5.0] = 0.0
                write_unstranded_userplot(
                    os.path.join(dir_name, root + suffix + '-corrected' + output_suffix),
                    corrected)

                ambig_corrected = ambig_depths / expect * numpy.median(expect)
                ambig_corrected[expect <= 0.0] = 0.0
                write_unstranded_userplot(
                    os.path.join(dir_name, root + ambig_suffix + '-corrected' + output_suffix),
                    ambig_corrected)
def run(self):
    f = self.begin_output()
    for filename in self.filenames:
        info = io.get_file_info(filename)

        any = False

        name = os.path.splitext(os.path.split(filename)[1])[0]

        if info.matches('sequences'):
            total = 0
            total_length = 0
            for seq in io.read_sequences(filename, qualities=True):
                total += 1
                total_length += len(seq[1])
            print >> f, grace.datum(name, 'sequences', total)
            print >> f, grace.datum(name, 'total bases', total_length)
            if total:
                print >> f, grace.datum(name, 'average length', float(total_length) / total)
            print >> f
            any = True

        if info.matches('annotations'):
            total = 0
            counts = { }
            for item in annotation.read_annotations(filename):
                total += 1
                counts[item.type] = counts.get(item.type, 0) + 1
            print >> f, grace.datum(name, 'features', total)
            for key in sorted(counts):
                print >> f, grace.datum(name, key + ' features', counts[key])
            print >> f
            any = True

        if info.matches('type-vcf'):
            reader_f = io.open_possibly_compressed_file(filename)
            reader = vcf.Reader(reader_f)
            n = 0
            for item in reader:
                n += 1
            print >> f, grace.datum(name, 'variants', n)
            any = True

        if not any:
            raise grace.Error('Don\'t know what to do with ' + filename)

    self.end_output(f)
def iter_reads(config, qualities=False):
    if 'stride' not in config:
        raise grace.Error('Please re-run nesoni shrimp, output format has changed')

    stride = config['stride']
    for reads_filename_set in config['reads']:
        if config['solid']:
            reader = [ io.read_solid(filename) for filename in reads_filename_set ]
        else:
            reader = [ io.read_sequences(filename, qualities) for filename in reads_filename_set ]
        reader = itertools.izip(*reader)
        for i, items in enumerate(reader):
            if i % stride == 0:
                for item in items:
                    yield item
def run(self):
    f = self.begin_output()

    for filename in self.filenames:
        for name, seq in io.read_sequences(filename):
            name_parts = name.split(None, 1)
            name = name_parts[0]
            for i in xrange(-self.size + self.stride, len(seq), self.stride):
                start = max(0, min(len(seq), i))
                end = max(0, min(len(seq), i + self.size))
                shred_name = '%s:%d..%d' % (name, start + 1, end)
                shred_seq = seq
                if self.quality:
                    io.write_fastq(f, shred_name, seq[start:end], chr(33 + self.quality) * (end - start))
                else:
                    io.write_fasta(f, shred_name, seq[start:end])

    self.end_output(f)
def run(self):
    min_quality = chr(33 + self.quality)

    with io.open_possibly_compressed_writer(self.prefix + '.csfastq.gz') as out_file:
        n = 0
        n_discarded = 0
        n_clipped = 0
        total_before = 0
        total_clipped = 0

        for filename in self.filenames:
            for name, seq, qual in io.read_sequences(filename, qualities='required'):
                score = 0
                start = 0
                for i in xrange(len(seq) - 1):
                    if qual[i] >= min_quality:
                        if seq[i + 1] == '0':
                            score += 1
                        else:
                            score = max(0, score - 4)
                            if not score:
                                start = i + 2

                n += 1
                total_before += len(seq)

                if start > self.length + 1:
                    if start < len(seq):
                        n_clipped += 1
                        total_clipped += len(seq) - start

                    print >> out_file, '@' + name
                    print >> out_file, seq[:start]
                    print >> out_file, '+'
                    print >> out_file, qual[:start - 1]
                else:
                    n_discarded += 1

    self.log.datum(self.sample, 'reads', n)
    if n:
        self.log.datum(self.sample, 'mean length before poly-A clipping', float(total_before) / n)
    self.log.datum(self.sample, 'reads discarded as too short after poly-A clipping', n_discarded)
    self.log.datum(self.sample, 'reads poly-A clipped and kept', n_clipped)
    if n_clipped:
        self.log.datum(self.sample, 'mean length clipped', float(total_clipped) / n_clipped)
def set_sequences(self, filenames):
    reference_genbank_filename = self / 'reference.gbk'
    reference_filename = self / 'reference.fa'

    reference_genbank_file = open(reference_genbank_filename, 'wb')
    any_genbank = [ False ]

    def genbank_callback(name, record):
        """ Make a copy of any genbank files passed in. """
        from Bio import SeqIO

        SeqIO.write([record], reference_genbank_file, 'genbank')

        f = open(self / (grace.filesystem_friendly_name(name) + '.gbk'), 'wb')
        SeqIO.write([record], f, 'genbank')
        f.close()

        any_genbank[0] = True

    lengths = [ ]
    seen = set()
    f = open(reference_filename, 'wb')
    for filename in filenames:
        for name, seq in io.read_sequences(filename, genbank_callback=genbank_callback):
            name = name.split()[0]
            assert name not in seen, 'Duplicate chromosome name: ' + name
            seen.add(name)
            lengths.append( (name, len(seq)) )
            io.write_fasta(f, name, seq)
    f.close()
    self.set_object(lengths, 'reference-lengths.pickle.gz')

    reference_genbank_file.close()
    if not any_genbank[0]:
        os.unlink(reference_genbank_filename)

    # Create an index of the reference sequences for samtools
    io.execute([
        'samtools', 'faidx', reference_filename
    ])
def run(self):
    assert self.reads or self.pairs or self.interleaved, 'No reads given'

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    working = self.get_workspace()
    working.setup_reference(self.references, bowtie=True)
    working.update_param(snp_cost=2.0)
    reference = working.get_reference()

    log_file = open(self.log_filename(), 'wb')

    with workspace.tempspace(dir=working.working_dir) as temp:
        n = [0]
        def tempname():
            n[0] += 1
            return temp / ('%d.fq' % n[0])

        def convert(filename):
            info = io.get_file_info(filename)
            ok = selection.matches(
                'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                info)
            if ok:
                return filename
            result_name = tempname()
            with open(result_name, 'wb') as f:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    io.write_fastq(f, name, seq, qual)
            return result_name

        ones = []
        twos = []
        singles = []

        for pair in self.pairs:
            assert len(pair) == 2, 'Need two files in each "pair:" section.'
            ones.append(convert(pair[0]))
            twos.append(convert(pair[1]))

        for item in self.interleaved:
            left_name = tempname()
            right_name = tempname()
            ones.append(left_name)
            twos.append(right_name)
            with open(left_name, 'wb') as left, \
                 open(right_name, 'wb') as right:
                reader = io.read_sequences(item, qualities='required')
                while True:
                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        break
                    io.write_fastq(left, name, seq, qual)

                    try:
                        name, seq, qual = reader.next()
                    except StopIteration:
                        raise grace.Error('Interleaved file contains odd number of sequences')
                    io.write_fastq(right, name, seq, qual)

        for item in self.reads:
            singles.append(convert(item))

        cores = min(self.cores, legion.coordinator().get_cores())

        command = (
            [ 'bowtie2',
              '--threads', str(cores),
              '--rg-id', '1',
              '--rg', 'SM:' + working.name,
            ] +
            self.bowtie_options +
            [ '-x', reference.get_bowtie_index_prefix() ]
            )
        commands = []
        if ones:
            commands.append(command + ['-1', ','.join(ones), '-2', ','.join(twos)])
        if singles:
            commands.append(command + ['-U', ','.join(singles)])

        temp_bam_name = temp / 'temp.bam'

        with io.pipe_to(
                ['samtools', 'view', '-S', '-b', '-'],
                stdout=open(temp_bam_name, 'wb'),
                stderr=log_file
                ) as f:
            header_sent = False
            for command in commands:
                self.log.log('Running:\n' + ' '.join(command) + '\n')
                with io.pipe_from(command, stderr=log_file, cores=cores) as f_out:
                    for line in f_out:
                        if not header_sent or not line.startswith('@'):
                            f.write(line)
                header_sent = True

        #io.execute([
        #    'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
        #    ])

        sam.sort_bam(temp_bam_name, working / 'alignments', by_name=True, cores=self.cores)

    log_file.close()
def fill_scaffolds(args):
    max_filler_length, args = grace.get_option_value(args, '--max-filler', int, 4000)

    if len(args) < 2:
        print USAGE
        return 1

    (output_dir, graph_dir), args = args[:2], args[2:]

    scaffolds = [ ]

    def scaffold(args):
        circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)

        scaffold = [ ]
        for item in args:
            scaffold.append( ('contig', int(item)) )
            scaffold.append( ('gap', None) )

        if not circular:
            scaffold = scaffold[:-1]

        name = 'custom_scaffold_%d' % (len(scaffolds) + 1)
        scaffolds.append( (name, scaffold) )

    grace.execute(args, [scaffold])

    custom_scaffolds = (len(scaffolds) != 0)

    sequences = dict(
        (a.split()[0], b.upper())
        for a, b in io.read_sequences(os.path.join(graph_dir, '454AllContigs.fna')))

    sequence_names = sorted(sequences)
    sequence_ids = dict(zip(sequence_names, xrange(1, len(sequence_names) + 1)))

    contexts = { }
    context_names = { }
    context_depths = { }
    for i in xrange(1, len(sequence_names) + 1):
        seq = sequences[sequence_names[i-1]]
        contexts[ i ] = seq
        context_names[ i ] = sequence_names[i-1] + '-fwd'
        contexts[ -i ] = bio.reverse_complement(seq)
        context_names[ -i ] = sequence_names[i-1] + '-rev'

    links = collections.defaultdict(list)

    for line in open(os.path.join(graph_dir, '454ContigGraph.txt'), 'rU'):
        parts = line.rstrip('\n').split('\t')

        if parts[0].isdigit():
            seq = sequence_ids[parts[1]]
            context_depths[ seq] = float(parts[3])
            context_depths[-seq] = float(parts[3])

        if parts[0] == 'C':
            name1 = 'contig%05d' % int(parts[1])
            dir1 = {"3'" : 1, "5'" : -1 }[parts[2]]
            name2 = 'contig%05d' % int(parts[3])
            dir2 = {"5'" : 1, "3'" : -1 }[parts[4]]
            depth = int(parts[5])
            #print name1, dir1, name2, dir2, depth

            links[ sequence_ids[name1] * dir1 ].append( (depth, sequence_ids[name2] * dir2) )
            links[ sequence_ids[name2] * -dir2 ].append( (depth, sequence_ids[name1] * -dir1) )

        if parts[0] == 'S' and not custom_scaffolds:
            name = 'scaffold%05d' % int(parts[2])
            components = parts[3].split(';')
            scaffold = [ ]
            for component in components:
                a, b = component.split(':')
                if a == 'gap':
                    scaffold.append( ('gap', int(b)) )
                else:
                    strand = { '+': +1, '-': -1 }[ b ]
                    scaffold.append( ('contig', sequence_ids['contig%05d' % int(a)] * strand) )
            scaffolds.append( (name, scaffold) )

    #paths = { }
    #
    #todo = [ ]
    #for i in contexts:
    #    for depth_left, neg_left in links[-i]:
    #        left = -neg_left
    #        for depth_right, right in links[i]:
    #            todo.append( ( max(-depth_left,-depth_right,-context_depths[i]), left, right, (i,)) )
    #
    #heapq.heapify(todo)
    #while todo:
    #    score, source, dest, path = heapq.heappop(todo)
    #    if (source,dest) in paths: continue
    #
    #    paths[(source,dest)] = path
    #
    #    if len(contexts[dest]) > max_filler_length: continue
    #
    #    for depth, next in links[dest]:
    #        heapq.heappush(todo,
    #            ( max(score,-depth,-context_depths[dest]), source, next, path+(dest,))
    #        )

    path_source_dest = collections.defaultdict(dict)  # source -> dest -> next
    path_dest_source = collections.defaultdict(dict)  # dest -> source -> next

    # Use links, in order of depth of coverage, to construct paths between contigs
    # Thus: paths have maximum minimum depth
    #       subsections of paths also have this property

    todo = [ ]
    for i in contexts:
        for depth_link, right in links[i]:
            todo.append( (depth_link, i, right) )
    todo.sort(reverse=True)
    for score, left, right in todo:
        if right in path_source_dest[left]:
            continue

        sources = [(left, right)]
        if len(contexts[left]) <= max_filler_length:
            sources += path_dest_source[left].items()

        destinations = [right]
        if len(contexts[right]) <= max_filler_length:
            destinations += path_source_dest[right].keys()

        for source, next in sources:
            for dest in destinations:
                if dest in path_source_dest[source]:
                    continue
                path_source_dest[source][dest] = next
                path_dest_source[dest][source] = next

    workspace = io.Workspace(output_dir)
    scaffold_f = workspace.open('scaffolds.fa', 'wb')

    #comments = [ ]
    features = [ ]

    used = set()
    previous_total = 0

    for i, (name, scaffold) in enumerate(scaffolds):
        result = ''  # Inefficient. Meh.
        n_filled = 0
        n_failed = 0
        for j, item in enumerate(scaffold):
            if item[0] == 'contig':
                result += contexts[item[1]]
                used.add(abs(item[1]))
            else:
                left = scaffold[j-1]
                right = scaffold[ (j+1) % len(scaffold) ]  # If gap at end, assume circular
                assert left[0] == 'contig'
                assert right[0] == 'contig'

                gap_start = len(result)

                can_fill = right[1] in path_source_dest[left[1]]
                if can_fill:
                    n = 0
                    k = path_source_dest[left[1]][right[1]]
                    while k != right[1]:
                        n += len(contexts[k])
                        result += contexts[k].lower()
                        used.add(abs(k))
                        k = path_source_dest[k][right[1]]

                    n_filled += 1

                    if item[1] is not None and max(n, item[1]) > min(n, item[1]) * 4:
                        print >> sys.stderr, 'Warning: gap size changed from %d to %d in scaffold %d' % (item[1], n, i+1)
                else:
                    n_failed += 1

                    #print >> sys.stderr, 'Warning: No path to fill a gap in scaffold %d' % (i+1)
                    result += 'n' * (9 if item[1] is None else item[1])

                gap_end = len(result)

                #features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                #    'all-scaffolds',
                #    'fill-scaffolds',
                #    'gap',
                #    previous_total + gap_start+1,
                #    previous_total + max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                #    '.', #score
                #    '+', #strand
                #    '.', #frame
                #    '' #properties
                #))
                features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                    name,
                    'fill-scaffolds',
                    'gap',
                    gap_start+1,
                    max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                    '.', #score
                    '+', #strand
                    '.', #frame
                    '' #properties
                ))

        io.write_fasta(scaffold_f, name, result)
        previous_total += len(result)
        #comments.append('##sequence-region %s %d %d' % (name, 1, len(result)))
        print >> sys.stderr, 'Scaffold%05d: %d gaps filled, %d could not be filled' % (i+1, n_filled, n_failed)

    scaffold_f.close()

    gff_f = workspace.open('scaffolds.gff', 'wb')
    #print >>gff_f, '##gff-version 3'
    #for comment in comments:
    #    print >>gff_f, comment
    for feature in features:
        print >>gff_f, feature
    gff_f.close()

    leftovers_f = workspace.open('leftovers.fa', 'wb')
    for name in sequence_names:
        if sequence_ids[name] not in used:
            io.write_fasta(leftovers_f, name, sequences[name])
    leftovers_f.close()

    ends = { }
    for i, (name, scaffold) in enumerate(scaffolds):
        if scaffold[-1][0] == 'gap':
            continue
        ends[ '%s start' % name ] = scaffold[-1][1]
        ends[ '%s end ' % name ] = -scaffold[0][1]

    for end1 in sorted(ends):
        options = [ end2 for end2 in ends if -ends[end2] in path_source_dest[ends[end1]] ]
        if len(options) == 1:
            print >> sys.stderr, 'Note: from', end1, 'only', options[0], 'is reachable'
def run(self):
    #mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    #minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
    #is_core = (what == 'core')
    #
    #grace.expect_no_further_options(args)
    #
    #if len(args) < 2:
    #    print >> sys.stderr, HELP
    #    raise grace.Help_shown()
    #
    #output_dir, working_dirs = args[0], args[1:]
    #
    ##assert not path.exists(path.join(output_dir, 'reference.fa')), \
    #assert not path.exists(path.join(output_dir, 'parameters')), \
    #    'Output directory not given'
    #
    #if not path.exists(output_dir):
    #    os.mkdir(output_dir)

    assert self.what in ('core', 'unique'), 'Expected --what to be either "core" or "unique".'
    is_core = (self.what == 'core')

    workspace = self.get_workspace()

    for name, seq in io.read_sequences(
            working_directory.Working(self.working_dirs[0]).get_reference().reference_fasta_filename()):
        self.log.log(name + '\n')
        friendly_name = grace.filesystem_friendly_name(name)

        good = [True] * len(seq)

        for working_dir in self.working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix))
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= self.mincov
                    else:
                        good[i] = data[i] < self.mincov

        #Close holes
        start = -self.maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= self.maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        self.log.log('Closed ' + grace.pretty_number(n_holes) + ' holes\n')

        f = open(workspace / ('%s-%s.fa' % (friendly_name, self.what)), 'wb')
        io.write_fasta(
            f, name,
            ''.join([ (seq[i] if good[i] else 'N') for i in xrange(len(seq)) ]))
        f.close()

        f = open(workspace / ('%s-%s_masked.fa' % (friendly_name, self.what)), 'wb')
        io.write_fasta(
            f, name,
            ''.join([ (seq[i] if good[i] else seq[i].lower()) for i in xrange(len(seq)) ]))
        f.close()

        f_good = open(workspace / ('%s-%s_parts.fa' % (friendly_name, self.what)), 'wb')
        f_nongood = open(workspace / ('%s-non%s_parts.fa' % (friendly_name, self.what)), 'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]
        def emit(i):
            if i - start < self.minsize:
                return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i - start
            io.write_fasta(
                f_good if good[start] else f_nongood,
                '%s:%d..%d' % (name, start + 1, i),
                seq[start:i])
        for i in xrange(1, len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()

        self.log.log(
            grace.pretty_number(sum(good)) + ' bases are ' + self.what +
            ', of ' + grace.pretty_number(len(seq)) + ' in reference sequence\n')
        self.log.log(
            grace.pretty_number(n_good[0]) + ' parts at least ' +
            grace.pretty_number(self.minsize) + ' bases long with ' +
            grace.pretty_number(n_good_bases[0]) + ' total bases\n')
        self.log.log('\n')
def run(self):
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, 'No reference sequences given'
    assert self.reads or self.pairs or self.interleaved, 'No reads given'
    for pair in self.pairs:
        assert len(pair) == 2, 'Two files required in each pair: section'

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    read_sets = []
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))

    #Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    reference_filename = reference.reference_fasta_filename()

    cores = min(self.cores, legion.coordinator().get_cores())

    default_options = {
        '-E': None,
        '-T': None,
        '-N': str(cores),
        '-n': '2',
        '-w': '200%',
        '-p': 'opp-in',
        '-I': '0,500',
        '-X': None,
    }

    if self.sam_unaligned:
        default_options['--sam-unaligned'] = None

    if self.half_paired:
        default_options['--half-paired'] = None
    else:
        default_options['--no-half-paired'] = None

    cutoff = '55%'  #Default changed in SHRiMP 2.0.2
    if '-h' in self.shrimp_options:
        cutoff = self.shrimp_options[self.shrimp_options.index('-h') + 1]

    #Run shrimp

    bam_filename = io.abspath(self.output_dir, 'alignments.bam')
    bam_prefix = io.abspath(self.output_dir, 'alignments')
    bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')

    temp_filename = io.abspath(self.output_dir, 'temp.bam')

    log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
    log_file = open(log_filename, 'wb')

    sam_eater = sam.Bam_writer(temp_filename)

    sam_header_sent = [False]
    n_seen = [0]

    def eat(f):
        for line in f:
            if line.startswith('@'):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True

    def remove_pair_options(options):
        for flag in ['-p', '-I']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ['--half-paired']:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options

    for i, (filenames, is_paired) in enumerate(read_sets):
        options = self.shrimp_options[:]

        has_qualities = all(
            len(io.read_sequences(filename, qualities=True).next()) == 3  #A little ugly
            for filename in filenames)
        if has_qualities:
            options.append('--fastq')

        if len(filenames) == 1:
            reads_parameters = [filenames[0]]
        else:
            reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

        if '--qv-offset' not in self.shrimp_options:
            #guesses = [ ]
            #for filename in filenames:
            #    guesses.append(io.guess_quality_offset(filename))
            #assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
            #default_options['--qv-offset'] = str(guesses[0])
            default_options['--qv-offset'] = str(io.guess_quality_offset(*filenames))

        default_options['--read-group'] = '%s,%s' % (
            workspace.name.replace(',', '_'),
            workspace.name.replace(',', '_'))
        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])

        if not is_paired:
            options = remove_pair_options(options)

        grace.status('')

        full_param = reference.shrimp_command(self.cs, options + reads_parameters)

        print >> sys.stderr, 'Running', ' '.join(full_param)

        with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
            eat(f)

    log_file.close()

    sam_eater.close()

    grace.status('Sort')

    #io.execute([
    #    'samtools', 'sort', '-n', temp_filename, bam_prefix
    #    ])
    sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)

    os.unlink(temp_filename)

    grace.status('')
def run(self):
    references = {}
    for filename in self.reference_filenames:
        for name, seq in io.read_sequences(filename):
            references[name] = seq

    tail_lengths = {}
    adaptor_bases = {}
    for filename in self.clips:
        with io.open_possibly_compressed_file(filename) as f:
            for line in f:
                if line.startswith('#'):
                    continue
                parts = line.rstrip('\n').split('\t')
                name = parts[0].split()[0]
                tail_lengths[name] = int(parts[3]) - int(parts[2])
                adaptor_bases[name] = int(parts[6])

    in_file = self.begin_input()
    out_file = self.begin_output()

    assert self.prop_a >= 0.0 and self.prop_a <= 1.0
    a_score = 1 - self.prop_a
    non_a_score = -self.prop_a

    for line in in_file:
        line = line.rstrip()
        if line.startswith('@'):
            print >> out_file, line
            continue

        al = Alignment(line)

        if al.flag & FLAG_UNMAPPED:
            continue

        #ref = references[al.rname]

        reverse = al.flag & FLAG_REVERSE
        if reverse:
            read_bases = rev_comp(al.seq)
            read_qual = al.qual[::-1]
            cigar = cigar_decode(al.cigar)[::-1]
        else:
            read_bases = al.seq
            read_qual = al.qual
            cigar = cigar_decode(al.cigar)

        n_tail = tail_lengths[al.qname]

        #if reverse:
        #    if al.pos-1-n_tail < 0: continue #TODO: handle tail extending beyond end of reference
        #    bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1])
        #else:
        #    if al.pos-1+al.length+n_tail > len(ref): continue #TODO: handle tail extending beyond end of reference
        #    bases_ref = ref[al.pos-1+al.length:al.pos-1+al.length+n_tail].upper() #upper was missing for a long time. Bug!
        #
        #extension = 0
        #while extension < n_tail and bases_ref[extension] == 'A':
        #    extension += 1

        if reverse:
            feat = annotation.Annotation(al.rname, start=al.pos - 1 - n_tail, end=al.pos - 1, strand=-1)
        else:
            feat = annotation.Annotation(al.rname, start=al.pos - 1 + al.length, end=al.pos - 1 + al.length + n_tail, strand=1)
        bases_ref = feat.get_seq(references).upper()

        # Allow up to 60% mismatch on As
        # Treat soft clipping as insertion for simplicity
        cigar = cigar.replace("S", "I")
        assert "H" not in cigar, "Can't handle hard clipping"

        extension = 0
        best_score = 0.0
        score = 0.0

        # Soft clipping treated as a mismatch
        i = len(cigar) - 1
        while i >= 0 and cigar[i] in "I":
            score += non_a_score
            i -= 1

        for i in xrange(n_tail):
            if bases_ref[i] == "A":
                score += a_score
            else:
                score += non_a_score
            if score >= best_score:
                extension = i + 1
                best_score = score

        #print >> sys.stderr, reverse!=0, n_tail, extension, bases_ref

        if n_tail - extension > 0:
            al.extra.append('AN:i:%d' % (n_tail - extension))
            al.extra.append('AG:i:%d' % (extension))
        if adaptor_bases[al.qname]:
            al.extra.append('AD:i:%d' % adaptor_bases[al.qname])
        if n_tail - extension >= self.tail:
            #if reverse:
            #    tail_refpos = al.pos-extension
            #else:
            #    tail_refpos = al.pos+al.length+extension-1
            #al.extra.append('AA:i:%d'%tail_refpos)
            al.extra.append('AA:i:1')

        cigar += 'M' * extension
        read_bases += 'N' * extension  #Since mispriming is so common (and loading the original sequence here would be a pain)
        read_qual += chr(33 + 20) * extension  #Arbitrarily give quality 20
        al.length += extension
        if reverse:
            al.pos -= extension
            al.seq = rev_comp(read_bases)
            al.qual = read_qual[::-1]
            al.cigar = cigar_encode(cigar[::-1])
        else:
            al.seq = read_bases
            al.qual = read_qual
            al.cigar = cigar_encode(cigar)

        print >> out_file, al

    self.end_output(out_file)
    self.end_input(in_file)
def run(self):
    grace.require_shrimp_2()
    grace.require_samtools()
    assert self.references, "No reference sequences given"
    assert self.reads or self.pairs or self.interleaved, "No reads given"
    for pair in self.pairs:
        assert len(pair) == 2, "Two files required in each pair: section"

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    read_sets = []
    for item in self.reads:
        read_sets.append(([item], False))
    for item in self.pairs:
        read_sets.append((item, True))
    for item in self.interleaved:
        read_sets.append(([item], True))

    # Create working directory
    workspace = self.get_workspace()
    workspace.setup_reference(self.references)
    workspace.update_param(snp_cost=25)
    reference = workspace.get_reference()
    reference_filename = reference.reference_fasta_filename()

    cores = min(self.cores, legion.coordinator().get_cores())

    default_options = {
        "-E": None,
        "-T": None,
        "-N": str(cores),
        "-n": "2",
        "-w": "200%",
        "-p": "opp-in",
        "-I": "0,500",
        "-X": None,
    }

    if self.sam_unaligned:
        default_options["--sam-unaligned"] = None

    if self.half_paired:
        default_options["--half-paired"] = None
    else:
        default_options["--no-half-paired"] = None

    cutoff = "55%"  # Default changed in SHRiMP 2.0.2
    if "-h" in self.shrimp_options:
        cutoff = self.shrimp_options[self.shrimp_options.index("-h") + 1]

    # Run shrimp

    bam_filename = io.abspath(self.output_dir, "alignments.bam")
    bam_prefix = io.abspath(self.output_dir, "alignments")
    bam_sorted_prefix = io.abspath(self.output_dir, "alignments_sorted")

    temp_filename = io.abspath(self.output_dir, "temp.bam")

    log_filename = io.abspath(self.output_dir, "shrimp_log.txt")
    log_file = open(log_filename, "wb")

    sam_eater = sam.Bam_writer(temp_filename)

    sam_header_sent = [False]
    n_seen = [0]

    def eat(f):
        for line in f:
            if line.startswith("@"):
                if sam_header_sent[0]:
                    continue
            else:
                n_seen[0] += 1
                if n_seen[0] % 100000 == 0:
                    grace.status("%s alignments produced" % grace.pretty_number(n_seen[0]))
            sam_eater.write_raw(line)
        sam_header_sent[0] = True

    def remove_pair_options(options):
        for flag in ["-p", "-I"]:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 2:]
        for flag in ["--half-paired"]:
            while flag in options:
                pos = options.index(flag)
                options = options[:pos] + options[pos + 1:]
        return options

    for i, (filenames, is_paired) in enumerate(read_sets):
        options = self.shrimp_options[:]

        has_qualities = all(
            len(io.read_sequences(filename, qualities=True).next()) == 3  # A little ugly
            for filename in filenames)
        if has_qualities:
            options.append("--fastq")

        if len(filenames) == 1:
            reads_parameters = [filenames[0]]
        else:
            reads_parameters = ["-1", filenames[0], "-2", filenames[1]]

        if "--qv-offset" not in self.shrimp_options:
            guesses = []
            for filename in filenames:
                guesses.append(io.guess_quality_offset(filename))
            assert len(set(guesses)) == 1, "Conflicting quality offset guesses, please specify --qv-offset manually."
            default_options["--qv-offset"] = str(guesses[0])

        default_options["--read-group"] = "%s,%s" % (
            workspace.name.replace(",", "_"),
            workspace.name.replace(",", "_"),
        )
        for flag in default_options:
            if flag not in options:
                options.append(flag)
                if default_options[flag] is not None:
                    options.append(default_options[flag])

        if not is_paired:
            options = remove_pair_options(options)

        grace.status("")

        full_param = reference.shrimp_command(self.cs, options + reads_parameters)

        print >> sys.stderr, "Running", " ".join(full_param)

        with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
            eat(f)

    log_file.close()

    sam_eater.close()

    grace.status("Sort")

    io.execute(["samtools", "sort", "-n", temp_filename, bam_prefix])

    os.unlink(temp_filename)

    grace.status("")
def run(self):
    for filename in self.filenames:
        for name, seq in io.read_sequences(filename):
            if self.only and name.split()[0] not in self.only:
                continue
            io.write_fasta(sys.stdout, name, seq)
def run(self):
    references = { }
    for filename in self.reference_filenames:
        print >> sys.stderr, 'Load', filename
        for name, seq in io.read_sequences(filename):
            references[name] = seq

    reads = { }
    for filename in self.reads:
        print >> sys.stderr, 'Load', filename
        for name, seq, qual in io.read_sequences(filename, qualities='required'):
            reads[name] = (seq, qual)

    print >> sys.stderr, 'Begin'

    in_file = self.begin_input()
    out_file = self.begin_output()

    for line in in_file:
        line = line.rstrip()
        if line.startswith('@'):
            print >> out_file, line
            continue

        al = Alignment(line)

        if al.flag & FLAG_UNMAPPED:
            continue

        reverse = al.flag & FLAG_REVERSE
        if reverse:
            read_bases = rev_comp(al.seq)
            read_qual = al.qual[::-1]
            cigar = cigar_decode(al.cigar)[::-1]
        else:
            read_bases = al.seq
            read_qual = al.qual
            cigar = cigar_decode(al.cigar)

        al.extra = [
            item for item in al.extra
            if not item.startswith('CQ:Z:') and not item.startswith('CS:Z:')
        ] + [
            'CQ:Z:' + reads[al.qname][1],
            'CS:Z:' + reads[al.qname][0],
        ]

        ref = references[al.rname]

        seq_tail = reads[al.qname][0][len(al.seq) + 1:]
        qual_tail = reads[al.qname][1][len(al.seq):]
        n_tail = len(seq_tail)

        if reverse:
            if al.pos - 1 - n_tail < 0:
                continue  #TODO: handle tail extending beyond end of reference
            bases_ref = rev_comp(ref[al.pos - 1 - n_tail:al.pos - 1 + 1])
        else:
            if al.pos - 1 + al.length + n_tail > len(ref):
                continue  #TODO: handle tail extending beyond end of reference
            bases_ref = ref[al.pos - 1 + al.length - 1:al.pos - 1 + al.length + n_tail]
        seq_ref = solid_encode(bases_ref)

        basic_score = alignment_score(qual_tail, seq_tail, seq_ref, self.quality)

        if n_tail:
            tail_score, tail_pos = max(
                (alignment_score(qual_tail, seq_tail,
                                 solid_encode(bases_ref[:1 + i] + 'A' * (n_tail - i)),
                                 self.quality)[0], i)
                for i in xrange(n_tail + 1)
            )

            baseline = max(0, alignment_score(
                qual_tail[:tail_pos], seq_tail[:tail_pos], seq_ref[:tail_pos], self.quality)[0])

            if tail_score >= baseline + self.tail:
                #Record position of end of transcript in 'AA' (1-based position)
                if reverse:
                    tail_refpos = al.pos - tail_pos
                else:
                    tail_refpos = al.pos + al.length + tail_pos - 1
                al.extra.append('AA:i:%d' % tail_refpos)

                #Record sequence's poly(A) tail length in AN: from the end of correspondence
                #with the reference sequence to the end of good quality As
                estimated_tail_length = alignment_score(
                    qual_tail, seq_tail,
                    solid_encode(bases_ref[:1 + tail_pos] + 'A' * (n_tail - tail_pos)),
                    self.quality)[1] - tail_pos
                if estimated_tail_length > 0:
                    al.extra.append('AN:i:%d' % estimated_tail_length)

                if tail_pos:
                    read_bases += solid_decode(read_bases[-1], seq_tail[:tail_pos])
                    read_qual += qual_tail[:tail_pos]
                    cigar += 'M' * tail_pos
                    al.length += tail_pos
                    if reverse:
                        al.pos -= tail_pos
                        al.seq = rev_comp(read_bases)
                        al.qual = read_qual[::-1]
                        al.cigar = cigar_encode(cigar[::-1])
                    else:
                        al.seq = read_bases
                        al.qual = read_qual
                        al.cigar = cigar_encode(cigar)

        print >> out_file, al

    self.end_output(out_file)
    self.end_input(in_file)
def run(self):
    workspace = self.get_workspace()

    read_length = 100

    left = rand_seq(read_length - 1)
    while True:
        flank = rand_seq(1)
        if flank != self.ref[:1]:
            break
    left += flank

    right = rand_seq(read_length - 1)
    while True:
        flank = rand_seq(1)
        if flank != self.ref[-1:]:
            break
    right = flank + right

    i = 0
    variants_used = [ ]

    with open(workspace/'reads.fq', 'wb') as f:
        for i, variant in enumerate(self.variants):
            if 'x' in variant:
                variant, count = variant.split('x')
                count = int(count)
            else:
                count = 10
            variants_used.append( (variant, count) )
            seq = left + variant + right
            for j in xrange(count):
                pos = len(variant) + random.randrange(read_length - len(variant))
                read = seq[pos:pos + read_length]
                if random.randrange(2):
                    read = bio.reverse_complement(read)
                i += 1
                io.write_fastq(f, 'read_%s_%d' % (variant, i), read, chr(64 + 30) * len(read))

    reference = left + self.ref + right
    primary_variant = left + variants_used[0][0] + right

    with open(workspace/'reference.fa', 'wb') as f:
        io.write_fasta(f, 'chr1', reference)

    legion.remake_needed()

    self.analysis(
        workspace/'sample',
        workspace/'reference.fa',
        reads = [ workspace/'reads.fq' ],
        ).run()

    self.freebayes(
        workspace/'freebayes',
        workspace/'sample',
        ).run()

    self.vcf_filter(
        workspace/'filtered',
        workspace/'freebayes.vcf',
        ).run()

    Vcf_patch(
        workspace/'patch',
        workspace/('sample', 'reference'),
        workspace/'filtered.vcf'
        ).run()

    patched = io.read_sequences(workspace/('patch', 'sample.fa')).next()[1]

    masked = io.read_sequences(workspace/('sample', 'consensus_masked.fa')).next()[1].upper()

    with open(workspace/'freebayes.vcf', 'rU') as f:
        reader = vcf.Reader(f)
        raw_count = len(list(reader))

    with open(workspace/'filtered.vcf', 'rU') as f:
        reader = vcf.Reader(f)
        filtered_count = len(list(vcf.Reader(open(workspace/'filtered.vcf', 'rU'))))

    with open(workspace/('sample', 'report.txt'), 'rb') as f:
        nesoni_count = len(f.readlines()) - 1

    self.log.log('\n')
    self.log.datum(workspace.name, 'changes found by "nesoni consensus:"', nesoni_count)
    self.log.datum(workspace.name, 'is correctly patched by "nesoni consensus:"', masked == primary_variant)
    self.log.log('\n')
    self.log.datum(workspace.name, 'raw variants', raw_count)
    self.log.datum(workspace.name, 'variants after filtering', filtered_count)
    self.log.datum(workspace.name, 'is correctly patched by VCF pipeline', patched == primary_variant)
    self.log.log('\n')
def run(self): log = self.log #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10) #qoffset, args = grace.get_option_value(args, '--qoffset', int, None) #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True) #length_cutoff, args = grace.get_option_value(args, '--length', int, 24) #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10) #max_error, args = grace.get_option_value(args, '--max-errors', int, 1) #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna') #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False) #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False) #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0) #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0) #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False) #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True) #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False) #grace.expect_no_further_options(args) prefix = self.prefix log_name = os.path.split(prefix)[1] quality_cutoff = self.quality qoffset = self.qoffset clip_ambiguous = self.clip_ambiguous length_cutoff = self.length adaptor_cutoff = self.match max_error = self.max_errors adaptor_set = self.adaptors disallow_homopolymers = self.homopolymers reverse_complement = self.revcom trim_start = self.trim_start trim_end = self.trim_end output_fasta = self.fasta use_gzip = self.gzip output_rejects = self.rejects iterators = [] filenames = [] any_paired = False for filename in self.reads: filenames.append(filename) iterators.append( itertools.izip(io.read_sequences(filename, qualities=True))) for pair_filenames in self.pairs: assert len(pair_filenames ) == 2, 'Expected a pair of files for "pairs" section.' filenames.extend(pair_filenames) any_paired = True iterators.append( itertools.izip( io.read_sequences(pair_filenames[0], qualities=True), io.read_sequences(pair_filenames[1], qualities=True))) for filename in self.interleaved: filenames.extend(filename) any_paired = True iterators.append( deinterleave(io.read_sequences(filename, qualities=True))) fragment_reads = (2 if any_paired else 1) read_in_fragment_names = ['read-1', 'read-2' ] if any_paired else ['read'] assert iterators, 'Nothing to clip' if qoffset is None: guesses = [ io.guess_quality_offset(filename) for filename in filenames ] assert len( set(guesses) ) == 1, 'Conflicting quality offset guesses, please specify manually.' 
qoffset = guesses[0] log.log('FASTQ offset seems to be %d\n' % qoffset) quality_cutoff_char = chr(qoffset + quality_cutoff) #log.log('Minimum quality: %d (%s)\n' % (quality_cutoff, quality_cutoff_char)) #log.log('Clip ambiguous bases: %s\n' % (grace.describe_bool(clip_ambiguous))) #log.log('Minimum adaptor match: %d bases, %d errors\n' % (adaptor_cutoff, max_error)) #log.log('Minimum length: %d bases\n' % length_cutoff) adaptor_seqs = [] adaptor_names = [] if adaptor_set and adaptor_set.lower() != 'none': for item in adaptor_set.split(','): item = item.strip().lower() + ' ' any = False for line in ADAPTORS.strip().split('\n'): if line.startswith('#'): continue if not line.lower().startswith(item): continue any = True name, seq = line.rsplit(None, 1) seq = seq.replace('U', 'T') #if seq in adaptor_seqs: print 'Dup', name adaptor_seqs.append(seq) adaptor_names.append(name) adaptor_seqs.append(bio.reverse_complement(seq)) adaptor_names.append(name) if not any: raise grace.Error('Unknown adaptor set: ' + item) matcher = Matcher(adaptor_seqs, adaptor_names, max_error) start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ] end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ] if output_fasta: write_sequence = io.write_fasta_single_line else: write_sequence = io.write_fastq f_single = io.open_possibly_compressed_writer( self.reads_output_filenames()[0]) if fragment_reads == 2: f_paired = io.open_possibly_compressed_writer( self.interleaved_output_filenames()[0]) if output_rejects: f_reject = io.open_possibly_compressed_writer( self.rejects_output_filenames()[0]) n_single = 0 n_paired = 0 n_in_single = 0 n_in_paired = 0 total_in_length = [0] * fragment_reads n_out = [0] * fragment_reads n_q_clipped = [0] * fragment_reads n_a_clipped = [0] * fragment_reads n_homopolymers = [0] * fragment_reads total_out_length = [0] * fragment_reads #log.attach(open(prefix + '_log.txt', 'wb')) for iterator in iterators: for fragment in iterator: if (n_in_single + n_in_paired) % 10000 == 0: grace.status( 'Clipping fragment %s' % grace.pretty_number(n_in_single + n_in_paired)) if len(fragment) == 1: n_in_single += 1 else: n_in_paired += 1 graduates = [] rejects = [] for i, (name, seq, qual) in enumerate(fragment): name = name.split()[0] seq = seq.upper() total_in_length[i] += len(seq) start = trim_start best_start = 0 best_len = 0 for j in xrange(len(seq) - trim_end): if qual[j] < quality_cutoff_char or \ (clip_ambiguous and seq[j] not in 'ACGT'): if best_len < j - start: best_start = start best_len = j - start start = j + 1 j = len(seq) - trim_end if best_len < j - start: best_start = start best_len = j - start clipped_seq = seq[best_start:best_start + best_len] clipped_qual = qual[best_start:best_start + best_len] if len(clipped_seq) < length_cutoff: n_q_clipped[i] += 1 rejects.append((name, seq, qual, 'quality')) continue match = matcher.match(clipped_seq) if match and match[0] >= adaptor_cutoff: clipped_seq = clipped_seq[match[0]:] clipped_qual = clipped_qual[match[0]:] start_clips[i][match[0]].append(match[1][0]) if len(clipped_seq) < length_cutoff: n_a_clipped[i] += 1 rejects.append((name, seq, qual, 'adaptor')) continue match = matcher.match(bio.reverse_complement(clipped_seq)) if match and match[0] >= adaptor_cutoff: clipped_seq = clipped_seq[:len(clipped_seq) - match[0]] clipped_qual = clipped_qual[:len(clipped_qual) - match[0]] end_clips[i][match[0]].append(match[1][0]) if len(clipped_seq) < length_cutoff: n_a_clipped[i] += 1 rejects.append((name, seq, qual, 
'adaptor')) continue if disallow_homopolymers and len(set(clipped_seq)) <= 1: n_homopolymers[i] += 1 rejects.append((name, seq, qual, 'homopolymer')) continue graduates.append((name, clipped_seq, clipped_qual)) n_out[i] += 1 total_out_length[i] += len(clipped_seq) if output_rejects: for name, seq, qual, reason in rejects: write_sequence(f_reject, name + ' ' + reason, seq, qual) if graduates: if reverse_complement: graduates = [(name, bio.reverse_complement(seq), qual[::-1]) for name, seq, qual in graduates] if len(graduates) == 1: this_f = f_single n_single += 1 else: assert len(graduates) == 2 this_f = f_paired n_paired += 1 for name, seq, qual in graduates: write_sequence(this_f, name, seq, qual) grace.status('') if output_rejects: f_reject.close() if fragment_reads == 2: f_paired.close() f_single.close() def summarize_clips(name, location, clips): total = 0 for i in clips: total += len(clips[i]) log.datum(log_name, name + ' adaptors clipped at ' + location, total) if not clips: return for i in xrange(min(clips), max(clips) + 1): item = clips[i] log.quietly_log('%3d bases: %10d ' % (i, len(item))) if item: avg_errors = float(sum(item2[0] for item2 in item)) / len(item) log.quietly_log(' avg errors: %5.2f ' % avg_errors) counts = collections.defaultdict(int) for item2 in item: counts[item2[1]] += 1 #print counts for no in sorted(counts, key=lambda item2: counts[item2], reverse=True)[:2]: log.quietly_log('%dx%s ' % (counts[no], matcher.names[no])) if len(counts) > 2: log.quietly_log('...') log.quietly_log('\n') log.quietly_log('\n') if n_in_paired: log.datum(log_name, 'read-pairs', n_in_paired) if n_in_single: log.datum(log_name, 'single reads', n_in_single) for i in xrange(fragment_reads): if start_clips: summarize_clips(read_in_fragment_names[i], 'start', start_clips[i]) if end_clips: summarize_clips(read_in_fragment_names[i], 'end', end_clips[i]) prefix = read_in_fragment_names[i] log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i]) log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i]) if disallow_homopolymers: log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i]) if fragment_reads > 1: log.datum(log_name, prefix + ' kept', n_out[i]) log.datum(log_name, prefix + ' average input length', float(total_in_length[i]) / (n_in_single + n_in_paired)) if n_out[i]: log.datum(log_name, prefix + ' average output length', float(total_out_length[i]) / n_out[i]) if fragment_reads == 2: log.datum(log_name, 'pairs kept after clipping', n_paired) log.datum(log_name, 'reads kept after clipping', n_single)
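# The clipper above keeps, for each read, the longest stretch of bases whose
# qualities are at or above the cutoff (optionally also breaking on non-ACGT
# bases), between trim_start and len(seq) - trim_end.  Below is a simplified
# standalone sketch of that rule; the function name and signature are
# illustrative only, not part of nesoni.

def longest_clean_run(seq, qual, cutoff_char, clip_ambiguous=True,
                      trim_start=0, trim_end=0):
    """Return (start, length) of the longest run of acceptable bases."""
    start = trim_start
    best_start = 0
    best_len = 0
    limit = len(seq) - trim_end
    for j in range(trim_start, limit):
        bad = qual[j] < cutoff_char or (clip_ambiguous and seq[j] not in 'ACGT')
        if bad:
            if j - start > best_len:
                best_start, best_len = start, j - start
            start = j + 1
    if limit - start > best_len:
        best_start, best_len = start, limit - start
    return best_start, best_len

# e.g. longest_clean_run('ACGTNACG', 'IIIIIIII', '!') == (0, 4)
#      (the ambiguous 'N' splits the read; the longer left run wins)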
def main(args):
    title1, args = grace.get_option_value(args, "--title1", str, None)
    title2, args = grace.get_option_value(args, "--title2", str, None)
    grace.expect_no_further_options(args)

    if len(args) != 3:
        print >> sys.stderr, USAGE
        return 1

    working_dir1 = args[0]
    working_dir2 = args[1]
    cutoff = float(args[2])

    sequence_names = [name for name, sequence
                      in io.read_sequences(os.path.join(working_dir1, "reference.fa"))]

    if title1 is None:
        title1 = working_dir1
    if title2 is None:
        title2 = working_dir2

    n = 1
    while significance([("A", n)], [("T", n)], 1.0) > cutoff:
        n += 1

    print "%g\tsignificance cutoff" % cutoff
    print "%d\tdepth required to call substitution (greater if there are errors in the reads)" % n

    print "Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s" % (
        title1, title2, title1, title2,
    )

    for sequence_name in sequence_names:
        filename1 = os.path.join(working_dir1, grace.filesystem_friendly_name(sequence_name) + "-evidence.txt")
        filename2 = os.path.join(working_dir2, grace.filesystem_friendly_name(sequence_name) + "-evidence.txt")

        for (pos1, ins1, sub1, ref1, conins1, consub1), \
            (pos2, ins2, sub2, ref2, conins2, consub2) in itertools.izip(read_file(filename1), read_file(filename2)):
            assert pos1 == pos2 and ref1 == ref2

            if pos1 % 1000 == 0:
                grace.status("Testing %s %d" % (sequence_name, pos1))

            dec_ins1 = io.decode_evidence(ins1)
            dec_ins2 = io.decode_evidence(ins2)
            if dec_ins1 and dec_ins2:
                sig = significance(io.decode_evidence(ins1), io.decode_evidence(ins2), cutoff)
                if sig is not None and sig <= cutoff:
                    grace.status("")
                    print "%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s" % (
                        sequence_name, pos1, "insertion-before",
                        ins1, ins2, sig, conins1, conins2,
                    )

            dec_sub1 = io.decode_evidence(sub1)
            dec_sub2 = io.decode_evidence(sub2)
            if dec_sub1 and dec_sub2:
                sig = significance(dec_sub1, dec_sub2, cutoff)
                if sig is not None and sig <= cutoff:
                    if dec_sub1[0][0] == "-" or dec_sub2[0][0] == "-":
                        what = "deletion"
                    elif dec_sub1[0][0] != dec_sub2[0][0]:
                        what = "substitution"
                    else:
                        what = "different mix"
                    grace.status("")
                    print "%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s" % (
                        sequence_name, pos1, what,
                        ref1, sub1, sub2, sig, consub1, consub2,
                    )

    grace.status("")
    return 0
def nway_main(gbk_filename, use_indels, use_reference, give_evidence, give_consequences, require_all, require_bisect, full_output, format, working_dirs, split_a, split_b, f=sys.stdout): assert working_dirs, 'Need at least one working directory.' workspaces = [ working_directory.Working(dirname, must_exist=True) for dirname in working_dirs ] reference = workspaces[0].get_reference() #if not annotation_filename: # annotation_filename = reference.annotations_filename() #May still be None if use_reference: names = ['reference'] evidence_start = 1 else: names = [ ] evidence_start = 0 names.extend( norm_name(item) for item in working_dirs ) references = io.read_sequences(reference.reference_fasta_filename()) annotations = { } if gbk_filename: from Bio import SeqIO for record in SeqIO.parse(io.open_possibly_compressed_file(gbk_filename),'genbank'): sequence = record.seq.tostring() features = [ item for item in record.features if item.type != 'source' ] features.sort(key=lambda item: item.location.nofuzzy_start) annotations[sequence] = features iterator = reader(working_dirs, references, use_reference, annotations) if not use_indels: iterator = itertools.ifilter(has_no_indels, iterator) if require_all or require_bisect or format == 'counts': iterator = itertools.ifilter(fully_unambiguous, iterator) if require_bisect: iterator = itertools.ifilter(is_binary_partition, iterator) if not require_bisect: if full_output: iterator = itertools.ifilter(not_boring_insertion, iterator) else: iterator = itertools.ifilter(is_interesting, iterator) if split_a or split_b: assert len(names) == len(set(names)), 'Two samples with the same name' try: split_a = [ names.index(norm_name(item)) for item in split_a ] split_b = [ names.index(norm_name(item)) for item in split_b ] except ValueError: raise grace.Error('Sample to be split is not amongst samples given') iterator = itertools.ifilter(is_split(split_a, split_b), iterator) #if limit: # iterator = itertools.islice(iterator, limit) if format == 'table': line = 'Reference\tPosition\tChange type' line += '\t' + '\t'.join(names) if give_evidence: line += '\t' + '\t'.join(names[evidence_start:]) if give_consequences: line += '\t' + '\t'.join(names[evidence_start:]) if annotations: line += '\tAnnotations' print >> f, line for calls in iterator: line = '%s\t%d\t%s\t%s' % ( calls.ref_name, calls.ref_pos+1, change_type(calls), '\t'.join(item.consensus for item in calls.calls)) if give_evidence: line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:]) if give_consequences: line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:]) if annotations: line += '\t' + describe_features(calls.features) print >> f, line elif format == 'compact': for line in transpose_strings(names): print >> f, line print >> f for calls in iterator: if calls.is_insertion: footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name) else: footer = '%12d %s' % (calls.ref_pos+1, calls.ref_name) t = transpose_strings([ item.consensus for item in calls.calls ], '-', 1) top = t[0] + ' ' + footer if give_consequences: consequences = [ ] for call in calls.calls: if call.consequences: for item in call.consequences.split(', '): item = ' '.join(item.split()[:3]) if item not in consequences: consequences.append(item) if consequences: top += ' ' + ' / '.join(sorted(consequences)) top += ' ' + describe_features(calls.features) print >> f, top for line in t[1:]: print >> f, line elif format == 'nexus': buckets = [ [ ] for name in names ] for calls in iterator: for i, char 
in enumerate(partition_string(calls)): buckets[i].append(char) print >> f, '#NEXUS' print >> f, 'begin taxa;' print >> f, 'dimensions ntax=%d;' % len(names) print >> f, 'taxlabels' for name in names: print >> f, name print >> f, ';' print >> f, 'end;' print >> f, 'begin characters;' print >> f, 'dimensions nchar=%d;' % len(buckets[0]) print >> f, 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;' print >> f, 'matrix' for name, bucket in itertools.izip(names, buckets): print >> f, name, ''.join(bucket) print >> f, ';' print >> f, 'end;' elif format == 'counts': for line in transpose_strings(names): print >> f, line print >> f counts = { } for calls in iterator: count_str = partition_string(calls) if count_str not in counts: counts[count_str] = 1 else: counts[count_str] += 1 for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True): print >> f, '%s %d' % (transpose_strings(count_str)[0], counts[count_str]) else: raise grace.Error('Unknown output format: ' + format)
def recombination(args): grace.expect_no_further_options(args) if len(args) != 2: print >> sys.stderr, USAGE raise grace.Help_shown() working_dir, seq_name = args references = dict(io.read_sequences(os.path.join(working_dir, 'reference.fa'))) depth = { } prefixes = { } suffixes = { } for name in references: depth[name] = numpy.zeros(len(references[name]), 'int64') prefixes[name] = [ [] for base in references[name] ] suffixes[name] = [ [] for base in references[name] ] def register_divergence(hit): if not hit.query_forward: hit = hit.reversed() margin = 20 if hit.target_end - hit.target_start < 20: return False depth[hit.target_name][hit.target_start : hit.target_end] += 1 any = False if hit.query_end <= len(hit.query_seq)-margin: # and hit.target_end < len(hit.target_seq): suffixes[hit.target_name][hit.target_end-1].append( hit.query_seq[hit.query_end:] ) any = True if hit.query_start >= margin: # and hit.target_start > 0: prefixes[hit.target_name][hit.target_start].append( hit.query_seq[:hit.query_start] ) any = True return any n = 0 for (read_name, read_seq), hits in shrimp.iter_read_hits(working_dir): # Skip reads containing Ns if 'N' in read_seq: continue for line in hits: register_divergence(alignment_from_shrimp(line, references, read_name, read_seq)) n += 1 #if n > 100000: # break if n%10000 == 0: grace.status('Processing read %s' % grace.pretty_number(n)) grace.status('') def show_items(items): original_length = len(items) cut = 0 while len(items) > 80: cut += 1 items = [ item for item in items if item[0] >= cut ] for item in items: print item[1] if len(items) < original_length: print '(and %d more occurring %d times or less)' % (original_length-len(items), cut-1) def score(items): if not items: return 1.0 return float(sum( item[0] * item[0] for item in items )) / (sum( item[0] for item in items )**2) def summarize_prefixes(seqs, pad): seqs = sorted(seqs, key=lambda seq: seq[::-1]) cut = 100 while True: items = [ ] for (seq, iterator) in itertools.groupby(seqs, key = lambda x: x[-cut:]): ss = list(iterator) anylong = any( item != seq for item in ss ) n = len(ss) items.append( (n, ('%'+str(pad)+'s')%(('...' if anylong else '') + seq) + ' x %d' % n) ) if score(items) >= 1.0/20: break cut -= 1 show_items(items) def summarize_suffixes(seqs, pad): seqs = sorted(seqs) cut = 100 while True: items = [ ] for (seq, iterator) in itertools.groupby(seqs, key = lambda x: x[:cut]): ss = list(iterator) anylong = any( item != seq for item in ss ) n = len(ss) items.append( (n, ('%'+str(pad)+'s')%('%d x '%n) + seq + ('...' if anylong else '')) ) if score(items) >= 1.0/20: break cut -= 1 show_items(items) print 'Position Depth Changed prefixes Changed suffixes' print ' Count % of depth Count % of depth' for i in xrange(len(references[seq_name])): print '%8d %10d %9d %11s %9d %11s' % ( i+1, depth[seq_name][i], len(prefixes[seq_name][i]), '%.3f%%' % (len(prefixes[seq_name][i])*100.0/depth[seq_name][i]) if prefixes[seq_name][i] else '', len(suffixes[seq_name][i]), '%.3f%%' % (len(suffixes[seq_name][i])*100.0/depth[seq_name][i]) if suffixes[seq_name][i] else '') #summarize_suffixes(suffixes[name][i], references[name][i+1:], references[name], suffix_depth[name][i]) print print 'Details' print for i in xrange(len(references[seq_name])): print '%-80s*' % ('Base %d' % (i+1)) print pad_slice(references[seq_name], i-80,i+1+80) summarize_prefixes(prefixes[seq_name][i], 80) summarize_suffixes(suffixes[seq_name][i], 81) print
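# summarize_prefixes()/summarize_suffixes() above keep shortening the grouping
# key until score() >= 1/20.  score() is sum(n_i^2) / (sum(n_i))^2 over the group
# sizes n_i, i.e. the reciprocal of the "effective number of groups" (an inverse
# Simpson index): 20 equally sized groups give exactly 1/20.  A small standalone
# illustration of that quantity (the function name is illustrative only):

def effective_group_count(sizes):
    """Reciprocal of the concentration score used above."""
    total = float(sum(sizes))
    if not total:
        return 0.0
    return total * total / sum(n * n for n in sizes)

# 20 equal groups -> effective count 20.0, so the loop stops shortening the key:
assert abs(effective_group_count([5] * 20) - 20.0) < 1e-9
# One dominant group plus a little noise -> effective count close to 1:
assert effective_group_count([97, 1, 1, 1]) < 1.1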
def run(self): #mincov, args = grace.get_option_value(args, '--mincov', int, 1) #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) #minsize, args = grace.get_option_value(args, '--minsize', int, 200) #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core') #is_core = (what == 'core') # #grace.expect_no_further_options(args) # #if len(args) < 2: # print >> sys.stderr, HELP # raise grace.Help_shown() # #output_dir, working_dirs = args[0], args[1:] # ##assert not path.exists(path.join(output_dir, 'reference.fa')), \ #assert not path.exists(path.join(output_dir, 'parameters')), \ # 'Output directory not given' # #if not path.exists(output_dir): # os.mkdir(output_dir) assert self.what in ('core','unique'), 'Expected --what to be either "core" or "unique".' is_core = (self.what == 'core') workspace = self.get_workspace() for name, seq in io.read_sequences(working_directory.Working(self.working_dirs[0]).get_reference().reference_fasta_filename()): self.log.log(name + '\n') friendly_name = grace.filesystem_friendly_name(name) good = [ True ] * len(seq) for working_dir in self.working_dirs: if is_core: suffix = '-depth.userplot' else: suffix = '-ambiguous-depth.userplot' data = trivia.read_unstranded_userplot( os.path.join(working_dir, friendly_name+suffix) ) assert len(seq) == len(data) for i in xrange(len(seq)): if good[i]: if is_core: good[i] = data[i] >= self.mincov else: good[i] = data[i] < self.mincov #Close holes start = -self.maxdiff-1 n_holes = 0 for i in xrange(len(seq)): if good[i]: if 0 < i-start <= self.maxdiff: for j in xrange(start,i): good[j] = True n_holes += 1 start = i+1 self.log.log('Closed '+grace.pretty_number(n_holes)+' holes\n') f = open( workspace/('%s-%s.fa' % (friendly_name,self.what)), 'wb') io.write_fasta(f, name, ''.join([ (seq[i] if good[i] else 'N') for i in xrange(len(seq)) ]) ) f.close() f = open( workspace/('%s-%s_masked.fa' % (friendly_name,self.what)), 'wb') io.write_fasta(f, name, ''.join([ (seq[i] if good[i] else seq[i].lower()) for i in xrange(len(seq)) ]) ) f.close() f_good = open( workspace/('%s-%s_parts.fa' % (friendly_name,self.what)), 'wb') f_nongood = open( workspace/('%s-non%s_parts.fa' % (friendly_name,self.what)), 'wb') start = 0 n_good = [0] n_good_bases = [0] def emit(i): if i-start < self.minsize: return if good[start]: n_good[0] += 1 n_good_bases[0] += i-start io.write_fasta( f_good if good[start] else f_nongood, '%s:%d..%d' % (name, start+1,i), seq[start:i] ) for i in xrange(1,len(seq)): if good[i] != good[start]: emit(i) start = i emit(len(seq)) f_nongood.close() f_good.close() self.log.log(grace.pretty_number(sum(good))+' bases are '+self.what+', of '+grace.pretty_number(len(seq))+' in reference sequence\n') self.log.log(grace.pretty_number(n_good[0])+' parts at least '+grace.pretty_number(self.minsize)+' bases long with '+grace.pretty_number(n_good_bases[0])+' total bases\n') self.log.log('\n')
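# The run() above "closes holes": short stretches (at most maxdiff bases) that
# fail the coverage test are flipped back to good when they follow a good base,
# so small dips in depth do not fragment the core/unique regions.  A minimal
# standalone sketch of that rule (the name close_holes is illustrative only):

def close_holes(good, maxdiff):
    """Return a copy of the boolean list with runs of False of length
    <= maxdiff filled in, when they are preceded by a True."""
    good = list(good)
    start = -maxdiff - 1          # sentinel: just past the most recent good base
    for i in range(len(good)):
        if good[i]:
            if 0 < i - start <= maxdiff:
                for j in range(start, i):
                    good[j] = True
            start = i + 1
    return good

# e.g. close_holes([True, False, False, True], maxdiff=2)
#      -> [True, True, True, True]
# but  close_holes([True, False, False, True], maxdiff=1) leaves the gap.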
def run(self): #assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given' # Reference genome #chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths() chromosomes = collections.OrderedDict(io.read_sequences(self.reference)) def get_interpeak_seq(peaks): start = min(item.transcription_stop for item in peaks) end = max(item.transcription_stop for item in peaks) if end-start > self.max_seq: return '' if peaks[0].strand >= 0: return chromosomes[peaks[0].seqid][start:end] else: return bio.reverse_complement(chromosomes[peaks[0].seqid][start:end]) def get_prepeak_seq(gene,peaks): if gene.strand >= 0: start = gene.utr_pos end = min(item.transcription_stop for item in peaks) if end-start > self.max_seq: return '' return chromosomes[gene.seqid][start:end] else: start = max(item.transcription_stop for item in peaks) end = gene.utr_pos if end-start > self.max_seq: return '' return bio.reverse_complement(chromosomes[gene.seqid][start:end]) # Normalization files if self.norm_file: norm_file = self.norm_file else: nesoni.Norm_from_counts(self.prefix+'-norm', self.counts).run() norm_file = self.prefix+'-norm.csv' norms = io.read_grouped_table(norm_file, [('All',str)])['All'] pair_norm_names = [ ] pair_norms = [ ] for i in xrange(len(norms)): pair_norm_names.append(norms.keys()[i]+'-peak1') pair_norms.append(norms.values()[i]) for i in xrange(len(norms)): pair_norm_names.append(norms.keys()[i]+'-peak2') pair_norms.append(norms.values()[i]) io.write_grouped_csv( self.prefix+'-pairs-norm.csv', [('All',io.named_list_type(pair_norm_names)(pair_norms))], comments=['#Normalization'], ) # Read data annotations = list(annotation.read_annotations(self.parents)) if self.utrs: utrs = list(annotation.read_annotations(self.utrs)) else: utrs = [ ] children = list(annotation.read_annotations(self.children)) count_table = io.read_grouped_table(self.counts, [ ('Count',int), ('Tail_count',int), ('Tail',_float_or_none), ('Proportion',_float_or_none), ('Annotation',str) ]) counts = count_table['Count'] tail_counts = count_table['Tail_count'] proportions = count_table['Proportion'] tails = count_table['Tail'] samples = counts.value_type().keys() sample_tags = { } for line in count_table.comments: if line.startswith('#sampleTags='): parts = line[len('#sampleTags='):].split(',') assert parts[0] not in sample_tags sample_tags[parts[0]] = parts for item in children: item.weight = sum( counts[item.get_id()][name] * float(norms[name]['Normalizing.multiplier']) for name in samples ) parents = [ ] id_to_parent = { } for item in annotations: if item.type != self.parent_type: continue assert item.get_id() not in id_to_parent, 'Duplicate id in parent file: '+item.get_id() parents.append(item) id_to_parent[item.get_id()] = item item.children = [ ] #item.cds = [ ] # Default utr if item.strand >= 0: item.utr_pos = item.end else: item.utr_pos = item.start if 'three_prime_UTR_start' in item.attr: if item.strand >= 0: item.utr_pos = int(item.attr['three_prime_UTR_start'])-1 else: item.utr_pos = int(item.attr['three_prime_UTR_start']) for item in utrs: assert item.attr['Parent'] in id_to_parent, 'Unknown gene '+item.attr['Parent'] id_to_parent[item.attr['Parent']].utr_pos = (item.start if item.strand >= 0 else item.end) for item in children: item.transcription_stop = item.end if item.strand >= 0 else item.start #End of transcription, 0-based, ie between-positions based if 'Parent' in item.attr: for item_parent in item.attr['Parent'].split(','): parent = id_to_parent[item_parent] 
parent.children.append(item) for item in parents: item.children.sort(key=_annotation_sorter) relevant = list(item.children) if self.utr_only: #if item.strand <= 0: # relative_utr_start = item.end - int(item.attr['three_prime_UTR_start']) #else: # relative_utr_start = int(item.attr['three_prime_UTR_start'])-1 - item.start # #def relative_start(peak): # return item.end-peak.end if item.strand < 0 else peak.start-item.start #relevant = [ peak for peak in relevant if relative_start(peak) >= relative_utr_start ] relevant = [ peak for peak in relevant if (peak.end >= item.utr_pos if item.strand >= 0 else peak.start <= item.utr_pos) ] if self.top: relevant.sort(key=lambda peak:peak.weight, reverse=True) relevant = relevant[:self.top] relevant.sort(key=_annotation_sorter) item.relevant_children = relevant # JSON output j_data = { } j_genes = j_data['genes'] = { } j_genes['__comment__'] = 'start is 0-based' j_genes['name'] = [ ] j_genes['chromosome'] = [ ] j_genes['strand'] = [ ] j_genes['start'] = [ ] j_genes['utr'] = [ ] j_genes['end'] = [ ] j_genes['gene'] = [ ] j_genes['product'] = [ ] j_genes['peaks'] = [ ] j_genes['relevant_peaks'] = [ ] #j_genes['cds'] = [ ] #j_genes['cds_start'] = [ ] #j_genes['cds_end'] = [ ] for item in parents: j_genes['name'].append( item.get_id() ) j_genes['chromosome'].append( item.seqid ) j_genes['strand'].append( item.strand ) j_genes['start'].append( item.start ) j_genes['utr'].append( item.utr_pos ) j_genes['end'].append( item.end ) j_genes['gene'].append( item.attr.get('Name',item.attr.get('gene','')) ) j_genes['product'].append( item.attr.get('Product',item.attr.get('product','')) ) j_genes['peaks'].append( [ item2.get_id() for item2 in item.children ] ) j_genes['relevant_peaks'].append( [ item2.get_id() for item2 in item.relevant_children ] ) #j_genes['cds'].append( item.cds ) #j_genes['cds_start'].append( item.cds_start ) #j_genes['cds_end'].append( item.cds_end ) j_peaks = j_data['peaks'] = { } j_peaks['__comment__'] = 'start is 0-based' j_peaks['name'] = [ ] j_peaks['chromosome'] = [ ] j_peaks['strand'] = [ ] j_peaks['start'] = [ ] j_peaks['end'] = [ ] j_peaks['parents'] = [ ] j_peaks['counts'] = [ ] j_peaks['tail_lengths'] = [ ] j_peaks['proportion_tailed'] = [ ] for item in children: j_peaks['name'].append( item.get_id() ) j_peaks['chromosome'].append( item.seqid ) j_peaks['strand'].append( item.strand ) j_peaks['start'].append( item.start ) j_peaks['end'].append( item.end ) j_peaks['parents'].append( item.attr['Parent'].split(',') if 'Parent' in item.attr else [ ]) j_peaks['counts'].append( counts[item.get_id()].values() ) j_peaks['tail_lengths'].append( count_table['Tail'][item.get_id()].values() ) j_peaks['proportion_tailed'].append( count_table['Proportion'][item.get_id()].values() ) j_samples = j_data['samples'] = { } j_samples['name'] = [ ] j_samples['tags'] = [ ] j_samples['normalizing_multiplier'] = [ ] for name in samples: j_samples['name'].append(name) j_samples['tags'].append(sample_tags[name]) j_samples['normalizing_multiplier'].append(float(norms[name]['Normalizing.multiplier'])) j_chromosomes = j_data['chromosomes'] = { } j_chromosomes['name'] = [ ] j_chromosomes['length'] = [ ] for name, seq in chromosomes.iteritems(): j_chromosomes['name'].append(name) j_chromosomes['length'].append(len(seq)) with open(self.prefix + '.json','wb') as f: json.dump(j_data, f) # Output paired peak file output_comments = [ '#Counts' ] output_samples = [ ] for item in samples: output_samples.append(item+'-peak1') output_comments.append('#sampleTags=' + 
','.join([item+'-peak1','peak1']+sample_tags.get(item,[]))) for item in samples: output_samples.append(item+'-peak2') output_comments.append('#sampleTags=' + ','.join([item+'-peak2','peak2']+sample_tags.get(item,[]))) output_names = [ ] output_counts = [ ] output_tail_counts = [ ] output_proportions = [ ] output_tails = [ ] output_annotation_fields = [ 'gene', 'product', 'mean_tail_1', 'mean_tail_2', 'chromosome', 'strand', 'transcription_stops' ] #, 'interpeak_seq', ] output_annotations = [ ] for item in parents: peaks = item.relevant_children for i in xrange(len(peaks)-1): for j in xrange(i+1, len(peaks)): id_i = peaks[i].get_id() id_j = peaks[j].get_id() id_pair = item.get_id() + '-'+id_i+'-'+id_j output_names.append(id_pair) row = [ ] row.extend(counts[id_i].values()) row.extend(counts[id_j].values()) output_counts.append(filter(_text,row)) row = [ ] row.extend(tail_counts[id_i].values()) row.extend(tail_counts[id_j].values()) output_tail_counts.append(filter(_text,row)) row = [ ] row.extend(proportions[id_i].values()) row.extend(proportions[id_j].values()) output_proportions.append(filter(_text,row)) row = [ ] row.extend(tails[id_i].values()) row.extend(tails[id_j].values()) output_tails.append(filter(_text,row)) output_annotations.append([ item.attr.get('Name',item.attr.get('gene','')), item.attr.get('Product',item.attr.get('product','')), count_table['Annotation'][id_i]['mean-tail'], count_table['Annotation'][id_j]['mean-tail'], item.seqid, str(item.strand), '%d, %d' % (peaks[i].transcription_stop,peaks[j].transcription_stop), #get_interpeak_seq([peaks[i],peaks[j]]), ]) #output_count_table = io.named_matrix_type(output_names,output_samples)(output_counts) io.write_grouped_csv( self.prefix + '-pairs.csv', [ ('Count',io.named_matrix_type(output_names,output_samples)(output_counts)), ('Tail_count',io.named_matrix_type(output_names,output_samples)(output_tail_counts)), ('Proportion',io.named_matrix_type(output_names,output_samples)(output_proportions)), ('Tail',io.named_matrix_type(output_names,output_samples)(output_tails)), ('Annotation',io.named_matrix_type(output_names,output_annotation_fields)(output_annotations)), ], comments=output_comments, ) # # Chi Sq tests # # #for id in relation: # # peaks = relation[id] # # if len(peaks) < 2: continue # # mats = [ ] # genes = [ ] # products = [ ] # mean_tails = [ ] # prop_tails = [ ] # # peak_names = [ ] # chromosome_names = [ ] # strands = [ ] # transcription_stops = [ ] # interpeak_seqs = [ ] # prepeak_seqs = [ ] # # for parent in parents: # id = parent.get_id() # peaks = parent.relevant_children # if len(peaks) < 2: continue # # matrix = [ ] # for item in peaks: # matrix.append(counts[item.get_id()].values()) # # mats.append( # runr.R_literal(id) + ' = ' + # runr.R_literal(matrix) # ) # # genes.append(parent.attr.get('Name',parent.attr.get('gene',''))) # products.append(parent.attr.get('Product',parent.attr.get('product',''))) # # def format_mean(s): # if s == 'NA': return 'NA' # return '%.1f' % float(s) # mean_tails.append(', '.join( format_mean(count_table['Annotation'][item.get_id()]['mean-tail']) for item in peaks )) # # def format_prop(s): # if s == 'NA': return 'NA' # return '%.2f' % float(s) # prop_tails.append(', '.join( format_prop(count_table['Annotation'][item.get_id()]['proportion-with-tail']) for item in peaks )) # # peak_names.append(', '.join(item.get_id() for item in peaks)) # chromosome_names.append(parent.seqid) # strands.append(parent.strand) # transcription_stops.append(', '.join(str(item.transcription_stop) for item 
in peaks)) # interpeak_seqs.append(get_interpeak_seq(peaks)) # prepeak_seqs.append(get_prepeak_seq(parent,peaks)) # # #if len(mats) >= 10: break # # text = 'cat("Loading data into R+\n")\n' # text += 'data <- list(\n' + ',\n'.join(mats) + ')\n' # text += CHISQ # # runr.run_script(text, # OUTPUT_FILENAME=self.prefix+'.csv', # GENES = genes, # PRODUCTS = products, # MEAN_TAILS = mean_tails, # PROP_TAILS = prop_tails, # PEAK_NAMES = peak_names, # CHROMOSOME_NAMES = chromosome_names, # STRANDS = strands, # TRANSCRIPTION_STOPS = transcription_stops, # INTERPEAK_SEQS = interpeak_seqs, # PREPEAK_SEQS = prepeak_seqs, # ) #
def run(self): """ <sequence> <poly-A> <adaptor> <anything> """ clip_quality = chr(33+self.clip_quality) ignore_quality = chr(33+self.ignore_quality) with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \ io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file: print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched' n = 0 n_discarded = 0 n_clipped = 0 total_before = 0 total_clipped = 0 for filename in self.filenames: for name, seq, qual in io.read_sequences(filename, qualities='required'): # "Good quality" sequence ends at the first low quality base #good_quality_end = 0 #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality: # good_quality_end += 1 goodness_score = 0 best_goodness_score = 0 good_quality_end = 0 i = 0 while True: if goodness_score > best_goodness_score: best_goodness_score = goodness_score good_quality_end = i if i >= len(seq): break if qual[i] >= clip_quality: goodness_score += 1 else: goodness_score -= 9 i += 1 best_score = 0 best_a_start = good_quality_end best_a_end = good_quality_end best_adaptor_bases = 0 best_aonly_score = 0 best_aonly_start = good_quality_end best_aonly_end = good_quality_end # Consider each possible start position for the poly(A) for a_start in xrange(good_quality_end): if a_start and seq[a_start-1] == 'A': continue # Consider each possible end position for the poly(A) a_end = a_start aonly_score = 0 while True: if aonly_score > best_aonly_score: best_aonly_score = aonly_score best_aonly_start = a_start best_aonly_end = a_end # The poly(A) should be followed by adaptor, # at least until the end of good quality sequence. # However if there is evidence of the adaptor beyond # the end of good quality, we still want to know that, # and count it towards the number of adaptor bases present. score = aonly_score adaptor_bases = 0 i = a_end while True: if (score > best_score and (i >= good_quality_end or i >= a_end+len(self.adaptor))): best_score = score best_a_start = a_start best_a_end = a_end best_adaptor_bases = adaptor_bases if i >= a_end+len(self.adaptor) or i >= len(seq): break if qual[i] >= ignore_quality: if seq[i] == self.adaptor[i-a_end]: score += 1 adaptor_bases += 1 else: score -= 4 i += 1 #if a_end >= len(seq): break # poly(A) tail only within good quality region. if a_end >= good_quality_end: break if qual[a_end] >= ignore_quality: if seq[a_end] == 'A': aonly_score += 1 else: aonly_score -= 4 if aonly_score <= 0: break a_end += 1 a_start = best_a_start a_end = best_a_end adaptor_bases = best_adaptor_bases aonly_start = best_aonly_start aonly_end = best_aonly_end if self.debug: # and a_end == a_start and a_end < len(seq)-10: print name print ''.join( 'I' if item<ignore_quality else ('C' if item<clip_quality else ' ') for item in qual ) print '-' * good_quality_end print seq print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d"%(adaptor_bases,best_score) #print ' '*aonly_start + 'A'*(aonly_end-aonly_start) + "." 
print sys.stdout.flush() n += 1 total_before += len(seq) # 0 - sequence name # 1 - sequence length # 2 - poly(A) start # 3 - poly(A) end # (4 - best run of As start, for debugging the need to detect adaptor seq) # (5 - best run of As end) # 6 - number of adaptor bases matched print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end, adaptor_bases) if a_start > self.length: if a_start < len(seq): n_clipped += 1 total_clipped += a_start print >> out_file, '@'+name print >> out_file, seq[:a_start] print >> out_file, '+' print >> out_file, qual[:a_start] else: n_discarded += 1 if n%10000 == 0: grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n)) # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)') grace.status('') self.log.datum(self.sample,'reads',n) if n: self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n) self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded) self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped) if n_clipped: self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
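# Rather than cutting at the first low-quality base, the clipper above scores
# each prefix of the read (+1 for a base at or above the clip quality, -9 for a
# base below it) and clips at the prefix with the best score, so an isolated bad
# base is tolerated but a run of them ends the read.  A standalone sketch of
# that scan (the function name and keyword arguments are illustrative only):

def good_quality_prefix_length(qual, clip_quality_char,
                               good_bonus=1, bad_penalty=9):
    """Length of the prefix maximising (#good bases - 9 * #bad bases)."""
    best_score = 0
    best_end = 0
    score = 0
    for i, q in enumerate(qual):
        score += good_bonus if q >= clip_quality_char else -bad_penalty
        if score > best_score:
            best_score = score
            best_end = i + 1
    return best_end

# With quality cutoff 'I': one early dip is tolerated ...
assert good_quality_prefix_length('IIIIAIIIIIIIIIII', 'I') == 16
# ... but the read is clipped before a long bad tail:
assert good_quality_prefix_length('IIIIIIAAAAAAAAAA', 'I') == 6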
# python2.6 modify_sequence.py data/velvet_test_reference.fa >data/velvet_test_reference_modified.fa

import sys, random

from nesoni import io

for name, seq in io.read_sequences(sys.argv[1]):
    j = 0
    for i in xrange(0, len(seq) - 100, 100):
        original = seq[i]
        if j % 3 == 0:
            # Substitution: replace the base at i with a different random base.
            while True:
                new = random.choice('ACGT')
                if new != original:
                    break
            seq = seq[:i] + new + seq[i + 1:]
        elif j % 3 == 1:
            # Deletion of 1..9 bases starting at i.
            n = (j // 3) % 9 + 1
            seq = seq[:i] + seq[i + n:]
        else:
            # Insertion of 1..9 random bases before position i.
            n = (j // 3) % 9 + 1
            seq = seq[:i] + ''.join(random.choice('ACGT') for k in xrange(n)) + seq[i:]
        j += 1
    io.write_fasta(sys.stdout, name, seq)
def pastiche(args): if len(args) < 4: print USAGE return 1 mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool, False) min_leftover, args = grace.get_option_value(args, '--min-leftover', int, 20) output_dir, args = args[0], args[1:] #, ref_filename, contig_filenames = args[0], args[1], args[2:] ref_filenames = [] contig_filenames = [] grace.execute(args, {'contigs': lambda args: contig_filenames.extend(args)}, lambda args: ref_filenames.extend(args)) assert ref_filenames, 'No reference sequences given' assert contig_filenames, 'No contig sequences given' contigs = dict([(name.split()[0], seq) for filename in contig_filenames for name, seq in io.read_sequences(filename)]) dir_contigs = {} for name in contigs: dir_contigs[name + '+'] = contigs[name] dir_contigs[name + '-'] = bio.reverse_complement(contigs[name]) dir_contigs_used = {} for name in dir_contigs: dir_contigs_used[name] = [False] * len(dir_contigs[name]) workspace = io.Workspace(output_dir) temp_prefix = workspace._object_filename('temp-pastiche') out_f = workspace.open('pastiche.fa', 'wb') for ref_filename in ref_filenames: for ref_name, ref_seq in io.read_sequences(ref_filename): ref_name = ref_name.split()[0] grace.status(ref_name) f = open(temp_prefix + '.fa', 'wb') io.write_fasta(f, 'ref', ref_seq) f.close() scores = [-1] * (len(ref_seq) * 2) strings = ['N', ''] * (len(ref_seq)) contexts = [None for i in xrange(len(ref_seq) * 2)] #MAXSCORE = len(ref_seq)+1 #for i in xrange(len(ref_seq)): # if ref_seq[i].upper() != 'N': # strings[i*2] = ref_seq[i] # scores[i*2] = MAXSCORE #for i in xrange(len(ref_seq)-1): # if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N': # scores[i*2+1] = MAXSCORE if mask_only: for i in xrange(len(ref_seq)): strings[i * 2] = ref_seq[i].lower() def put(position, dir_contig_name, start, end, score): if scores[position] < score: scores[position] = score strings[position] = dir_contigs[dir_contig_name][start:end] contexts[position] = (dir_contig_name, start, end, score) for contig_filename in contig_filenames: execute([ 'nucmer', '--prefix', temp_prefix, #'--maxmatch', #Very slow '--nosimplify', '--minmatch', '9', '--mincluster', '50', #'--maxgap', '1000', #'--breaklen', '1000', # Increasing this reduces Ns, but is slow #'--diagfactor', '1.0', temp_prefix + '.fa', contig_filename ]) for contig_name, contig_seq in io.read_sequences( contig_filename): contig_name = contig_name.split()[0] grace.status(ref_name + ' vs ' + contig_name) p = run([ 'show-aligns', temp_prefix + '.delta', 'ref', contig_name ], stderr=subprocess.PIPE) alignments = [] while True: line = p.stdout.readline() if not line: break if not line.startswith('-- BEGIN'): continue parts = line.split() ref_start = int(parts[5]) ref_end = int(parts[7]) query_start = int(parts[10]) query_end = int(parts[12]) #assert ref_start < ref_end #ref_start -= 1 #Zero based coordinates al_ref = [] al_query = [] while True: block = [] end = False while True: line = p.stdout.readline() if line.startswith('-- END'): end = True break if line == '\n': if block: break else: continue block.append(line) if end: break al_ref.append(block[0].split()[1]) al_query.append(block[1].split()[1]) al_ref = ''.join(al_ref) al_query = ''.join(al_query) if ref_start > ref_end: al_ref = bio.reverse_complement(al_ref) al_query = bio.reverse_complement(al_query) ref_start, ref_end = ref_end, ref_start query_start, query_end = query_end, query_start if query_start > query_end: dir_contig_name = contig_name + '-' query_start = len(contig_seq) + 1 - query_start 
query_end = len(contig_seq) + 1 - query_end else: dir_contig_name = contig_name + '+' ref_start -= 1 #Zero based coordinates query_start -= 1 #print al_ref #print al_query #Pretty dumb scoring scheme al_score = 0 for i in xrange(len(al_ref)): if al_ref[i] == al_query[i]: al_score += 1 #else: # al_score -= 1 #Pastiche alignment over reference ref_pos = ref_start query_pos = query_start al_pos = 0 while al_pos < len(al_ref): assert al_ref[al_pos] != '.' if al_query[al_pos] == '.': put(ref_pos * 2, dir_contig_name, query_pos, query_pos, al_score) else: assert al_query[al_pos].lower() == dir_contigs[ dir_contig_name][query_pos].lower() put(ref_pos * 2, dir_contig_name, query_pos, query_pos + 1, al_score) query_pos += 1 al_pos += 1 al_pos_end = al_pos query_pos_end = query_pos while al_pos_end < len( al_ref) and al_ref[al_pos_end] == '.': al_pos_end += 1 query_pos_end += 1 #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score) assert al_query[al_pos:al_pos_end].lower( ) == dir_contigs[dir_contig_name][ query_pos:query_pos_end].lower() put(ref_pos * 2 + 1, dir_contig_name, query_pos, query_pos_end, al_score) al_pos = al_pos_end query_pos = query_pos_end ref_pos += 1 p.wait() grace.status(ref_name) result = ''.join(strings) io.write_fasta(out_f, ref_name, result) for context in contexts: if context is None: continue name, start, end, score = context for i in xrange(start, end): dir_contigs_used[name][i] = True #Interpolation #result = [ ] #i = 0 #while i < len(ref_seq): # if strings[i*2].upper() != 'N': # result.append(strings[i*2]) # result.append(strings[i*2+1]) # i += 1 # continue # # j = i # while strings[j*2].upper() == 'N': # j += 1 # # grace.status('') # print >> sys.stderr, 'interpolating', i+1,'..',j # # window = 20 #!!!!!!!!!!! # left_contexts = collections.defaultdict(lambda:0) # for i1 in xrange(max(0,i-window),i): # for context_name, context_start, context_end, context_score in contexts[i1*2]: # key = (context_name, context_end + i - i1) # left_contexts[key] = max(left_contexts[key],context_score) # # right_contexts = collections.defaultdict(lambda:0) # for j1 in xrange(j,min(j+window,len(ref_seq))): # for context_name, context_start, context_end, context_score in contexts[j1*2]: # key = (context_name, context_start + j - j1) # right_contexts[key] = max(left_contexts[key],context_score) # # #print >> sys.stderr, left_contexts # #print >> sys.stderr, right_contexts # # options = [ ] # # for (left_name, left_pos), left_score in left_contexts.items(): # for (right_name, right_pos), right_score in right_contexts.items(): # if left_name != right_name: continue # if right_pos < left_pos: continue # # if right_pos-left_pos > (j-i) * 4.0 + 10: continue #!!!!!!!!!!!!!!!!!!!!!!1 # if right_pos-left_pos < (j-i) * 0.25 - 10: continue # # score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i) # score *= left_score + right_score # #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score # options.append( (score, left_name, left_pos, right_pos) ) # # if options: # best = max(options, key=lambda option: option[0]) # print >> sys.stderr, '->', best # result.append( dir_contigs[best[1]][best[2]:best[3]].lower() ) # else: # print >> sys.stderr, '-> no good interpolation' # result.append( ref_seq[i:j] ) # # i = j # #result = ''.join(result) #io.write_fasta(sys.stdout, ref_name, result) #print >> sys.stderr, len(result), result.count('N') #for pos, size in N_runs: # out_size = len(''.join( strings[pos*2:pos*2+2] )) # print >> sys.stderr, pos, size, '->', out_size out_f.close() 
grace.status('') #for name, seq in io.read_sequences(ref_filename): # result = pastiche(seq, contigs_filename) # io.write_fasta(sys.stdout, name, result) leftover_f = workspace.open('leftovers.fa', 'wb') for name in sorted(contigs): used = [ (a or b) for a, b in zip(dir_contigs_used[name + '+'], dir_contigs_used[name + '-'][::-1]) ] i = 0 while i < len(used): j = i while j < len(used) and not used[j]: j += 1 if j - i > min_leftover: if i == 0 and j == len(used): out_name = name else: out_name = name + ':%d..%d' % (i + 1, j) io.write_fasta(leftover_f, out_name, contigs[name][i:j]) i = j + 1 leftover_f.close() for suffix in ['.fa', '.delta']: os.unlink(temp_prefix + suffix)
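# After the pastiche is written, the code above collects "leftovers": stretches
# of each contig that were never used in either orientation and are longer than
# --min-leftover.  A minimal standalone sketch of that scan (the name is
# illustrative only; the real code first merges the forward and
# reverse-complement usage masks):

def unused_stretches(used, min_leftover):
    """Yield (start, end) of runs of False in `used` longer than min_leftover."""
    i = 0
    while i < len(used):
        j = i
        while j < len(used) and not used[j]:
            j += 1
        if j - i > min_leftover:
            yield i, j
        i = j + 1

# e.g. list(unused_stretches([False]*30 + [True]*5 + [False]*10, 20)) == [(0, 30)]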
def run(self): assert self.reads or self.pairs or self.interleaved, 'No reads given' io.check_name_uniqueness(self.reads, self.pairs, self.interleaved) working = self.get_workspace() working.setup_reference(self.references, bowtie=True) working.update_param(snp_cost=2.0) reference = working.get_reference() log_file = open(self.log_filename(),'wb') with workspace.tempspace(dir=working.working_dir) as temp: n = [ 0 ] def tempname(): n[0] += 1 return temp/('%d.fq'%n[0]) def convert(filename): info = io.get_file_info(filename) ok = selection.matches('type-fastq:[compression-none/compression-gzip/compression-bzip2]', info) if ok: return filename result_name = tempname() with open(result_name,'wb') as f: for name, seq, qual in io.read_sequences(filename, qualities='required'): io.write_fastq(f, name, seq, qual) return result_name ones = [ ] twos = [ ] singles = [ ] for pair in self.pairs: assert len(pair) == 2, 'Need two files in each "pair:" section.' ones.append(convert(pair[0])) twos.append(convert(pair[1])) for item in self.interleaved: left_name = tempname() right_name = tempname() ones.append(left_name) twos.append(right_name) with open(left_name,'wb') as left, \ open(right_name,'wb') as right: reader = io.read_sequences(item, qualities='required') while True: try: name, seq, qual = reader.next() except StopIteration: break io.write_fastq(left, name,seq,qual) try: name, seq, qual = reader.next() except StopIteration: raise grace.Error('Interleaved file contains odd number of sequences') io.write_fastq(right, name,seq,qual) for item in self.reads: singles.append(convert(item)) cores = min(self.cores, legion.coordinator().get_cores()) command = ( [ 'bowtie2', '--threads', str(cores), '--rg-id', '1', '--rg', 'SM:'+working.name, ] + self.bowtie_options + [ '-x', reference.get_bowtie_index_prefix() ] ) commands = [ ] if ones: commands.append(command + [ '-1', ','.join(ones), '-2', ','.join(twos) ]) if singles: commands.append(command + [ '-U', ','.join(singles) ]) temp_bam_name = temp/'temp.bam' with io.pipe_to( ['samtools', 'view', '-S', '-b', '-'], stdout=open(temp_bam_name,'wb'), stderr=log_file ) as f: header_sent = False for command in commands: self.log.log('Running:\n' + ' '.join(command) + '\n') with io.pipe_from( command, stderr=log_file, cores=cores ) as f_out: for line in f_out: if not header_sent or not line.startswith('@'): f.write(line) header_sent = True #io.execute([ # 'samtools', 'sort', '-n', temp_bam_name, working/'alignments' # ]) sam.sort_bam(temp_bam_name, working/'alignments', by_name=True, cores=self.cores) log_file.close()
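# Interleaved input above is split back into two synchronised files by reading
# records two at a time and complaining if the count is odd.  A standalone
# sketch of that pairing step (the generator name is illustrative; nesoni's
# io.read_sequences yields (name, seq, qual) tuples when qualities are requested):

def iter_read_pairs(records):
    """Yield ((name1, seq1, qual1), (name2, seq2, qual2)) from an interleaved
    stream, raising ValueError if a mate is missing."""
    records = iter(records)
    while True:
        try:
            first = next(records)
        except StopIteration:
            return
        try:
            second = next(records)
        except StopIteration:
            raise ValueError('Interleaved file contains an odd number of sequences')
        yield first, second

# Usage sketch:
#   for (name1, seq1, qual1), (name2, seq2, qual2) in iter_read_pairs(reader):
#       io.write_fastq(left, name1, seq1, qual1)
#       io.write_fastq(right, name2, seq2, qual2)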
def main(args):
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()

    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')

    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000)

    input_reference_filenames = []
    reads_filenames = []

    shrimp_options = ['-h', threshold]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1]) / 100.0
    else:
        threshold = int(threshold)

    output_dir = []  #As list so can write to from function. Gah.

    def front_command(args):
        grace.expect_no_further_options(args)

        if len(args) < 1:
            return

        output_dir.append(args[0])
        input_reference_filenames.extend(
            [os.path.abspath(filename) for filename in args[1:]])

    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend([[os.path.abspath(filename)] for filename in args])

    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append(
            [os.path.abspath(filename) for filename in args])

    def shrimp_options_command(args):
        shrimp_options.extend(args)

    grace.execute(
        args, {
            'reads': reads_command,
            '--reads': reads_command,
            'pairs': pairs_command,
            'shrimp-options': shrimp_options_command,
            '--shrimp-options': shrimp_options_command,
        }, front_command)

    if not output_dir:
        print >> sys.stderr, USAGE % n_cpus
        return 1

    output_dir = output_dir[0]

    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'

    for filename in itertools.chain(input_reference_filenames, *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'

    reference_filename = os.path.join(output_dir, 'reference.fa')
    reference_file = open(reference_filename, 'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            #Don't retain any comment
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)

            total_reference_sequences += 1
            total_reference_bases += len(sequence)
    reference_file.close()

    print '%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases),
        's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences),
        's' if total_reference_sequences != 1 else '')

    assert total_reference_bases, 'Reference sequence file is empty'

    config = {
        'references': input_reference_filenames,
        'reads': reads_filenames,
        'stride': stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()

    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')

    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')

    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)

    #warn_low_threshold = True

    try: #Cleanup temporary files
        N = [0]
        def do_shrimp(read_set):
            my_number = N[0]
            N[0] += 1

            tempname = os.path.join(output_dir, 'temp%d-%d.fa' % (os.getpid(), my_number))
            tempname_out = os.path.join(output_dir, 'temp%d-%d.txt' % (os.getpid(), my_number))

            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)

            f = open(tempname, 'wb')
            for read_name, read_seq in read_set:
                print >> f, '>' + read_name
                print >> f, read_seq
            f.close()

            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      tempname + ' ' + reference_filename + ' >' + tempname_out
            if not verbose:
                command += ' 2>/dev/null'

            #f = os.popen(command, 'r')
            child_pid = os.spawnl(os.P_NOWAIT, '/bin/sh', '/bin/sh', '-c', command)
            #print 'SHRiMP %d running' % my_number

            def finalize():
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'

                hits = {}  # read_name -> [ hit line ]

                f = open(tempname_out, 'rb')
                for line in f:
                    if line.startswith('>'):
                        read_name = line.split(None, 1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = []
                        hits[read_name].append(line)
                f.close()

                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        print >> unmapped_file, '>' + read_name
                        print >> unmapped_file, read_seq

                output_file.flush()
                unmapped_file.flush()

                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)

                #print 'SHRiMP %d finished' % my_number

            return finalize

        shrimps = []

        reader = iter_reads(config)
        read_count = 0

        while True:
            read_set = []
            read_set_bases = 0

            #Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs

            for read_name, read_seq in reader:
                read_name = read_name.split()[0]
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)

                #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match
                #    sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n')
                #    warn_low_threshold = False

                read_count += 1
                if read_set_bases >= batch_size:
                    break

            if not read_set:
                break

            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append(do_shrimp(read_set))

            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))

        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps))
            shrimps.pop(0)()

        grace.status('')

        output_file.close()
        dirty_filenames.remove(output_filename)

        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)

        return 0

    finally:
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
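# Concurrency pattern used above, for reference: do_shrimp() spawns one rmapper
# process per batch of reads and returns a finalize() closure; at most
# max_shrimps (--cpus) closures are kept pending, and the oldest is finalized
# (waited on and its hits merged into the output) before a new batch is launched.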
def main(args):
    mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
    is_core = (what == 'core')

    grace.expect_no_further_options(args)

    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()

    output_dir, working_dirs = args[0], args[1:]

    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'

    if not path.exists(output_dir):
        os.mkdir(output_dir)

    for name, seq in io.read_sequences(path.join(working_dirs[0], 'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)

        good = [ True ] * len(seq)

        for working_dir in working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix)
            )
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= mincov
                    else:
                        good[i] = data[i] < mincov

        #Close holes
        start = -maxdiff-1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i-start <= maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i+1
        print 'Closed', grace.pretty_number(n_holes), 'holes'

        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name, what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else 'N') for i in xrange(len(seq)) ])
        )
        f.close()

        f = open(path.join(output_dir, '%s-%s_masked.fa' % (friendly_name, what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else seq[i].lower()) for i in xrange(len(seq)) ])
        )
        f.close()

        f_good = open(path.join(output_dir, '%s-%s_parts.fa' % (friendly_name, what)), 'wb')
        f_nongood = open(path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name, what)), 'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]
        def emit(i):
            if i-start < minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i-start
            io.write_fasta(
                f_good if good[start] else f_nongood,
                '%s:%d..%d' % (name, start+1, i),
                seq[start:i]
            )
        for i in xrange(1, len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()

        print grace.pretty_number(sum(good)), 'bases are '+what+', of', grace.pretty_number(len(seq)), 'in reference sequence'
        print grace.pretty_number(n_good[0]), 'parts at least', grace.pretty_number(minsize), 'bases long with', grace.pretty_number(n_good_bases[0]), 'total bases'
        print
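# Illustrative sketch only (not used by the pipeline): the hole-closing pass
# above, isolated as a standalone helper. Any run of up to maxdiff consecutive
# "bad" positions flanked by "good" positions is filled in.
def _close_holes_sketch(good, maxdiff):
    good = list(good)
    start = -maxdiff-1
    n_holes = 0
    for i in xrange(len(good)):
        if good[i]:
            if 0 < i-start <= maxdiff:
                for j in xrange(start, i):
                    good[j] = True
                n_holes += 1
            start = i+1
    return good, n_holes

# e.g. _close_holes_sketch([True, False, False, True], 2)
#      -> ([True, True, True, True], 1)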
def run(self):
    """
    <sequence> <poly-A> <adaptor> <anything>
    """
    clip_quality = chr(33+self.clip_quality)
    #ignore_quality = chr(33+self.ignore_quality)

    with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
         io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
        print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'

        n = 0
        n_discarded = 0
        n_clipped = 0
        total_before = 0
        total_clipped = 0

        for filename in self.filenames:
            for name, seq, qual in io.read_sequences(filename, qualities='required'):
                # "Good quality" sequence ends at the first low quality base
                #good_quality_end = 0
                #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality:
                #    good_quality_end += 1

                goodness_score = 0
                best_goodness_score = 0
                good_quality_end = 0
                i = 0
                while True:
                    if goodness_score > best_goodness_score:
                        best_goodness_score = goodness_score
                        good_quality_end = i

                    if i >= len(seq):
                        break

                    if qual[i] >= clip_quality:
                        goodness_score += 1
                    else:
                        goodness_score -= 9
                    i += 1

                best_score = self.min_score-1
                best_a_start = good_quality_end
                best_a_end = good_quality_end
                best_adaptor_bases = 0
                best_aonly_score = 0
                best_aonly_start = good_quality_end
                best_aonly_end = good_quality_end

                # Consider each possible start position for the poly(A)
                for a_start in xrange(len(seq)):
                    if a_start and seq[a_start-1] == 'A':
                        continue

                    # Consider each possible end position for the poly(A)
                    a_end = a_start
                    aonly_score = 0
                    while True:
                        if aonly_score > best_aonly_score:
                            best_aonly_score = aonly_score
                            best_aonly_start = a_start
                            best_aonly_end = a_end

                        # The poly(A) should be followed by adaptor,
                        ## at least until the end of good quality sequence.
                        # However if there is evidence of the adaptor beyond
                        # the end of good quality, we still want to know that,
                        # and count it towards the number of adaptor bases present.
                        score = aonly_score
                        adaptor_bases = 0
                        i = a_end
                        abort_score = best_score-len(self.adaptor)
                        abort_i = min(len(seq), a_end+len(self.adaptor))
                        while score >= abort_score:
                            #if (score > best_score and
                            #    (i >= good_quality_end or i >= a_end+len(self.adaptor))):
                            if score > best_score:
                                best_score = score
                                best_a_start = a_start
                                best_a_end = a_end
                                best_adaptor_bases = adaptor_bases

                            if i >= abort_i:
                                break

                            if seq[i] == self.adaptor[i-a_end]:
                                score += 1
                                adaptor_bases += 1
                            else:
                                score -= 4
                            i += 1

                        #if a_end >= len(seq): break

                        # Modified 2018-03-21
                        # poly(A) tail only within good quality region.
                        #if a_end >= good_quality_end: break
                        #if qual[a_end] >= ignore_quality:
                        #    if seq[a_end] == 'A':
                        #        aonly_score += 1
                        #    else:
                        #        aonly_score -= 4
                        #        if aonly_score <= 0: break

                        if a_end >= len(seq): break

                        if seq[a_end] == 'A':
                            aonly_score += 1
                        else: #if qual[a_end] >= ignore_quality:
                            aonly_score -= 4
                        #else:
                        #    aonly_score -= 1

                        a_end += 1

                # 2018-03-21
                # Look for tail starting after good quality,
                # however don't call a tail if starts after good quality
                if best_a_start > good_quality_end:
                    best_a_start = good_quality_end
                    best_a_end = good_quality_end
                    best_adaptor_bases = 0
                    best_score = 0

                a_start = best_a_start
                a_end = best_a_end
                adaptor_bases = best_adaptor_bases
                aonly_start = best_aonly_start
                aonly_end = best_aonly_end

                if self.debug: # and a_end == a_start and a_end < len(seq)-10:
                    print name
                    print ''.join(
                        ('C' if item < clip_quality else ' ')
                        for item in qual
                    )
                    print '-' * good_quality_end
                    print seq
                    print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d" % (adaptor_bases, best_score)
                    #print ' '*aonly_start + 'A'*(aonly_end-aonly_start) + "."
                    print
                    sys.stdout.flush()

                n += 1
                total_before += len(seq)

                # 0 - sequence name
                # 1 - sequence length
                # 2 - poly(A) start
                # 3 - poly(A) end
                # (4 - best run of As start, for debugging the need to detect adaptor seq)
                # (5 - best run of As end)
                # 6 - number of adaptor bases matched
                print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq), a_start, a_end, aonly_start, aonly_end, adaptor_bases)

                if a_start >= self.length:
                    if a_start < len(seq):
                        n_clipped += 1
                        total_clipped += a_start

                    print >> out_file, '@'+name
                    print >> out_file, seq[:a_start]
                    print >> out_file, '+'
                    print >> out_file, qual[:a_start]
                else:
                    n_discarded += 1

                if n % 10000 == 0:
                    grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n))
                    # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')

                # Option to do a quick subsample
                if self.only and self.only <= n:
                    break

    grace.status('')

    self.log.datum(self.sample, 'reads', n)
    if n:
        self.log.datum(self.sample, 'mean length before poly-A/adaptor clipping', float(total_before)/n)
    self.log.datum(self.sample, 'reads discarded as too short after poly-A/adaptor clipping', n_discarded)
    self.log.datum(self.sample, 'reads poly-A/adaptor clipped and kept', n_clipped)
    if n_clipped:
        self.log.datum(self.sample, 'mean length clipped', float(total_clipped)/n_clipped)
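# Scoring used above, summarised for reference (derived from the loops, not
# separately configurable):
#   good-quality scan:  +1 per base at or above clip_quality, -9 per base below;
#                       good_quality_end is the position maximising the running score.
#   poly(A) run:        +1 per 'A', -4 per non-'A' base.
#   adaptor match:      +1 per matching adaptor base, -4 per mismatch; the scan is
#                       abandoned once the score falls more than len(adaptor)
#                       below the best score seen so far.
# A read is kept only if the clip point (a_start) is at least self.length bases
# into the read; otherwise it is counted as discarded.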