def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks.

    in_peaks is the path of the peaks file; it is parsed with readBedLines
    and the width (stop - start) of every record is handed to
    sampling.sample_genome.  out_files is a sequence whose first two items
    are output paths: the sampled sequences and the sampled locations.

    Raises RuntimeError if in_peaks yields no records.
    """
    out_sample, out_locations = out_files[:2]
    # collect the widths inside a context manager so the peaks file handle
    # is closed as soon as it has been read (it used to be left open)
    with open(in_peaks) as peaks_file:
        peak_lengths = array(
            'i', (stop - start
                  for chrom, start, stop, strand in readBedLines(peaks_file)))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    s = sampling.sample_genome(
        wb_genome, peak_lengths,
        sampleSize=cfg.getint('motifs', 'motif_significance_sample_size'),
        excludeRepeat=cfg.getboolean('motifs', 'sampling_exclude_repeats'),
        excludeN=cfg.getboolean('motifs', 'sampling_exclude_N'),
        ignoreCharacters='_', weighted=True)
    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                # sequence record: '>' + running index, then the sample
                outfile.write('>%s\n%s\n' % (index, line))
                # location record: id, start, stop, index, a constant '0'
                # and the strand sign derived from the orientation
                outlocations.write('\t'.join([
                    line.id, str(line.start), str(line.stop), str(index),
                    '0', '+' if line.orientation == 1 else '-'
                ]) + '\n')
def sample_genome_like_peaks(in_peaks, out_files):
    """Sample from the genome, keeping the sample widths the same as peaks.

    in_peaks is the path of the peaks file; it is parsed with readBedLines
    and the width (stop - start) of every record is handed to
    sampling.sample_genome.  out_files is a sequence whose first two items
    are output paths: the sampled sequences and the sampled locations.

    Raises RuntimeError if in_peaks yields no records.
    """
    out_sample, out_locations = out_files[:2]
    # collect the widths inside a context manager so the peaks file handle
    # is closed as soon as it has been read (it used to be left open)
    with open(in_peaks) as peaks_file:
        peak_lengths = array(
            'i', (stop - start
                  for chrom, start, stop, strand in readBedLines(peaks_file)))
    if len(peak_lengths) == 0:
        raise RuntimeError("Peaks file %s is empty!" % in_peaks)
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))
    s = sampling.sample_genome(
        wb_genome, peak_lengths,
        sampleSize=cfg.getint('motifs', 'motif_significance_sample_size'),
        excludeRepeat=cfg.getboolean('motifs', 'sampling_exclude_repeats'),
        excludeN=cfg.getboolean('motifs', 'sampling_exclude_N'),
        ignoreCharacters='_', weighted=True)
    with open(out_sample, 'w') as outfile:
        with open(out_locations, 'w') as outlocations:
            for index, line in enumerate(s):
                # sequence record: '>' + running index, then the sample
                outfile.write('>%s\n%s\n' % (index, line))
                # location record: id, start, stop, index, a constant '0'
                # and the strand sign derived from the orientation
                outlocations.write('\t'.join([
                    line.id, str(line.start), str(line.stop), str(index),
                    '0', '+' if line.orientation == 1 else '-'
                ]) + '\n')
def trim_regex(in_fastq, out_fastq, trim_pattern, min_length=10):
    """Search the reads for a regex, and trim everything matching the pattern
    and all succeeding sequence.

    in_fastq and out_fastq are paths opened with gzip.open; records are read
    with parseFastq as (header, seq, qual).  When trim_pattern matches, the
    read is cut at the start of the right-most match and the removed
    sequence is prepended to the read ID, separated by '_'.

    Reads without a match are dropped when the 'filtering'/'require_regex'
    option is true.  Reads whose remaining sequence is shorter than
    min_length (default 10, the previously hard-coded value) are dropped.
    """
    pattern = re.compile(trim_pattern)
    # loop-invariant: look the option up once instead of once per read
    require_regex = cfg.getboolean('filtering', 'require_regex')
    with gzip.open(in_fastq) as infile:
        with gzip.open(out_fastq, 'w') as outfile:
            for header, seq, qual in parseFastq(infile):
                # only the right-most hit is used, so just keep the last
                # match the iterator produces
                last_hit = None
                for last_hit in pattern.finditer(seq):
                    pass
                if last_hit is not None:
                    # match to re found--
                    # trim the right-most hit and add the trimmed sequence
                    # to the read ID
                    cut = last_hit.start()
                    header = seq[cut:] + '_' + header
                    seq = seq[:cut]
                    qual = qual[:cut]
                elif require_regex:
                    # unmatched reads are not wanted
                    continue
                if len(seq) >= min_length:
                    outfile.write('@%s\n%s\n+%s\n%s\n' % (header, seq,
                                                           header, qual))
def trim_regex(in_fastq, out_fastq, trim_pattern, min_length=10):
    """Search the reads for a regex, and trim everything matching the pattern
    and all succeeding sequence.

    in_fastq and out_fastq are paths opened with gzip.open; records are read
    with parseFastq as (header, seq, qual).  When trim_pattern matches, the
    read is cut at the start of the right-most match and the removed
    sequence is prepended to the read ID, separated by '_'.

    Reads without a match are dropped when the 'filtering'/'require_regex'
    option is true.  Reads whose remaining sequence is shorter than
    min_length (default 10, the previously hard-coded value) are dropped.
    """
    pattern = re.compile(trim_pattern)
    # loop-invariant: look the option up once instead of once per read
    require_regex = cfg.getboolean('filtering', 'require_regex')
    with gzip.open(in_fastq) as infile:
        with gzip.open(out_fastq, 'w') as outfile:
            for header, seq, qual in parseFastq(infile):
                # only the right-most hit is used, so just keep the last
                # match the iterator produces
                last_hit = None
                for last_hit in pattern.finditer(seq):
                    pass
                if last_hit is not None:
                    # match to re found--
                    # trim the right-most hit and add the trimmed sequence
                    # to the read ID
                    cut = last_hit.start()
                    header = seq[cut:] + '_' + header
                    seq = seq[:cut]
                    qual = qual[:cut]
                elif require_regex:
                    # unmatched reads are not wanted
                    continue
                if len(seq) >= min_length:
                    outfile.write('@%s\n%s\n+%s\n%s\n' % (header, seq,
                                                           header, qual))
from Bio import SeqIO
from ruffus import (transform, follows, collate, files, split, merge,
                    suffix, mkdir, jobs_limit, output_from)
from ruffus.task import active_if
from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                               main_mutex as log_mtx)
from hts_waterworks.bootstrap import cfg
from hts_waterworks.utils.common import parseFastq

# filtering
# prev_output / prev_suffix name the input (and the suffix to replace) for
# the next task decorator; they start out as the raw fastq glob.
original_reads = '*.fastq'
prev_output = original_reads
prev_suffix = '.fastq'


@active_if(cfg.getboolean('filtering', 'convert_sanger_to_illumina'))
@transform(prev_output, suffix(prev_suffix), '.fastq_illumina')
def convert_fastq(in_fastq, out_fastq):
    """convert sanger fastq format (phred-33) to illumina format (phred-64)

    Reads in_fastq with SeqIO as "fastq", writes the records as
    "fastq-illumina" and gzips the written file.
    """
    # NOTE(review): splitext drops the final extension of out_fastq, and gzip
    # then adds '.gz', so the path named by out_fastq does not appear to be
    # written here -- confirm this is what the pipeline expects.
    base_out = os.path.splitext(out_fastq)[0]
    records = SeqIO.parse(in_fastq, "fastq")
    with open(base_out, 'w') as outfile:
        SeqIO.write(records, outfile, "fastq-illumina")
    check_call('gzip %s' % base_out, shell=True)


# When the conversion is enabled, rebind the chaining variables to this task.
# This runs after the decorator above has already used the initial values.
if cfg.getboolean('filtering', 'convert_sanger_to_illumina'):
    prev_output = convert_fastq
    prev_suffix = ''


@active_if(cfg.getboolean('filtering', 'clip_adapter'))
with open(in_fasta) as infile:
    seqs = list(parseFastaLines(infile))
# Read both options with getint (as the sample size below already does):
# cfg.get() hands back the option text -- assuming a ConfigParser-style cfg --
# which must not be compared with len(seqs) or passed to xrange().
if len(seqs) <= cfg.getint('motifs', 'motif_chunk_size'):
    num_chunks = 1
else:
    num_chunks = cfg.getint('motifs', 'motif_num_chunks')
# get a random sample of peaks
for i in xrange(num_chunks):
    with open(name + '.small_sample.%s.fasta' % i, 'w') as outfile:
        subset = random.sample(seqs, min(len(seqs),
                               cfg.getint('motifs', 'motif_chunk_size')))
        outfile.writelines('>%s\n%s\n' % (s[0].strip(), s[1].strip())
                           for s in subset)


# motif discovery
@active_if(cfg.getboolean('motifs', 'run_meme'))
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@transform(motif_select_random_seqs,
           #suffix('.fasta'), '.meme.discovered.motifs')
           #regex(r'(.*(?=_around).*(?=top).*).fasta$'),
           regex(r'(.*(?=top).*).fasta$'),
           r'\1.meme.discovered.motifs')
def discover_meme_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running meme.

    Runs meme on in_fasta with the configured 'motifs'/'meme_params', writing
    into the directory '<out_motifs>_meme_out', then parses the resulting
    meme.txt and pickles the parsed motifs to out_motifs.
    """
    cmd = 'meme %s %s -oc %s_meme_out ' % (
        in_fasta, cfg.get('motifs', 'meme_params'), out_motifs)
    #if 'top' in in_fasta and 'around' in in_fasta:
    sys_call(cmd)
    motifs = sequence_motif.parseMemeMotifs('%s_meme_out/meme.txt'
                                            % out_motifs)
    # close the pickle file deterministically instead of leaking the handle
    with open(out_motifs, 'w') as outfile:
        pickle.dump(motifs, outfile)
+ mapping.all_mappers_output + mapping.all_mappers_raw_reads, suffix(""), ".clipped.sorted", ) def clip_and_sort_peaks(in_bed, out_sorted): """Sort the bed file and constrain bed regions to chromosome sizes""" with tempfile.NamedTemporaryFile() as tmp_clipped: cmd = "bedClip %s %s.chrom.sizes %s" % (in_bed, genome_path(), tmp_clipped.name) sys_call(cmd) # cmd = 'bedSort %s %s' % (out_clipped, out_sorted) cmd = r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s" % (tmp_clipped.name, out_sorted) sys_call(cmd) @active_if(cfg.getboolean("visualization", "uniquefy_track")) @transform( [clip_and_sort_peaks] + mapping.all_mappers_output, suffix(""), ".unique", cfg.getint("visualization", "uniquefy_track_max_reads"), ) def bed_uniquefy(in_bed, out_bed, max_reads): "Given a sorted bed file, remove tags that are on the same start, strand" with open(in_bed) as infile: with open(out_bed, "w") as outfile: prev_start, prev_chrom = None, None plus_seen, minus_seen = 0, 0 for line in infile: fields = line.split("\t") chrom, start, stop = fields[:3]
def uniquefy_downsample_reads(in_files, out_files):
    """Uniquefy sequence reads then downsample so the total unique tag count
    in treatment and control is the same.  This may generate many downsampled
    datasets.

    in_files is a (treatment, control) pair of paths, each ending in
    'mapped_reads'.  For every i below the 'peaks'/'num_down_samples' option
    a pair of files named '...matched_size_<i>.mapped_reads' is written,
    derived from the two input paths; out_files is not consulted.

    Nothing is written when the 'peaks'/'downsample_reads' option is false or
    when both unique tag counts are already identical.

    Raises RuntimeError if an input path does not end in 'mapped_reads'.
    """
    # WARNING: this is a circular dependency.  It has to be included at
    # runtime.  Top-level import will cause this module to load only 1/2 way.
    # We import here because we need to call this function directly,
    # and not just when using ruffus.
    from hts_waterworks.visualize import bed_uniquefy

    if not cfg.getboolean('peaks', 'downsample_reads'):
        with log_mtx:
            log.debug('NOT downsampling the sequence reads!')
        return

    in_treat, in_control = in_files
    out_treat_template = re.sub(r'mapped_reads$',
                                'matched_size_%s.mapped_reads', in_treat)
    out_control_template = re.sub(r'mapped_reads$',
                                  'matched_size_%s.mapped_reads', in_control)
    if out_treat_template == in_treat:
        raise RuntimeError('regex substitution failed from %s to %s' % (
            in_treat, out_treat_template))
    if out_control_template == in_control:
        raise RuntimeError('regex substitution failed from %s to %s' % (
            in_control, out_control_template))

    tmp_paths = []
    try:
        for _ in range(4):
            tmp_paths.append(_new_temp_path())
        tmp_t_sorted, tmp_c_sorted, tmp_t_unique, tmp_c_unique = tmp_paths

        # sort the reads
        bed_clip_and_sort(in_treat, tmp_t_sorted)
        bed_clip_and_sort(in_control, tmp_c_sorted)

        # uniquefy the reads
        max_reads = cfg.getint('visualization', 'uniquefy_track_max_reads')
        bed_uniquefy(tmp_t_sorted, tmp_t_unique, max_reads)
        bed_uniquefy(tmp_c_sorted, tmp_c_unique, max_reads)

        total_treat = _count_lines(tmp_t_unique)
        total_control = _count_lines(tmp_c_unique)
        if total_treat == total_control:
            with log_mtx:
                log.debug('No downsampling required-- tag counts identical')
            return

        # downsample num_down_sample times
        for i in xrange(cfg.getint('peaks', 'num_down_samples')):
            out_treat = out_treat_template % i
            out_control = out_control_template % i
            if total_treat > total_control:
                # reduce number of treatment reads
                inds_to_keep = set(random.sample(xrange(total_treat),
                                                 total_control))
                in_orig, out_orig = tmp_c_unique, out_control
                in_subset, out_subset = tmp_t_unique, out_treat
            else:
                # reduce number of control reads
                inds_to_keep = set(random.sample(xrange(total_control),
                                                 total_treat))
                in_orig, out_orig = tmp_t_unique, out_treat
                in_subset, out_subset = tmp_c_unique, out_control
            # the smaller dataset is copied over unchanged
            sys_call('cp %s %s' % (in_orig, out_orig))
            # subset the tags
            with open(in_subset) as infile:
                with open(out_subset, 'w') as outfile:
                    outfile.writelines(
                        line for index, line in enumerate(infile)
                        if index in inds_to_keep)
    finally:
        # remove the scratch files on every exit path, including errors
        for path in tmp_paths:
            if os.path.exists(path):
                os.unlink(path)


def _new_temp_path():
    """Create an empty, closed temporary file and return its path."""
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.close()
    return tmp.name


def _count_lines(path):
    """Return the number of lines in the file at path."""
    with open(path) as infile:
        return sum(1 for _ in infile)
import hts_waterworks.call_peaks as call_peaks
import hts_waterworks.mapping as mapping


@transform('%s.*.gtfgenes' % cfg.get('DEFAULT', 'genome'),
           suffix('.gtfgenes'), '_genes')
def convert_gtf_genes_to_bed(in_gtf, out_gene_pred):
    """convert gtf genes to UCSC's genePred format

    Runs the external gtfToGenePred command on in_gtf, writing out_gene_pred.
    """
    sys_call('gtfToGenePred %s %s' % (in_gtf, out_gene_pred), file_log=False)


@transform('%s.*.gff3genes' % cfg.get('DEFAULT', 'genome'),
           suffix('.gff3genes'), '_genes')
def convert_gff3_genes_to_bed(in_gff3, out_gene_pred):
    """convert gff3 genes to UCSC's genePred format

    Runs the external gff3ToGenePred command on in_gff3, writing
    out_gene_pred.
    """
    sys_call('gff3ToGenePred %s %s' % (in_gff3, out_gene_pred), file_log=False)


@active_if(cfg.getboolean('genes','download_refseq'))
@files(None, '%s.refseq_genes' % cfg.get('DEFAULT', 'genome'))
def get_refseq_genes(_, out_genes):
    """Download the refGene table for the configured genome from UCSC.

    The archive is fetched into the current directory with wget, unpacked
    with gunzip and the resulting refGene.txt is moved to out_genes.  No
    reformatting happens in this function.
    """
    url = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/refGene.txt.gz'
    url = url % cfg.get('DEFAULT', 'genome')
    # -N: only re-download when the remote copy is newer; -P .: save here
    sys_call('wget -N -P . %s' % url)
    sys_call('gunzip -f refGene.txt.gz')
    sys_call('mv refGene.txt %s' % out_genes)


@transform([get_refseq_genes, convert_gtf_genes_to_bed,
            convert_gff3_genes_to_bed],
           suffix('_genes'), '_genes.all')
def refseq_genes_to_bed(in_genes, out_bed):
    """convert refseq genes file to BED format"""
@active_if(cfg.getint('PAS-Seq', 'min_read_count') > 0)
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@transform(mapping.all_mappers_output, suffix('.mapped_reads'),
           '.overlap.mapped_reads', cfg.getint('PAS-Seq', 'min_read_count'))
def remove_nonoverlapping_reads(in_bed, out_bed, min_read_count):
    """
    Remove mapped reads that don't overlap with at least *min_read_count* reads

    in_bed is intersected with itself (intersectBed -c appends an overlap
    count), rows whose last field is below min_read_count + 1 are discarded
    and the first six columns of the survivors are written to out_bed.
    """
    # NOTE(review): the "+ 1" presumably compensates for every read also
    # overlapping itself in the self-intersection -- confirm.
    cmd = "intersectBed -wa -c -a %s -b %s | awk '$(NF) >= %s' |" \
          r"cut -f 1,2,3,4,5,6 > %s" % (in_bed, in_bed,
                                         min_read_count + 1, out_bed)
    sys_call(cmd, file_log=False)


@active_if(cfg.getboolean('PAS-Seq', 'merge_adjacent_reads'))
#@split(mapping.all_mappers_output, regex('(.*).mapped_reads$'),
@split(remove_nonoverlapping_reads, regex('(.*).mapped_reads$'),
       [r'\1.merged.mapped_reads', r'\1.merged.pileup_reads'],
       cfg.getint('PAS-Seq', 'merge_window_width'),
       cfg.getint('PAS-Seq', 'merge_num_iterations'),
       r'\1.merged.mapped_reads', r'\1.merged.pileup_reads',
       cfg.getint('PAS-Seq', 'min_read_count'))
def merge_adjacent_reads(in_bed, out_pattern, window_width, iterations,
                         out_merged, out_pileup, min_read_count):
    """Reassign read ends to a weighted average of adjacent reads"""
    # helper functions for parsing bed files
    # NOTE(review): a line starting with '"' never starts with '#', so the
    # second startswith() test cannot change the result -- check whether
    # lines starting with '"' were meant to be skipped as well.
    filter_lines = lambda l: l.strip() and (not l.startswith('#') or \
                                            l.startswith('"'))
    # lazily yields only the lines accepted by filter_lines
    read_bed_lines = lambda infile: itertools.ifilter(filter_lines, infile)
import shutil

from ruffus import (transform, follows, collate, files, split, merge,
                    add_inputs, regex, suffix, mkdir, jobs_limit, output_from)
from ruffus.task import active_if

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                               main_mutex as log_mtx)
from hts_waterworks.bootstrap import cfg, get_chrom_sizes, genome_path
import hts_waterworks.mapping as mapping
import hts_waterworks.clip_seq as clip_seq
from hts_waterworks.utils.common import (bedCommentFilter, readBedLines,
                                         parse_ucsc_range)


@active_if(cfg.getboolean('peaks', 'run_macs'))
@collate(mapping.all_mappers_output,
         regex(r'(.+)\.treat(.*)\.mapped_reads'),
         add_inputs(r'\1.control\2.mapped_reads'),
         r'\1.treat\2.macs.peaks',
         cfg.getfloat('peaks', 'max_FDR'))
def run_macs(in_files, out_peaks, max_fdr):
    """Call peak with MACS (v1.3).

    Apply a maximum FDR threshold and treat centers as peak summits

    in_files holds the (treatment, control) pair as its first item; macs is
    run with the extra arguments from the 'peaks'/'macs_params' option.
    """
    in_treat, in_control = in_files[0]
    # rebuild the run name from the treatment file name
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs.peaks'
    # NOTE(review): this overwrites the max_fdr argument with the configured
    # value; the decorator passes the same option, so the parameter is
    # effectively ignored.
    max_fdr = cfg.getfloat('peaks', 'max_FDR')
    cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name,
                                             cfg.get('peaks', 'macs_params'))
    sys_call(cmd)
from ruffus import (transform, follows, files, split, merge, add_inputs,
                    regex, suffix, jobs_limit, mkdir)
from ruffus.task import active_if
from pygr import worldbase, cnestedlist, seqdb
import pybedtools

from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                               main_mutex as log_mtx)
from hts_waterworks.bootstrap import (genome_path, get_genome, cfg,
                                      get_chrom_sizes)
import hts_waterworks.preprocessing as preprocessing

#: the references to map against for this run (genome, transcriptome, etc)
reference_genomes = [genome_path()]
# the transcriptome fasta glob is only added when the option is switched on
if cfg.getboolean('mapping', 'map_to_transcriptome'):
    reference_genomes.append('*_genes.transcriptome.fasta')


@follows(mkdir('mapped'))
def make_mapping_dir():
    """Placeholder task; the 'mapped' directory comes from the decorator."""
    pass


@active_if(cfg.getboolean('mapping', 'map_to_transcriptome'))
@split('*_genes', regex(r'(.*)_genes$'),
       [r'\1_genes.transcriptome.fasta',
        r'\1_genes.transcriptome.seqdb',
        r'\1_genes.transcriptome.msa'])
def make_transcriptome(in_genes, out_files):
    """Splice UTR's and exons from gene annotations into a transcriptome.
    Creates a fasta-file of resulting genes and a gene to genome alignment.
# Read both options with getint (as the sample size below already does):
# cfg.get() hands back the option text -- assuming a ConfigParser-style cfg --
# which must not be compared with len(seqs) or passed to xrange().
if len(seqs) <= cfg.getint('motifs', 'motif_chunk_size'):
    num_chunks = 1
else:
    num_chunks = cfg.getint('motifs', 'motif_num_chunks')
# get a random sample of peaks
for i in xrange(num_chunks):
    with open(name + '.small_sample.%s.fasta' % i, 'w') as outfile:
        subset = random.sample(
            seqs, min(len(seqs), cfg.getint('motifs', 'motif_chunk_size')))
        outfile.writelines('>%s\n%s\n' % (s[0].strip(), s[1].strip())
                           for s in subset)


# motif discovery
@active_if(cfg.getboolean('motifs', 'run_meme'))
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@transform(
    motif_select_random_seqs,
    #suffix('.fasta'), '.meme.discovered.motifs')
    #regex(r'(.*(?=_around).*(?=top).*).fasta$'),
    regex(r'(.*(?=top).*).fasta$'),
    r'\1.meme.discovered.motifs')
def discover_meme_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running meme"""
    # meme writes into the directory '<out_motifs>_meme_out'
    cmd = 'meme %s %s -oc %s_meme_out ' % (
        in_fasta, cfg.get('motifs', 'meme_params'), out_motifs)
    #if 'top' in in_fasta and 'around' in in_fasta:
    sys_call(cmd)
    motifs = sequence_motif.parseMemeMotifs('%s_meme_out/meme.txt'
                                            % out_motifs)
@transform('%s.*.gtfgenes' % cfg.get('DEFAULT', 'genome'),
           suffix('.gtfgenes'), '_genes')
def convert_gtf_genes_to_bed(in_gtf, out_gene_pred):
    """convert gtf genes to UCSC's genePred format

    Runs the external gtfToGenePred command on in_gtf, writing out_gene_pred.
    """
    sys_call('gtfToGenePred %s %s' % (in_gtf, out_gene_pred), file_log=False)


@transform('%s.*.gff3genes' % cfg.get('DEFAULT', 'genome'),
           suffix('.gff3genes'), '_genes')
def convert_gff3_genes_to_bed(in_gff3, out_gene_pred):
    """convert gff3 genes to UCSC's genePred format

    Runs the external gff3ToGenePred command on in_gff3, writing
    out_gene_pred.
    """
    sys_call('gff3ToGenePred %s %s' % (in_gff3, out_gene_pred), file_log=False)


@active_if(cfg.getboolean('genes', 'download_refseq'))
@files(None, '%s.refseq_genes' % cfg.get('DEFAULT', 'genome'))
def get_refseq_genes(_, out_genes):
    """Download the refGene table for the configured genome from UCSC.

    The archive is fetched into the current directory with wget, unpacked
    with gunzip and the resulting refGene.txt is moved to out_genes.  No
    reformatting happens in this function.
    """
    url = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/refGene.txt.gz'
    url = url % cfg.get('DEFAULT', 'genome')
    # -N: only re-download when the remote copy is newer; -P .: save here
    sys_call('wget -N -P . %s' % url)
    sys_call('gunzip -f refGene.txt.gz')
    sys_call('mv refGene.txt %s' % out_genes)


@transform(
    [get_refseq_genes, convert_gtf_genes_to_bed, convert_gff3_genes_to_bed],
    suffix('_genes'), '_genes.all')
def refseq_genes_to_bed(in_genes, out_bed):
    """convert refseq genes file to BED format"""
# getint, like the other @jobs_limit decorators in this project, so the job
# limit is passed as a number rather than as the raw option text
@jobs_limit(cfg.getint('DEFAULT', 'max_throttled_jobs'), 'throttled')
@follows(bootstrap.get_chrom_sizes)
@transform(call_peaks.all_peak_caller_functions +
           [pas_seq.remove_terminal_exon] +
           [clip_seq.search_genome_consensus] +
           mapping.all_mappers_output +
           mapping.all_mappers_raw_reads,
           suffix(''), '.clipped.sorted')
def clip_and_sort_peaks(in_bed, out_sorted):
    """Sort the bed file and constrain bed regions to chromosome sizes.

    in_bed is first passed through bedClip against
    '<genome_path()>.chrom.sizes' into a temporary file, which is then
    sorted by column 1 and numerically by column 2 into out_sorted.
    """
    with tempfile.NamedTemporaryFile() as tmp_clipped:
        cmd = 'bedClip %s %s.chrom.sizes %s' % (in_bed, genome_path(),
                                                tmp_clipped.name)
        sys_call(cmd)
        #cmd = 'bedSort %s %s' % (out_clipped, out_sorted)
        # the sort has to run inside the with-block: the temporary file is
        # removed as soon as it is closed
        cmd = r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s" % (
            tmp_clipped.name, out_sorted)
        sys_call(cmd)


@active_if(cfg.getboolean('visualization', 'uniquefy_track'))
@transform([clip_and_sort_peaks] + mapping.all_mappers_output,
           suffix(''), '.unique',
           cfg.getint('visualization', 'uniquefy_track_max_reads'))
def bed_uniquefy(in_bed, out_bed, max_reads):
    'Given a sorted bed file, remove tags that are on the same start, strand'
    with open(in_bed) as infile:
        with open(out_bed, 'w') as outfile:
            prev_start, prev_chrom = None, None
            plus_seen, minus_seen = 0, 0
            for line in infile:
                fields = line.split('\t')
                chrom, start, stop = fields[:3]
                if prev_start is None or prev_start != start or \
                        prev_chrom != chrom:
                    prev_start, prev_chrom = start, chrom
from ruffus import (transform, follows, collate, files, split, merge,
                    suffix, mkdir, jobs_limit, output_from)
from ruffus.task import active_if
from hts_waterworks.utils.ruffus_utils import (sys_call, main_logger as log,
                                               main_mutex as log_mtx)
from hts_waterworks.bootstrap import cfg
from hts_waterworks.utils.common import parseFastq

# filtering
# prev_output / prev_suffix name the input (and the suffix to replace) for
# the next task decorator; they start out as the raw fastq glob.
original_reads = '*.fastq'
prev_output = original_reads
prev_suffix = '.fastq'


@active_if(cfg.getboolean('filtering', 'convert_sanger_to_illumina'))
@transform(prev_output, suffix(prev_suffix), '.fastq_illumina')
def convert_fastq(in_fastq, out_fastq):
    """convert sanger fastq format (phred-33) to illumina format (phred-64)

    Reads in_fastq with SeqIO as "fastq", writes the records as
    "fastq-illumina" and gzips the written file.
    """
    # NOTE(review): splitext drops the final extension of out_fastq, and gzip
    # then adds '.gz', so the path named by out_fastq does not appear to be
    # written here -- confirm this is what the pipeline expects.
    base_out = os.path.splitext(out_fastq)[0]
    records = SeqIO.parse(in_fastq, "fastq")
    with open(base_out, 'w') as outfile:
        SeqIO.write(records, outfile, "fastq-illumina")
    check_call('gzip %s' % base_out, shell=True)


# When the conversion is enabled, rebind the chaining variables to this task.
# This runs after the decorator above has already used the initial values.
if cfg.getboolean('filtering', 'convert_sanger_to_illumina'):
    prev_output = convert_fastq
    prev_suffix = ''