Example #1
def bed_to_bedgraph_by_strand(in_files, out_bedgraphs):
    "extend reads to the full fragment length and create a bedgraph from them"
    in_bed, in_chrom_sizes = in_files
    cmd = (
        """slopBed -i %s -s -r %s -l 0 -g %s | awk '{if ($6 == "+") print $0}' | """
        + "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s"
    ) % (
        in_bed,
        cfg.getint("DEFAULT", "fragment_size") - cfg.getint("DEFAULT", "tag_size"),
        in_chrom_sizes,
        cfg.get("DEFAULT", "genome"),
        genome_path(),
        out_bedgraphs[0],
    )
    sys_call(cmd)

    cmd = (
        """slopBed -i %s -s -r %s -l 0 -g %s | awk '{if ($6 == "-") print $0}' | """
        + "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s"
    ) % (
        in_bed,
        cfg.getint("DEFAULT", "fragment_size") - cfg.getint("DEFAULT", "tag_size"),
        in_chrom_sizes,
        cfg.get("DEFAULT", "genome"),
        genome_path(),
        out_bedgraphs[1],
    )
    sys_call(cmd)
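The two pipelines above differ only in the awk strand test and the output slot. A sketch of the same function with that duplication folded into a loop (it relies on the same cfg, genome_path, and sys_call helpers the snippet itself assumes):

def bed_to_bedgraph_by_strand_looped(in_files, out_bedgraphs):
    """Behaves like bed_to_bedgraph_by_strand above, with the per-strand
    pipeline factored into one loop over ('+', '-')."""
    in_bed, in_chrom_sizes = in_files
    extend_by = (cfg.getint("DEFAULT", "fragment_size")
                 - cfg.getint("DEFAULT", "tag_size"))
    for strand, out_bedgraph in zip("+-", out_bedgraphs):
        cmd = (
            """slopBed -i %s -s -r %s -l 0 -g %s | awk '{if ($6 == "%s") print $0}' | """
            + "bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s"
        ) % (in_bed, extend_by, in_chrom_sizes, strand,
             cfg.get("DEFAULT", "genome"), genome_path(), out_bedgraph)
        sys_call(cmd)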
Example #2
def quest_to_wig(in_reads, out_glob, out_template, in_dir, chrom_sizes):
    in_template = in_dir + '/tracks/wig_profiles/%s.profile.wig.gz'
    for f in ['background_unnormalized', 'background_normalized', 'ChIP_normalized', 'ChIP_unnormalized']:
        in_file = in_template % f
        out_file = out_template % f
        #shutil.copy(in_file, out_file)
        sys_call('gunzip -c -d %s > %s' % (in_file, out_file))
Example #3
def merge_comparison_types(in_files, out_merged):
    """concatenate the comparison types together for plotting"""
    cmd = 'cat %s > %s ' % (in_files[0], out_merged)
    sys_call(cmd, file_log=False)
    # skip the header for remaining files
    for f in in_files[1:]:
        cmd = 'sed 1d %s >> %s' % (f, out_merged)
        sys_call(cmd, file_log=False)
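The cat/sed pair writes the first file whole and strips the header line from every later file. A minimal pure-Python equivalent, useful where the shell tools are unavailable (a sketch, not part of the original module):

def merge_keep_first_header(in_files, out_merged):
    """Concatenate files, keeping only the first file's header line."""
    with open(out_merged, 'w') as outfile:
        for index, path in enumerate(in_files):
            with open(path) as infile:
                if index > 0:
                    next(infile, None)  # drop this file's header
                outfile.writelines(infile)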
Example #4
def remove_nonoverlapping_reads(in_bed, out_bed, min_read_count):
    """
    Remove mapped reads that don't overlap with at least *min_read_count* reads
    """
    cmd = "intersectBed -wa -c -a %s -b %s | awk '$(NF) >= %s' |" \
          r"cut -f 1,2,3,4,5,6 > %s" % (in_bed, in_bed, min_read_count + 1,
                                        out_bed)
    sys_call(cmd, file_log=False)
Example #5
def run_quest(in_reads, out_peaks, chrom_sizes):
    """Run QuEST on the given treatment and control data"""
    in_treat = filter(lambda f: '.treat.' in f, in_reads)[0]
    in_control = filter(lambda f: '.control.' in f, in_reads)[0]
    sys_call('echo "y\n1\n2\ny\n" | generate_QuEST_parameters.pl -QuEST_align_ChIP %s '
             '-QuEST_align_RX_noIP %s -gt %s -ap %s_output -silent' %
             (in_treat, in_control, chrom_sizes, in_treat))
    shutil.copy('%s_output/calls/peak_caller.ChIP.out.accepted' % in_treat, out_peaks)
Example #6
def clip_and_sort_peaks(in_bed, out_sorted):
    """Sort the bed file and constrain bed regions to chromosome sizes"""
    with tempfile.NamedTemporaryFile() as tmp_clipped:
        cmd = "bedClip %s %s.chrom.sizes %s" % (in_bed, genome_path(), tmp_clipped.name)
        sys_call(cmd)
        # cmd = 'bedSort %s %s' % (out_clipped, out_sorted)
        cmd = r"sort -t $'\t' -k 1,1 -k 2,2n -S 2G %s > %s" % (tmp_clipped.name, out_sorted)
        sys_call(cmd)
Example #7
def discover_nmica_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running nestedMICA"""
    cmd = 'nminfer -seqs %s %s ' % (in_fasta, cfg.get('motifs', 'nmica_params'))
    sys_call(cmd)
    motifs_name = in_fasta.replace('.fasta', '.motifs.xms')
    sys_call('mv motifs.xms %s' % motifs_name)
    motifs = sequence_motif.parse_xms_motifs(motifs_name)
    with open(out_motifs, 'wb') as outfile:
        pickle.dump(motifs, outfile)
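The pickled motifs can be restored downstream with the matching load call (a trivial sketch, independent of the original module):

import pickle

def load_motifs(motif_file):
    # inverse of the pickle.dump above
    with open(motif_file, 'rb') as infile:
        return pickle.load(infile)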
Example #8
def discover_meme_motifs(in_fasta, out_motifs):
    """Discover sequence motifs in peaks by running meme"""
    cmd = 'meme %s %s -oc %s_meme_out ' % (
        in_fasta, cfg.get('motifs', 'meme_params'), out_motifs)
    #if 'top' in in_fasta and 'around' in in_fasta:
    sys_call(cmd)
    motifs = sequence_motif.parseMemeMotifs('%s_meme_out/meme.txt' %
                                            out_motifs)
    with open(out_motifs, 'wb') as outfile:
        pickle.dump(motifs, outfile)
Example #9
def deploy_track_files(in_files, out_header):
    """Copy UCSC tracks to public url"""
    remote = cfg.get("visualization", "remote_ssh_dir")
    remote_host = remote.split(":")[0]
    remote_dir = remote.split(":")[1]
    # create the remote directory once, then copy each track
    sys_call("ssh %s mkdir -p %s" % (remote_host, remote_dir))
    for in_track in in_files:
        sys_call("scp %s %s" % (in_track, remote))
    touch(out_header)
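sys_call hands these strings to a shell, so a track path containing spaces or shell metacharacters would break the scp. A more defensive variant using list-form arguments (a standard-library sketch; it takes the host and directory already split out of the host:dir string above):

import subprocess

def deploy_track_files_safe(in_files, remote_host, remote_dir):
    # make the remote directory once, then copy each track; list-form
    # arguments mean filenames are never re-parsed by the local shell
    subprocess.check_call(['ssh', remote_host, 'mkdir', '-p', remote_dir])
    for in_track in in_files:
        subprocess.check_call(
            ['scp', in_track, '%s:%s' % (remote_host, remote_dir)])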
Example #10
def merge_strands(in_files, out_merged):
    """concatenate the strand-specific analyses for plotting"""
    # output the first file in its entirety
    cmd = 'cat %s > %s ' % (in_files[0], out_merged)
    sys_call(cmd, file_log=False)
    # skip the header for remaining files
    for f in in_files[1:]:
        cmd = 'sed 1d %s >> %s' % (f, out_merged)
        sys_call(cmd, file_log=False)
Example #11
def run_mosaik_align(in_files, out_align):
    'align reads to reference using MosaikAligner'
    # MosaikAligner -in sequence_archives/c_elegans_chr2_test.dat -out sequence_archives/c_elegans_chr2_test_aligned.dat -ia reference/c.elegans_chr2.dat -hs 14 -act 17 -mm 2 -m unique
    in_reads, in_genome_dat, in_genome_jump, _, _ = in_files
    in_genome_jump = in_genome_jump.replace('_keys.jmp', '')
    cmd = 'MosaikAligner -in %s -ia %s -j %s -out %s -hs %s  %s'
    cmd = cmd % (in_reads, in_genome_dat, in_genome_jump, out_align,
                   cfg.getint('mapping', 'mosaik_hash_size'),
                   cfg.get('mapping', 'mosaik_params'))
    sys_call(cmd)
Example #12
def bed_to_bedgraph(in_files, out_bedgraph):
    'extend reads to the full fragment length and create a bedgraph from them'
    in_bed, in_chrom_sizes = in_files
    cmd = ('slopBed -i %s -s -r %s -l 0 -g %s | '
           'bedItemOverlapCount %s -chromSize=%s.chrom.sizes stdin > %s') % (
        in_bed,
        cfg.getint('DEFAULT', 'fragment_size') - cfg.getint('DEFAULT', 'tag_size'),
        in_chrom_sizes, cfg.get('DEFAULT', 'genome'),
        genome_path(), out_bedgraph)
    sys_call(cmd)
Example #13
def get_microRNA(_, out_mirna):
    """retrieve microRNA genes from UCSC"""
    url = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/wgRna.txt.gz'
    url = url % cfg.get('DEFAULT', 'genome')
    sys_call('wget -N -P . %s' % url)
    sys_call('gunzip -f wgRna.txt.gz')
    with open(out_mirna, 'w') as outfile:
        for line in open('wgRna.txt'):
            (bin, chrom, start, end, name, score,
             strand, thickStart, thickEnd, type) = line.strip().split('\t')
            outfile.write('\t'.join([chrom, start, end, name + '_' + type, score, strand]) + '\n')
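The same retrieval works without wget/gunzip; a standard-library sketch of the download-and-reformat step (Python 3 spelling of urllib shown, unlike the Python 2 code above; the URL and the wgRna column order are the ones the function uses):

import gzip
import io
import urllib.request

def fetch_mirna(genome, out_mirna):
    url = ('http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/wgRna.txt.gz'
           % genome)
    raw = urllib.request.urlopen(url).read()
    with open(out_mirna, 'w') as outfile:
        for line in gzip.GzipFile(fileobj=io.BytesIO(raw)):
            (_, chrom, start, end, name, score, strand,
             _, _, rna_type) = line.decode('ascii').rstrip('\n').split('\t')
            outfile.write('\t'.join([chrom, start, end, name + '_' + rna_type,
                                     score, strand]) + '\n')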
Example #14
def get_ssaha2_hashtable(in_genome, out_ssaha2):
    """Use ssaha2Build to generate a hash table for the genetic
       sequences stored in an input .fasta file
        
       ssaha2Build writes five files to disk, each preceded by
       the hash name. Their file extensions are:
       .base, .body, .head, .name, .size
    """
    
    #TODO: add useful parameters to cmd and config file
    cmd = 'ssaha2Build -save %s %s' % (out_ssaha2, in_genome)
    sys_call(cmd)
Example #15
def run_glitr(in_files, out_peaks):
    """Call peaks with GLITR"""
    in_treat = filter(lambda f: '.treat.' in f, in_files)[0]
    in_control = filter(lambda f: '.control.' in f, in_files)[0]
    glitr_dir = in_treat + '.GLITR_out'
    cmd = ('rm -r %s; mkdir %s; cd %s; GLITR.pl --CHIP=../%s ' + \
            '--CONTROL=../%s --GENOME=%s %s ') % (
                glitr_dir, glitr_dir, glitr_dir, in_treat, in_control,
                cfg.get('DEFAULT', 'genome').upper(),
                cfg.get('peaks', 'glitr_params'))
    sys_call(cmd)
    sys_call('cp %s/allChIP.FDR_*PercentFDR %s' % (glitr_dir, out_peaks))
Example #16
def run_ssaha2(in_fastq, out_ssaha2):
    """ Runs ssaha2 command using the prebuilt hash table from
        get_ssaha2_hashtable. 
    
        The ssaha2 command maps DNA sequence reads onto a genomic 
        reference sequence using a combination of word hashing and 
        dynamic programming. (From ssaha2 manual)
    """
    #TODO: add useful parameters to cmd and config file
    #cmd = 'ssaha2 -outfile %s -save %s %s' % (out_ssaha2, hash_name, in_fastq)
    cmd = 'ssaha2 -outfile %s -disk 1 -save %s %s'
    cmd = cmd % (out_ssaha2, (cfg.get('mapping', 'ssaha2_hash_name')), in_fastq)
    sys_call(cmd)
Example #17
def run_macs(in_files, out_peaks, max_fdr):
    """Call peak with MACS (v1.3).
    Apply a maximum FDR threshold and treat centers as peak summits
    
    """
    in_treat, in_control = in_files[0]
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs.peaks'
    max_fdr = cfg.getfloat('peaks', 'max_FDR')
    cmd = 'macs -t %s -c %s --name=%s %s' % (in_treat, in_control, name,
                                               cfg.get('peaks', 'macs_params'))
    sys_call(cmd)
    
    # convert to proper bedfile- ints for score and + for strand
    with open(out_peaks, 'w') as outfile:
        with open(name + '_peaks.xls') as infile:
            for index, line in enumerate(itertools.ifilter(
                                        bedCommentFilter, infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                start = str(max(0, int(fields[1])))
                score = str(max(0, min(1000, int(float(fields[6])))))
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], start, fields[2],
                                        'MACS_peak_%s' % (index + 1), score]) +
                                    '\t+\n')
    # take region surrounding the peak center as the summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    with open(out_peaks + '_summits.%s_around' % \
                        cfg.get('peaks', 'peak_summit_size'), 'w') as outfile:
        with open(name + '_peaks.xls') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                score = str(max(0, min(1000, int(float(fields[6])))))
                p_start, p_stop = max(0, int(fields[1])), int(fields[2])
                p_center = p_start + (p_stop - p_start) / 2
                s_start = p_center - summit_size / 2
                s_stop = p_center + summit_size / 2
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], str(s_start),
                                    str(s_stop),
                                    'MACS_peak_%s' % (index + 1), score])
                                        + '\t+\n')
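The score handling repeated throughout these MACS wrappers is a clamp of the reported score into the 0..1000 integer range that BED expects; factored out for clarity (a helper sketch, not in the original module):

def bed_score(raw_score):
    # clamp a MACS score into the integer 0..1000 range required by BED
    return str(max(0, min(1000, int(float(raw_score)))))

# bed_score('1543.2') -> '1000'; bed_score('87.6') -> '87'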
Example #18
def get_nearest_features(in_files, _, out_pattern):
    """Calculate the distance from each peak to the nearest features"""
    print in_files
    print out_pattern
    in_peaks, chrom_sizes, all_features = in_files[0], in_files[1], in_files[2:]
    if len(all_features) == 0:
        raise RuntimeError("No features present to compare to!")
    # get distances for each feature
    tmp_output = tempfile.NamedTemporaryFile(delete=False)
    for in_feature in all_features:
        distances = []
        all_distances = []
        cmd = 'closestBed -a %s -b %s -t first -D ref > %s' % (
            in_peaks, in_feature, tmp_output.name)
        sys_call(cmd, file_log=False)
        with open(tmp_output.name) as infile:
            for line in infile:
                if not line:
                    continue
                fields = line.strip().split('\t')
                dist = int(fields[-1])
                #if int(fields[1]) < int(fields[7]):
                #    dist *= -1
                distances.append(dist)
        all_distances.append(distances)

        cmd = 'shuffleBed -chrom -i %s -g %s | closestBed -a stdin -b %s -t first -D ref > %s' % (
            in_peaks, chrom_sizes, in_feature, tmp_output.name)
        sys_call(cmd, file_log=False)
        distances = []
        with open(tmp_output.name) as infile:
            for line in infile:
                if not line:
                    continue
                fields = line.strip().split('\t')
                dist = int(fields[-1])
                #if int(fields[1]) < int(fields[7]):
                #    dist *= -1
                distances.append(dist)
        all_distances.append(distances)
        with open(out_pattern % in_feature, 'w') as outfile:
            outfile.write('\t'.join([in_feature, 'Random']) + '\n')  # header
            for d in zip(*all_distances):
                outfile.write('\t'.join(map(str, d)) +
                              '\n')  # distance as column
    os.unlink(tmp_output.name)
Example #19
def find_nearby_genes(in_files, out_genes):
    """report which genes are within a certain distance of a peak"""
    in_peaks, in_genes = in_files[0]
    tmp_output = tempfile.NamedTemporaryFile(delete=False).name
    cmd = 'closestBed -a %s -b %s -t first -d > %s' % (in_peaks, in_genes,
                                                       tmp_output)
    sys_call(cmd)
    with open(tmp_output) as infile:
        with open(out_genes, 'w') as outfile:
            for line in infile:
                if not line:
                    continue
                fields = line.strip().split('\t')
                dist = int(fields[-1])
                if abs(dist) <= cfg.getint('genes', 'nearby_genes_max_dist'):
                    outfile.write(line)
    os.unlink(tmp_output)
Example #20
def run_macs14(in_files, out_peaks, max_fdr):
    """Call peaks using MACS (v1.4). Apply a maximum FDR threshold."""
    in_treat, in_control = in_files[0]
    matches = re.search(r'(.*\.treat)(.*)\.mapped_reads', in_treat).groups()
    name = matches[0] + matches[1] + '.macs14.peaks'
    cmd = 'macs14 -t %s -c %s --name=%s %s --diag' % (in_treat, in_control, name,
                                             cfg.get('peaks', 'macs14_params'))
    sys_call(cmd)
    peaks_to_keep = set()
    # convert to proper bedfile- ints for score and + for strand
    with open(out_peaks, 'w') as outfile:
        with open(name + '_peaks.xls') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                start = str(max(0, int(fields[1])))
                score = str(max(0, min(1000, int(float(fields[6])))))
                fdr = float(fields[8])
                if fdr <= max_fdr:
                    outfile.write('\t'.join([fields[0], start, fields[2],
                                        'MACS14_peak_%s' % (index + 1), score])
                                                + '\t+\n')
                    peaks_to_keep.add(index)
    # take region surrounding the peak summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    with open(out_peaks + '_summits.%s_around' % \
                        cfg.get('peaks', 'peak_summit_size'), 'w') as outfile:
        with open(name + '_summits.bed') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                # score is number of reads at summit
                score = str(max(0, min(1000, int(float(fields[-1])))))
                start = str(max(0, int(fields[1]) - summit_size / 2))
                stop = str(int(fields[2]) + summit_size / 2)
                if index in peaks_to_keep:
                    outfile.write('\t'.join([fields[0], start, stop,
                                        'MACS_peak_%s' % (index + 1), score])
                                            + '\t+\n')
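Note that summit_size / 2 (and the p_center arithmetic in run_macs) relies on Python 2 integer division; under Python 3 the same expressions yield floats and would write coordinates like '1050.0' into the BED file. Floor division is the portable spelling:

summit_size = 400                          # hypothetical peak_summit_size
s_start = max(0, 1250 - summit_size // 2)  # 1050 on both Python 2 and 3
s_stop = 1250 + summit_size // 2           # 1450; with '/' Python 3 gives 1450.0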
Example #21
def get_refseq_genes(_, out_genes):
    """Download refseq genes from UCSC and reformat as BED"""
    url = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/refGene.txt.gz'
    url = url % cfg.get('DEFAULT', 'genome')
    sys_call('wget -N -P . %s' % url)
    sys_call('gunzip -f refGene.txt.gz')
    sys_call('mv refGene.txt %s' % out_genes)
Example #22
def run_macs14_no_control(in_treat, out_peaks):
    """Call peaks using MACS (v1.4) without control data"""
    cmd = 'macs14 -t %s --name=%s %s' % (in_treat, out_peaks,
                                         cfg.get('peaks', 'macs14_params'))
    sys_call(cmd)
    peaks_to_keep = set()
    # convert to proper bedfile- ints for score and + for strand
    with open(out_peaks, 'w') as outfile:
        with open(out_peaks + '_peaks.xls') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                start = str(max(0, int(fields[1])))
                score = str(max(0, min(1000, int(float(fields[6])))))
                outfile.write('\t'.join([fields[0], start, fields[2],
                                        'MACS14_peak_%s' % (index + 1), score])
                                                + '\t+\n')
                peaks_to_keep.add(index)
    # take region surrounding the peak summit
    summit_size = cfg.getint('peaks', 'peak_summit_size')
    with open(out_peaks + '_summits.%s_around' % \
                        cfg.get('peaks', 'peak_summit_size'), 'w') as outfile:
        with open(out_peaks + '_summits.bed') as infile:
            for index, line in enumerate(itertools.ifilter(bedCommentFilter,
                                                                    infile)):
                fields = line.strip().split('\t')
                if fields[0] == 'chr':
                    continue # skip header
                # score is number of reads at summit
                score = str(max(0, min(1000, int(float(fields[-1])))))
                start = str(max(0, int(fields[1]) - summit_size / 2))
                stop = str(int(fields[2]) + summit_size / 2)
                if index in peaks_to_keep:
                    outfile.write('\t'.join([fields[0], start, stop,
                                        'MACS_peak_%s' % (index + 1), score])
                                            + '\t+\n')
Example #23
def maq_map_reads(in_files, out_map):
    """ Use maq match to align the reads to the reference.
    Input files are in binary format and output is in .map format.
    """
    cmd = 'maq match %s %s %s' % (out_map, in_files[0], in_files[1])
    sys_call(cmd)
Example #24
def get_polyA_DB(_, out_db, genome_build):
    cmd = r"curl 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/database/polyaDb.txt.gz' | gunzip - | cut -d $'\t' -f 2- > %s"
    cmd = cmd % (genome_build, out_db)
    sys_call(cmd, file_log=False)
Example #25
def wig_to_bigwig(in_wig, out_bigwig):
    """Convert the wig file to a bigwig file"""
    cmd = 'wigToBigWig %s %s.chrom.sizes %s' % (in_wig, genome_path(), out_bigwig)
    sys_call(cmd)
Example #26
def bedgraph_to_bigwig(in_bedgraph, out_bigwig):
    """Convert the bedgraph file to .bigwig for viewing on UCSC"""
    cmd = 'bedGraphToBigWig %s %s.chrom.sizes %s' % (in_bedgraph, genome_path(),
                                                     out_bigwig)
    sys_call(cmd)
Example #27
def remove_terminal_exon(in_files, out_bed):
    """Remove all exons but the last one using intersectBed"""
    in_bed, exon_file = in_files
    cmd = 'intersectBed -v -a %s -b %s > %s' % (in_bed, exon_file, out_bed)
    sys_call(cmd, file_log=False)
Example #28
def convert_gff3_genes_to_bed(in_gff3, out_gene_pred):
    """convert gff3 genes to UCSC's genePred format"""
    sys_call('gff3ToGenePred %s %s' % (in_gff3, out_gene_pred), file_log=False)
Example #29
def maq_index_reads(in_fastq, out_bfq):
    """ Use maq fastq2bfq to convert read sequences in .fastq
    format to BFQ format, which is a binary representation.
    """
    cmd = 'maq fastq2bfq %s %s' % (in_fastq, out_bfq)
    sys_call(cmd)
Example #30
def uniquefy_downsample_reads(in_files, out_files):
    """Uniquefy sequence reads then downsample so the total unique tag count in
    treatment and control is the same.  This may generate many downsampled datasets.
    """
    # WARNING: this is a circular dependency.  It has to be included at runtime
    #    Top-level import will cause this module to load only 1/2 way
    #    we import here because we need to call this function directly,
    #    and not just when using ruffus
    from hts_waterworks.visualize import bed_uniquefy
    if not cfg.getboolean('peaks', 'downsample_reads'):
        with log_mtx:
            log.debug('NOT downsampling the sequence reads!')
    else:
        in_treat, in_control = in_files
        out_treat_template = re.sub(r'mapped_reads$',
                                    'matched_size_%s.mapped_reads', in_treat)
        out_control_template = re.sub(r'mapped_reads$',
                                    'matched_size_%s.mapped_reads', in_control)
        if out_treat_template == in_treat:
            raise RuntimeError('regex substitution failed from %s to %s' % (
                                                in_treat, out_treat_template))
        if out_control_template == in_control:
            raise RuntimeError('regex substitution failed from %s to %s' % (
                                            in_control, out_control_template))
        tmp_t_sorted = tempfile.NamedTemporaryFile(delete=False).name
        tmp_c_sorted = tempfile.NamedTemporaryFile(delete=False).name
        tmp_t_unique = tempfile.NamedTemporaryFile(delete=False).name
        tmp_c_unique = tempfile.NamedTemporaryFile(delete=False).name
        
        # sort the reads
        bed_clip_and_sort(in_treat, tmp_t_sorted)
        bed_clip_and_sort(in_control, tmp_c_sorted)
        
        # uniquefy the reads
        bed_uniquefy(tmp_t_sorted, tmp_t_unique,
                     cfg.getint('visualization', 'uniquefy_track_max_reads'))
        bed_uniquefy(tmp_c_sorted, tmp_c_unique,
                     cfg.getint('visualization', 'uniquefy_track_max_reads'))
        
        total_treat = sum(1 for l in open(tmp_t_unique))
        total_control = sum(1 for l in open(tmp_c_unique))
        if total_treat == total_control:
            with log_mtx:
                log.debug('No downsampling required-- tag counts identical')
        else:
            # downsample num_down_sample times
            for i in xrange(cfg.getint('peaks', 'num_down_samples')):
                out_treat = out_treat_template % i
                out_control = out_control_template % i
                if total_treat > total_control:
                    # reduce number of treatment reads
                    inds_to_keep = set(random.sample(xrange(total_treat),
                                                                total_control))
                    in_orig, out_orig = tmp_c_unique, out_control
                    in_subset, out_subset = tmp_t_unique, out_treat
                else:
                    # reduce number of control reads
                    inds_to_keep = set(random.sample(xrange(total_control),
                                                     total_treat))
                    in_orig, out_orig = tmp_t_unique, out_treat
                    in_subset, out_subset = tmp_c_unique, out_control
                sys_call('cp %s %s' % (in_orig, out_orig))
                # subset the tags
                with open(in_subset) as infile:
                    with open(out_subset, 'w') as outfile:
                        outfile.writelines(line for i, line in enumerate(infile) 
                                                        if i in inds_to_keep)
        for f in [tmp_t_sorted, tmp_t_unique, tmp_c_sorted, tmp_c_unique]:
            os.unlink(f)
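Stripped of the temp-file plumbing, the downsampling core is just sampling line indices without replacement; a compact sketch of that idea:

import random

def downsample_lines(in_path, out_path, keep_count):
    """Keep a uniform random subset of keep_count lines, preserving order."""
    total = sum(1 for _ in open(in_path))
    keep = set(random.sample(range(total), min(keep_count, total)))
    with open(in_path) as infile, open(out_path, 'w') as outfile:
        outfile.writelines(line for i, line in enumerate(infile) if i in keep)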
Example #31
def maq_view_reads(in_map, out_map):
    """ Use maq mapview to generate a human readable .map
    format.
    """
    cmd = 'maq mapview %s > %s' % (in_map, out_map)
    sys_call(cmd)
Example #32
def bed_to_bigbed(in_bed, out_bigbed):
    """Convert a BED file to .bigbed for viewing on UCSC browser"""
    cmd = "bedToBigBed %s %s.chrom.sizes %s" % (in_bed, genome_path(), out_bigbed)
    sys_call(cmd)
Example #33
def maq_index_reference(in_fasta, out_bfa):
    """ Use maq fasta2bfa to convert reference sequences in .fasta
    format to BFA format, which is a binary representation.
    """
    cmd = 'maq fasta2bfa %s %s' % (in_fasta, out_bfa)
    sys_call(cmd)
Example #34
def merge_adjacent_reads(in_bed, out_pattern, window_width, iterations,
                         out_merged, out_pileup, min_read_count):
    """Reassign read ends to a weighted average of adjacent reads"""
    # helper functions for parsing bed files
    filter_lines = lambda l: l.strip() and (not l.startswith('#') or \
                                            l.startswith('"'))
    read_bed_lines = lambda infile: itertools.ifilter(filter_lines, infile)
    
    # sort the input by chrom, stop
    tmpfile = in_bed + '.merged_adjacent_sorted'
    cmd = r"sort -t $'\t' -k 1,1 -k 3g,3 %s > %s" % (in_bed, tmpfile)
    print cmd
    sys_call(cmd, file_log=False)
    p_file = tmpfile
    outfile_pileup = None  # used on last iteration to generate the final pileup
    
    for i in range(iterations):
        print 'merge iteration %s' % i
        # read in from output of previous iteration
        infile = read_bed_lines(open(p_file))
        
        # output to a temp file except on the last iteration
        if i != iterations - 1:
            p_file = in_bed + '.merge_adjacent_%s' % i
        else:
            p_file = out_merged
            outfile_pileup = open(out_pileup, 'w')
        outfile = open(p_file, 'w')

        # parse first line
        (chrom, start, stop, name,
                        score, strand) = infile.next().strip().split('\t')[:6]
        if strand == '+':
            p_chrom, p_stops, p_names, p_strands = (chrom, [int(stop)],
                                                    [name], [strand])
        else:
            p_chrom, p_stops, p_names, p_strands = (chrom, [int(start)],
                                                    [name], [strand])
        print 'first line:', chrom, start, stop, name, score, strand
        
        for index, line in enumerate(infile):
            try:
                (chrom, start, stop,
                    name, score, strand) = line.strip().split('\t')[:6]
            except:
                print index, 'this line:', line
                raise
            if strand == '+':
                stop = int(stop)
            else:
                stop = int(start) + 1
            # is next read too far from first recorded?
            if p_chrom != chrom or (len(p_stops) > 0 and
                                    abs(p_stops[0] - stop) > window_width):
                if len(p_stops) == 0 or len(p_names) == 0:
                    print 'error!'
                    print line
                    print p_stops, p_names, p_strands
                    raise RuntimeError('empty read cluster at input line %s' % index)
                if len(p_stops) > min_read_count:
                    avg = int(round(sum(p_stops) / float(len(p_stops))))
                    # write out reads in this cluster, using avg as coordinate
                    outfile.writelines('\t'.join([p_chrom, str(max(0, avg-1)), str(avg),
                                             n_name, '0', n_strand]) + '\n'
                                  for n_name, n_strand in zip(p_names, p_strands))
                    if outfile_pileup is not None:
                        outfile_pileup.write('\t'.join([p_chrom, str(max(0, avg-1)), str(avg),
                                               p_names[0], str(len(p_stops)),
                                               p_strands[0]]) + '\n')
                # reset our record
                p_chrom = chrom
                p_stops = [stop]
                p_names = [name]
                p_strands = [strand]
            # otherwise, the next read is within the window, on same chrom
            else:
                p_stops.append(stop)
                p_names.append(name)
                p_strands.append(strand)

        # output anything left in queue after EOF
        if len(p_stops) > 0:
            avg = int(round(sum(p_stops) / float(len(p_stops))))
            # write out reads in this cluster, using avg as coordinate
            outfile.writelines('\t'.join([chrom, str(max(0, avg-1)), str(avg),
                                     n_name, '0', n_strand]) + '\n'
                          for n_name, n_strand in zip(p_names, p_strands))
            if outfile_pileup is not None:
                outfile_pileup.write('\t'.join([chrom, str(max(0, avg-1)), str(avg),
                                           p_names[0], str(len(p_stops)),
                                           p_strands[0]]) + '\n')
        if outfile_pileup is not None:
            outfile_pileup.close()
        outfile.close()
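Both flush branches above compute the cluster coordinate the same way: the mean of the recorded read ends, rounded to the nearest base, emitted as the interval [max(0, avg - 1), avg). Isolated for clarity (a one-function sketch):

def cluster_coordinate(p_stops):
    # mean end coordinate of a read cluster, rounded to the nearest base
    return int(round(sum(p_stops) / float(len(p_stops))))

# cluster_coordinate([100, 103, 105]) -> 103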