Example #1
def calculate_conversions(loci,
                          samples,
                          max_k=1,
                          antisense=False,
                          genome_fa=None,
                          read_select_fn=None):
    """
    conv[k-1][sampleID][featID][base][position] = [(wt, mut, other), ...]
    where k = number of linked bases
    """

    try:
        chr_seqs = ru.load_chr_seqs(genome_fa)
    except IOError:
        sys.exit('***ERROR: Genome fasta not found: %s, exiting' % genome_fa)

    all_profiles = []
    bases = ['C', 'G']

    for k_index in range(max_k):
        k = k_index + 1
        profiles = {}
        for counter, (feat_id, feat_data) in enumerate(loci.items()):
            chr, strand, exons = feat_data[:3]
            start = exons[0][0]
            end = exons[-1][1]
            coord = '%s:%d-%d' % (chr, start, end)
            tx_coords = ru.tx_indexing(exons, strand == '-')
            tx_coords_inv = ru.tx_indexing(exons, strand == '-', inverse=True)

            ref_chr_seq = chr_seqs[chr]

            if antisense:
                strand = ru.anti_strand_str[strand]

            for sample_label, bamfile_dict in sorted(samples.items()):
                if sample_label not in profiles:
                    profiles[sample_label] = {}

                if feat_id not in profiles[sample_label]:
                    profiles[sample_label][feat_id] = {}
                    profiles[sample_label][feat_id]['C'] = [
                        [] for i in range(1 + max(tx_coords.values()))
                    ]
                    profiles[sample_label][feat_id]['G'] = [
                        [] for i in range(1 + max(tx_coords.values()))
                    ]

                bamfiles = bamfile_dict[strand]
                for btuple in bamfiles:
                    for bamfile, base in zip(btuple, bases):
                        conv = conversion_blocks(base, [bamfile],
                                                 coord,
                                                 strand,
                                                 ref_chr_seq,
                                                 SAMFLAGS[strand],
                                                 read_select_fn,
                                                 k=k,
                                                 Fsamflag=FSAMFLAGS[strand])

                        for i, x in enumerate(
                                profiles[sample_label][feat_id][base]):
                            genomic_coord = tx_coords_inv.get(i, -1)
                            if genomic_coord > -1:
                                profiles[sample_label][feat_id][base][
                                    i].append(
                                        conv.get(genomic_coord, [0, 0, 0]))

            #Status counter
            if counter % 50 == 0:
                sys.stderr.write(
                    '%d out of %d regions completed for linked_bases = %d\n' %
                    (counter, len(loci), k))
        all_profiles.append(profiles)
    return all_profiles
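
# A toy walk over the returned layout (made-up counts; 'sampleA' and 'gene1'
# are hypothetical), pooling the per-BAM (wt, mut, other) tuples at each
# transcript position into a mutated fraction:
import numpy as np

all_profiles = [{
    'sampleA': {
        'gene1': {
            'C': [[(10, 2, 0)], [(8, 1, 1)]],  # one count tuple per BAM file
            'G': [[(12, 0, 0)], [(9, 3, 0)]],
        }
    }
}]

k = 1  # number of linked bases
for sample, feats in all_profiles[k - 1].items():
    for feat_id, by_base in feats.items():
        for base, positions in by_base.items():
            mut_frac = [
                sum(m for _, m, _ in tuples) /
                max(1, sum(w + m + o for w, m, o in tuples))
                for tuples in positions
            ]
            print(sample, feat_id, base, np.round(mut_frac, 3))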
Example #2
def calculate_frag_stats(loci, samples, buffer=75, antisense=False):
    """
    Calculates the count, mean, median, sd of apparent fragment length
    per locus

    Return is a tuple of dict: frag_stats[sampleID][featID] = (N, mean, median, sd)

    and combined_stats[sampleID] = (N, mean, median, sd)
    """

    frag_lengths = {}  #per feature per sample
    combined_lengths = {}  #pooled over all features per sample

    frag_stats = {}
    combined_stats = {}
    combined_hist = {}

    frag_counts = {}  #per feature per sample

    for k, (feat_id, feat_data) in enumerate(loci.items()):
        chr, strand, exons = feat_data[:3]
        start = exons[0][0]
        end = exons[-1][1]
        coord = '%s:%d-%d' % (chr, start, end)
        tx_coords = ru.tx_indexing(exons, strand == '-')
        tx_coords_inv = ru.tx_indexing(exons, strand == '-', inverse=True)

        feat_len = max(tx_coords_inv.keys())
        if antisense:
            strand = ru.anti_strand_str[strand]

        for sample_label, bamfile_dict in sorted(samples.items()):
            if sample_label not in frag_lengths:
                frag_lengths[sample_label] = {}
                combined_lengths[sample_label] = []
                frag_stats[sample_label] = {}
                combined_stats[sample_label] = []
                combined_hist[sample_label] = {}
                frag_counts[sample_label] = {}

            if feat_id not in frag_lengths[sample_label]:
                frag_lengths[sample_label][feat_id] = []
                frag_counts[sample_label][feat_id] = {}

            bamfiles = [
                bfile for btuple in bamfile_dict[strand] for bfile in btuple
            ]
            length_list = []
            samflags = SAMFLAGS[strand]
            Fsamflag = FSAMFLAGS[strand]

            gc = tx_pileup(bamfiles,
                           coord,
                           tx_coords,
                           samflags,
                           raw_frags=True,
                           Fsamflag=Fsamflag)
            for s, e in gc:
                if s > buffer and e < feat_len - buffer:
                    length_list.append(e - s)
                frag_counts[sample_label][feat_id][(
                    s, e)] = frag_counts[sample_label][feat_id].get(
                        (s, e), 0) + 1

            #update
            frag_lengths[sample_label][feat_id] += length_list
            combined_lengths[sample_label] += length_list

        #Statistics per feature
        for sample_label in frag_lengths.keys():
            a = frag_lengths[sample_label][feat_id]
            if len(a) > 0:
                frag_stats[sample_label][feat_id] = (len(a), np.mean(a),
                                                     np.median(a), np.std(a))
            else:
                frag_stats[sample_label][feat_id] = (0, float('nan'),
                                                     float('nan'),
                                                     float('nan'))

            #Number of times each fragment appears
            a = list(frag_counts[sample_label][feat_id].values())
            frag_counts[sample_label][feat_id] = np.bincount(a)

    #Statistics per sample
    for sample_label, numbers in combined_lengths.items():
        if len(numbers) > 0:
            combined_stats[sample_label] = (len(numbers), np.mean(numbers),
                                            np.median(numbers),
                                            np.std(numbers))
        else:
            combined_stats[sample_label] = (0, float('nan'), float('nan'),
                                            float('nan'))
        combined_hist[sample_label] = np.bincount(numbers)

    return frag_stats, combined_stats, combined_hist, frag_counts
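
# A self-contained sketch (toy numbers) of the two summaries computed above:
# the (N, mean, median, sd) tuple and the fragment-multiplicity bincount.
import numpy as np

lengths = [180, 195, 210, 200, 188]  # toy apparent fragment lengths
print((len(lengths), np.mean(lengths), np.median(lengths), np.std(lengths)))
# -> (5, 194.6, 195.0, ~10.23)

# frag_counts logic: count occurrences of each (start, end) pair, then
# histogram those occurrence counts.
frags = [(10, 200), (10, 200), (15, 190)]
occ = {}
for f in frags:
    occ[f] = occ.get(f, 0) + 1
print(np.bincount(list(occ.values())))  # index i = fragments seen i times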
Example #3
def joint_conversions(loci,
                      profiles,
                      expt,
                      ctrl,
                      outfile=None,
                      z_thresh=1,
                      g_fdr=0.05,
                      find_depletions=True):
    """
    E.g., mono, di, tri base alleles simultaneously and non-redundantly.
    """
    if outfile:
        fw = open(outfile, 'w')

    convs = []
    try:
        for i, profile in enumerate(profiles):
            convs.append(
                conversions(loci,
                            profile,
                            expt,
                            ctrl,
                            z_thresh=z_thresh,
                            g_fdr=g_fdr,
                            mult=i + 1,
                            find_depletions=find_depletions))
    except Exception:
        sys.exit(
            '***ERROR: Improperly formatted data file? (should be a _conversions_ pickle)'
        )


#    fields = ['feat_id', 'label', 'coord', 'strand', 'base', 'mult', 'pos', 'z_expt', 'log_diff', 'g_pool', 'g_pool_p', 'g_pool_padj', 'g_het_expt', 'g_het_expt_p', 'g_het_expt_padj', 'g_het_ctrl', 'g_het_ctrl_p', 'g_het_ctrl_padj', 'seq']
    fields = [
        'feat_id', 'label', 'coord', 'strand', 'base', 'linked_bases', 'pos',
        'z_expt', 'log_diff', 'g_pool', 'g_pool_p', 'g_pool_padj', 'seq'
    ]
    if outfile:
        fw.write('\t'.join(fields) + '\n')

    joint_convs = {}
    for feat_id, feat_data in loci.items():
        chr, strand, exons, seq = feat_data
        tx_coords = ru.tx_indexing(exons, strand == '-', inverse=True)

        positions = {}
        for conv_dict in convs:
            for conv in conv_dict[feat_id]:
                for index in conv['pos']:
                    if index not in positions:
                        positions[index] = []
                    positions[index].append((conv['g_pool_padj'], conv))
        filtered_positions = {}
        for index, conv_list in positions.items():
            #Keep the conversion with lowest p value
            conv_list.sort(key=lambda x: x[0])
            best = conv_list[0][1]

            if best['label'] not in filtered_positions:
                filtered_positions[best['label']] = (best['pos'], best)
            for pval, conv in conv_list[1:]:
                filtered_positions[conv['label']] = ((-1, ), None)

        joint_convs[feat_id] = []
        # Sort by start position; the key avoids comparing conv dicts on ties
        for index, conv in sorted(filtered_positions.values(),
                                  key=lambda x: x[0]):
            if conv:
                #Genomic coordinate
                g_start = tx_coords[conv['pos'][0]]
                g_end = tx_coords[conv['pos'][-1]]
                if g_start > g_end:
                    g_start, g_end = g_end, g_start
                conv['coord'] = '%s:%d-%d' % (chr, g_start + 1, g_end + 1)
                conv['strand'] = strand

                joint_convs[feat_id].append(conv)

                if outfile:
                    out_line = [ru.pretty_str(conv[x]) for x in fields]
                    fw.write('\t'.join(out_line) + '\n')

    if outfile:
        fw.close()

    return joint_convs
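
# A toy sketch of the per-position selection above: conversion dicts are
# made up (only the fields the filter touches), and at each position only
# the call with the lowest adjusted p-value survives.
calls = [
    {'label': 'c1', 'pos': (5,),   'g_pool_padj': 0.04},
    {'label': 'c2', 'pos': (5, 6), 'g_pool_padj': 0.01},  # di-base call, lower p
]

positions = {}
for conv in calls:
    for index in conv['pos']:
        positions.setdefault(index, []).append((conv['g_pool_padj'],
                                                conv['label']))

for index, conv_list in sorted(positions.items()):
    conv_list.sort(key=lambda x: x[0])  # lowest adjusted p-value first
    print(index, '->', conv_list[0][1])  # 5 -> c2, 6 -> c2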
Example #4
def calculate_coverage(loci,
                       samples,
                       antisense=False,
                       frag_sizes=None,
                       sampling_rate=1,
                       paired=True,
                       rmdup_strictest=False,
                       dup_limit=0,
                       strict_dup_filtering=False):
    """
    For each sample, calculates coverage spanning each read pair
    (including the insert). Insert is assumed to be the region
    from the end of read 1 to start of read 2, ignoring any
    potential intron.

    For bisulfite, coverage is calculated separately for each conversion event.

    Coverage is stored in the profiles dict. profiles[sampleID][featID][position] = count_list
    Totals are totals[sampleID] = [+ counts, - counts]
    
    """
    profiles = {}
    total_counts = {}

    for k, (feat_id, feat_data) in enumerate(loci.items()):
        chr, strand, exons = feat_data[:3]
        start = exons[0][0]
        end = exons[-1][1]
        coord = '%s:%d-%d' % (chr, start, end)
        tx_coords = ru.tx_indexing(exons, strand == '-')

        if antisense:
            strand = ru.anti_strand_str[strand]

        for sample_label, bamfile_dict in sorted(samples.items()):
            if sample_label not in profiles:
                profiles[sample_label] = {}
                total_counts[sample_label] = [[], []]  #strand separated +, -

            if feat_id not in profiles[sample_label]:
                profiles[sample_label][feat_id] = [
                    [] for i in range(1 + max(tx_coords.values()))
                ]

            bamfiles = [
                bfile for btuple in bamfile_dict[strand] for bfile in btuple
            ]
            for bamfile in bamfiles:
                sample_cov = tx_pileup(
                    [bamfile],
                    coord,
                    tx_coords,
                    SAMFLAGS[strand],
                    frag_sizes=frag_sizes,
                    dup_limit=dup_limit,
                    strict_dup_filtering=strict_dup_filtering,
                    sampling_rate=sampling_rate,
                    Fsamflag=FSAMFLAGS[strand])
                for i, x in enumerate(profiles[sample_label][feat_id]):
                    profiles[sample_label][feat_id][i].append(sample_cov[i])

        #Update total counts
        if strand == '+':
            j = 0
        else:
            j = 1

        for sample_label, feat_dict in profiles.items():
            for counts in feat_dict[feat_id]:
                if not counts:
                    continue
                if len(total_counts[sample_label][j]) == 0:
                    total_counts[sample_label][j] = np.array(counts)
                else:
                    total_counts[sample_label][j] = np.add(
                        total_counts[sample_label][j], counts)

        #Status counter
        if k % 50 == 0:
            sys.stderr.write('%d out of %d regions completed\n' %
                             (k, len(loci)))

    return profiles, total_counts
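
# A minimal sketch (toy coverage vectors) of the strand-separated total-count
# accumulation above: the first vector seeds the array, later ones are added
# elementwise.
import numpy as np

total_counts = [[], []]  # [+ strand, - strand]
j = 0                    # '+' strand
for counts in [[3, 5, 2], [1, 0, 4]]:
    if len(total_counts[j]) == 0:
        total_counts[j] = np.array(counts)
    else:
        total_counts[j] = np.add(total_counts[j], counts)
print(total_counts[j])  # [4 5 6]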
Example #5
def region_finder(loci,
                  profiles,
                  expt,
                  ctrl,
                  ratio_fn=profile_ratio,
                  count_fn=raw_profile_replicates,
                  motifs=None,
                  lfc_thresh=math.log(1.25, 2),
                  low_conf_lfc_thresh=math.log(1.25, 2),
                  min_peak_width=5,
                  min_ctrl_coverage=0,
                  min_expt_coverage=0,
                  outfile_root='',
                  peaks_only=False,
                  do_bedgraph=True,
                  write_bedgraph_header=True):
    """
    Iterate through ratios to identify responsive regions
    (peaks/valleys)

    Minimal return: regions[feat_id] = [(start, end, label), ...]
    """
    regions = {}
    ratio_cache = []

    try:
        pr = ratio_fn(loci, profiles, expt, ctrl)
    except Exception:
        sys.exit(
            '***ERROR: Improperly formatted data file? (should be a _coverage_ pickle)'
        )

    for feat_id, feat_data in loci.items():
        chr, strand, exons, seq = feat_data
        tx_coords = ru.tx_indexing(exons, strand == '-', inverse=True)

        #Find responsive regions
        rr = responsive_regions(pr[feat_id], threshold=low_conf_lfc_thresh)

        #Update the ratio list
        if do_bedgraph:
            ratio_cache += bedgraph_entries(chr, pr[feat_id], tx_coords)

        #Identify motif locations
        feat_motifs = motif_locs(seq, motifs, window=1)

        #Get raw counts
        all_expt_counts = count_fn(feat_id, profiles, expt)
        all_ctrl_counts = count_fn(feat_id, profiles, ctrl)

        #For each responsive region, calculate raw reads
        for r_data in rr:
            #Get raw counts at the critical point
            r_data['expt_counts'] = all_expt_counts[r_data['crit_pt']]
            r_data['ctrl_counts'] = all_ctrl_counts[r_data['crit_pt']]

            r_data['sum_expt_counts'] = sum(r_data['expt_counts'])
            r_data['sum_ctrl_counts'] = sum(r_data['ctrl_counts'])

            if r_data['sum_ctrl_counts'] < min_ctrl_coverage or \
               r_data['sum_expt_counts'] < min_expt_coverage or \
               r_data['width'] < min_peak_width or \
               abs(r_data['value']) < lfc_thresh or \
               (peaks_only and r_data['type'] == 'valley'):
                continue

            r_data['expt'] = expt
            r_data['ctrl'] = ctrl
            r_data['feat_id'] = feat_id
            r_data['long_label'] = r_data['feat_id'] + '_' + r_data['label']
            r_data['dist_from_end'] = min(r_data['bound'][0] - 1,
                                          len(seq) - r_data['bound'][1] - 1)
            r_data['crit_pt_dist_from_end'] = min(r_data['crit_pt'],
                                                  len(seq) - r_data['crit_pt'])

            g_start, g_end = tx_coords[r_data['bound'][0]], tx_coords[
                r_data['bound'][1]]
            if strand == '-':
                g_start, g_end = g_end, g_start

            r_data['chr'] = chr
            r_data['genomic_start'] = g_start
            r_data['genomic_end'] = g_end + 1
            r_data['strand'] = strand
            r_data['coord'] = '%s:%d-%d' % (chr, g_start + 1, g_end + 1)

            r_data['seq'] = seq[r_data['bound'][0]:(1 + r_data['bound'][1])]

            #Get the sequence of the summit
            flank_amt = 10
            summit_seqs = []
            min_s = len(seq)
            max_e = 0
            for s, e in r_data['summit']:
                sseq = seq[s:e]
                l_bound = max(0, s - flank_amt)
                flank_l = seq[l_bound:s].lower()
                r_bound = min(e + flank_amt, len(seq))
                flank_r = seq[e:r_bound].lower()
                summit_seqs.append(flank_l + sseq + flank_r)
                if l_bound < min_s:
                    min_s = l_bound
                if r_bound > max_e:
                    max_e = r_bound

            r_data['summit_seq'] = summit_seqs

            #15nt +/-
            flank_addition = 15
            r_data['spanning_summit_seq'] = seq[
                max(0, min_s - flank_addition):min(max_e +
                                                   flank_addition, len(seq))]

            #Annotate motifs
            overlapping_motifs = []
            for s, e, short_name, m in feat_motifs:
                if r_data['bound'][0] < s and e < r_data['bound'][1]:
                    overlapping_motifs.append((s, short_name))
            r_data['motifs'] = overlapping_motifs

            if feat_id not in regions:
                regions[feat_id] = []
            regions[feat_id].append(r_data)

    if outfile_root:
        output_region_data(regions, outfile_root + '.txt')
        output_region_bed(regions, outfile_root + '.bed')
        output_region_seq(regions, outfile_root, do_valley=not peaks_only)

        if do_bedgraph:
            if write_bedgraph_header:
                header_name = '%s/%s' % (expt, ctrl)
            else:
                header_name = ''
            output_region_bedgraph(ratio_cache,
                                   outfile_root + '.bedgraph',
                                   header_name=header_name)
    return regions
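
# responsive_regions is defined elsewhere; as a rough, simplified stand-in
# for the thresholding it feeds, this self-contained sketch (made-up log2
# ratios) keeps runs that clear lfc_thresh and span min_peak_width positions.
import math
import numpy as np

lfc_thresh = math.log(1.25, 2)  # ~0.32
min_peak_width = 5
ratios = np.array([0.0, 0.1, 0.5, 0.6, 0.7, 0.55, 0.4, 0.1, 0.0, -0.6])

above = np.abs(ratios) >= lfc_thresh
regions, start = [], None
for i, flag in enumerate(list(above) + [False]):  # sentinel flushes last run
    if flag and start is None:
        start = i
    elif not flag and start is not None:
        if i - start >= min_peak_width:
            kind = 'peak' if ratios[start] > 0 else 'valley'
            regions.append((start, i - 1, kind))
        start = None
print(regions)  # [(2, 6, 'peak')]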