Пример #1
0
def sanity_check_lifted_nagalakshmi_file(fn):
    gtf_fn = '/home/jah/projects/arlen/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    CDSs = gtf.get_CDSs(gtf_fn)
    gtf_dict = {t.name: t for t in CDSs}
    genes = read_nagalakshmi_file(fn)
    discrepancies_after = defaultdict(list)
    for name in genes:
        if name not in gtf_dict:
            print name, 'not in gtf_dict'
            continue

        start, end = genes[name]['SGD_Start'], genes[name]['SGD_End']
        chrom = genes[name]['Chrom']
        if start > end:
            start, end = end, start

        end -= 1

        if start != gtf_dict[name].start or end != gtf_dict[name].end:
            #print name, chrom
            #print start, end
            #print gtf_dict[name].start, gtf_dict[name].end
            #raw_input()
            discrepancies_after[chrom].append(name)

    return discrepancies_after
Пример #2
0
def sanity_check_lifted_nagalakshmi_file(fn):
    gtf_fn = '/home/jah/projects/arlen/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    CDSs = gtf.get_CDSs(gtf_fn)
    gtf_dict = {t.name: t for t in CDSs}
    genes = read_nagalakshmi_file(fn)
    discrepancies_after = defaultdict(list)
    for name in genes:
        if name not in gtf_dict:
            print name, 'not in gtf_dict'
            continue

        start, end = genes[name]['SGD_Start'], genes[name]['SGD_End']
        chrom = genes[name]['Chrom']
        if start > end:
            start, end = end, start

        end -= 1

        if start != gtf_dict[name].start or end != gtf_dict[name].end:
            #print name, chrom
            #print start, end
            #print gtf_dict[name].start, gtf_dict[name].end
            #raw_input()
            discrepancies_after[chrom].append(name)

    return discrepancies_after
Пример #3
0
def sanity_check_nagalakshmi_file():
    gtf_fn = '/home/jah/projects/arlen/data/organisms/saccharomyces_cerevisiae/SGD1.01/transcriptome/genes.gtf'
    CDSs = gtf.get_CDSs(gtf_fn)
    gtf_dict = {t.name: t for t in CDSs}
    genes = read_nagalakshmi_file('nagalakshmi_annotations.txt')
    discrepancies_after = {}
    for name in genes:
        if name not in gtf_dict:
            print name, 'not in gtf_dict'
            continue

        start, end = genes[name]['SGD_Start'] - 1, genes[name]['SGD_End'] - 1
        chrom = genes[name]['Chrom']
        if start > end:
            start, end = end, start

        if start != gtf_dict[name].start or end != gtf_dict[name].end:
            #print name, chrom
            #print start, end
            #print gtf_dict[name].start, gtf_dict[name].end
            #print
            if chrom not in discrepancies_after:
                discrepancies_after[chrom] = start
            else:
                discrepancies_after[chrom] = min(start,
                                                 discrepancies_after[chrom])

    return discrepancies_after
Пример #4
0
def sanity_check_nagalakshmi_file():
    gtf_fn = '/home/jah/projects/arlen/data/organisms/saccharomyces_cerevisiae/SGD1.01/transcriptome/genes.gtf'
    CDSs = gtf.get_CDSs(gtf_fn)
    gtf_dict = {t.name: t for t in CDSs}
    genes = read_nagalakshmi_file('nagalakshmi_annotations.txt')
    discrepancies_after = {}
    for name in genes:
        if name not in gtf_dict:
            print name, 'not in gtf_dict'
            continue

        start, end = genes[name]['SGD_Start'] - 1, genes[name]['SGD_End'] - 1
        chrom = genes[name]['Chrom']
        if start > end:
            start, end = end, start

        if start != gtf_dict[name].start or end != gtf_dict[name].end:
            #print name, chrom
            #print start, end
            #print gtf_dict[name].start, gtf_dict[name].end
            #print
            if chrom not in discrepancies_after:
                discrepancies_after[chrom] = start
            else:
                discrepancies_after[chrom] = min(start, discrepancies_after[chrom])

    return discrepancies_after
Пример #5
0
def sanity_check_weinberg_file():
    gtf_fn = '/home/jah/projects/arlen/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    CDSs = gtf.get_CDSs(gtf_fn)
    gtf_dict = {t.name: t for t in CDSs}
    genes = read_weinberg_file()
    discrepancies_after = defaultdict(list)
    for name in genes:
        if name not in gtf_dict:
            print name, 'not in gtf_dict'
            continue

        start, end = genes[name]['CdsStart'], genes[name]['CdsEnd']
        chrom = genes[name]['Chromosome']

        end -= 1

        if start != gtf_dict[name].start or end != gtf_dict[name].end:
            print name, chrom
            print start, end
            print gtf_dict[name].start, gtf_dict[name].end
            raw_input()
            discrepancies_after[chrom].append(name)

    return discrepancies_after
Пример #6
0
def sanity_check_weinberg_file():
    gtf_fn = '/home/jah/projects/arlen/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    CDSs = gtf.get_CDSs(gtf_fn)
    gtf_dict = {t.name: t for t in CDSs}
    genes = read_weinberg_file()
    discrepancies_after = defaultdict(list)
    for name in genes:
        if name not in gtf_dict:
            print name, 'not in gtf_dict'
            continue

        start, end = genes[name]['CdsStart'], genes[name]['CdsEnd']
        chrom = genes[name]['Chromosome']
        
        end -= 1

        if start != gtf_dict[name].start or end != gtf_dict[name].end:
            print name, chrom
            print start, end
            print gtf_dict[name].start, gtf_dict[name].end
            raw_input()
            discrepancies_after[chrom].append(name)

    return discrepancies_after
Пример #7
0
def plot_frameshifts(
    gtf_fn,
    bam_fns,
    gene_name,
    exp_name,
    genome_dir,
    show_fractions=False,
):
    """Plot per-frame read counts along one gene, marking start and stop codons.

    For the CDS named `gene_name`, position counts of length-28 reads are
    collected from every BAM file and summed. One subplot per reading frame
    (0, 1, 2) shows the non-zero counts against codon number, with start
    codons shaded green and stop codons shaded red in that frame.

    Args:
        gtf_fn: GTF file name, passed to gtf.get_CDSs.
        bam_fns: iterable of BAM file names; counts from all are added.
        gene_name: name of the CDS to plot; a KeyError is raised if
            gtf.get_CDSs returns no CDS with this name.
        exp_name: label used only in the figure title.
        genome_dir: passed through to gtf.get_CDSs.
        show_fractions: if True, add a fourth axis on top showing, for each
            frame, its share of the reads seen so far and of the reads
            remaining at every codon.

    Returns:
        The matplotlib figure built for the first entry of `lengths` (the
        return statement sits inside the loop over lengths, and `lengths`
        holds the single value 28).
    """
    codon_buffer = 10
    start_codon = 'ATG'
    stop_codons = {'TAA', 'TAG', 'TGA'}
    # NOTE(review): this initial value is never read; it is overwritten from
    # positions.A_site_offsets inside the loop over lengths below.
    A_site_offset = 5

    CDSs = {c.name: c for c in gtf.get_CDSs(gtf_fn, genome_dir, '/dev/null')}
    # Only fragments of this length are counted and plotted.
    lengths = [28]

    transcript = CDSs[gene_name]

    # Extend the plotted region by codon_buffer codons on each side (plus an
    # extra 30 positions on the left and one extra codon on the right).
    left_buffer = 30 + codon_buffer * 3
    right_buffer = (codon_buffer + 1) * 3
    transcript.build_extent_maps(left_buffer, right_buffer)

    # One counts object per BAM file; they are summed per length further down.
    experiment_counts = []
    for bam_fn in bam_fns:
        counts = positions.get_Transcript_extent_position_counts(
            transcript,
            bam_fn,
            lengths,
            left_buffer=left_buffer,
            right_buffer=right_buffer,
        )
        experiment_counts.append(counts)

    # Get the sequence of the extent.
    extent_sequence = transcript.get_extent_sequence(
        left_buffer=left_buffer,
        right_buffer=right_buffer,
    )

    for length in lengths:
        A_site_offset = positions.A_site_offsets['yeast'][length]

        # Add up the counts for this length across all BAM files.
        length_counts = reduce(
            operator.add, [counts[length] for counts in experiment_counts])
        # NOTE(review): `/` here is presumably Python 2 integer division
        # (assuming extent_length is an int) - confirm before porting.
        codon_numbers = np.arange(-codon_buffer,
                                  transcript.extent_length / 3 + codon_buffer)

        # frame_counts_list[i, j] will be the number of RPF's starting at frame i of
        # codon j
        frame_counts_list = np.zeros((3, len(codon_numbers)), int)
        start_codon_locations = [[] for _ in range(3)]
        stop_codon_locations = [[] for _ in range(3)]

        for c, codon_number in enumerate(codon_numbers):
            codon_start = 3 * codon_number
            for frame in range(3):
                # The count is looked up A_site_offset positions before the
                # codon position, keyed relative to 'start'.
                frame_counts_list[frame,
                                  c] = length_counts['start', codon_start +
                                                     frame - A_site_offset]
                # The three bases beginning at this frame of this codon.
                codon = extent_sequence['start', codon_start +
                                        frame:codon_start + frame + 3]
                codon = ''.join(codon)

                if codon == start_codon:
                    start_codon_locations[frame].append(codon_number)

                if codon in stop_codons:
                    stop_codon_locations[frame].append(codon_number)

        if show_fractions:
            # Top axis holds the cumulative fractions; the other three are
            # the per-frame count plots.
            fig, axs = plt.subplots(4, 1, sharex=True)
            cumulative_ax = axs[0]
            frame_axs = axs[1:]
        else:
            fig, frame_axs = plt.subplots(3, 1, sharex=True)

        for frame, (ax, frame_counts) in enumerate(
                zip(frame_axs, frame_counts_list)):
            # Plot only codons with a non-zero count in this frame.
            nonzero_codon_numbers = [
                c_n for c_n, f_c in zip(codon_numbers, frame_counts)
                if f_c != 0
            ]
            nonzero_frame_counts = [
                f_c for c_n, f_c in zip(codon_numbers, frame_counts)
                if f_c != 0
            ]
            ax.plot(nonzero_codon_numbers, nonzero_frame_counts, '.')
            # Same y-range on all three frame axes (max over every frame).
            ax.set_ylim(0, frame_counts_list.max() + 1)
            ax.set_xlim(codon_numbers[0], codon_numbers[-1])
            ax.set_title('Frame {0}'.format(frame))
            ax.set_ylabel('Read counts')

            # Shade start codons found in this frame in green.
            for x in start_codon_locations[frame]:
                ax.axvspan(x - 0.5,
                           x + 0.5,
                           facecolor='green',
                           edgecolor='none',
                           alpha=0.2)

            # Shade stop codons found in this frame in red.
            for x in stop_codon_locations[frame]:
                ax.axvspan(x - 0.5,
                           x + 0.5,
                           facecolor='red',
                           edgecolor='none',
                           alpha=0.2)

        frame_axs[-1].set_xlabel('Codons from start codon')

        if show_fractions:
            # Running totals per frame from the left, divided by the running
            # total over all three frames (floored at 1 to avoid dividing by
            # zero).
            frames_so_far = frame_counts_list.cumsum(axis=1)
            fraction_frames_so_far = np.true_divide(
                frames_so_far,
                np.maximum(1, frames_so_far.sum(axis=0)),
            )

            # Same computation accumulated from the right: reads at or after
            # each codon.
            frames_remaining = np.fliplr(
                np.fliplr(frame_counts_list).cumsum(axis=1))
            fraction_frames_remaining = np.true_divide(
                frames_remaining,
                np.maximum(1, frames_remaining.sum(axis=0)),
            )

            for frame in [0, 1, 2]:
                so_far = fraction_frames_so_far[frame]
                remaining = fraction_frames_remaining[frame]
                # `colors` is not defined in this function; it must be
                # provided by the enclosing module.
                color = colors[frame]
                cumulative_ax.plot(codon_numbers,
                                   so_far,
                                   color=color,
                                   label='{0} so far'.format(frame))
                cumulative_ax.plot(codon_numbers,
                                   remaining,
                                   color=color,
                                   linestyle='--',
                                   label='{0} remaining'.format(frame))

            # Only a left limit is passed here.
            cumulative_ax.set_xlim(codon_numbers[0])
            cumulative_ax.set_ylim(-0.02, 1.02)
            cumulative_ax.set_ylabel('Fraction of reads in extent')

            cumulative_ax.legend(loc='upper right', framealpha=0.5)

        fig.suptitle('{2}\n{0}\nlength {1} fragments'.format(
            gene_name, length, exp_name))

        # Returns during the first iteration of the loop over lengths.
        return fig
Пример #8
0
def plot_frameshifts(gtf_fn, bam_fns, gene_name, exp_name, genome_dir, show_fractions=False):
    """Plot per-frame read counts along one gene, marking start and stop codons.

    For the CDS named `gene_name`, position counts of length-28 reads are
    collected from every BAM file and summed. One subplot per reading frame
    (0, 1, 2) shows the non-zero counts against codon number, with start
    codons shaded green and stop codons shaded red in that frame.

    Args:
        gtf_fn: GTF file name, passed to gtf.get_CDSs.
        bam_fns: iterable of BAM file names; counts from all are added.
        gene_name: name of the CDS to plot; a KeyError is raised if
            gtf.get_CDSs returns no CDS with this name.
        exp_name: label used only in the figure title.
        genome_dir: passed through to gtf.get_CDSs.
        show_fractions: if True, add a fourth axis on top showing, for each
            frame, its share of the reads seen so far and of the reads
            remaining at every codon.

    Returns:
        The matplotlib figure built for the first entry of `lengths` (the
        return statement sits inside the loop over lengths, and `lengths`
        holds the single value 28).
    """
    codon_buffer = 10
    start_codon = "ATG"
    stop_codons = {"TAA", "TAG", "TGA"}
    # NOTE(review): this initial value is never read; it is overwritten from
    # positions.A_site_offsets inside the loop over lengths below.
    A_site_offset = 5

    CDSs = {c.name: c for c in gtf.get_CDSs(gtf_fn, genome_dir, "/dev/null")}
    # Only fragments of this length are counted and plotted.
    lengths = [28]

    transcript = CDSs[gene_name]

    # Extend the plotted region by codon_buffer codons on each side (plus an
    # extra 30 positions on the left and one extra codon on the right).
    left_buffer = 30 + codon_buffer * 3
    right_buffer = (codon_buffer + 1) * 3
    transcript.build_extent_maps(left_buffer, right_buffer)

    # One counts object per BAM file; they are summed per length further down.
    experiment_counts = []
    for bam_fn in bam_fns:
        counts = positions.get_Transcript_extent_position_counts(
            transcript, bam_fn, lengths, left_buffer=left_buffer, right_buffer=right_buffer
        )
        experiment_counts.append(counts)

    # Get the sequence of the extent.
    extent_sequence = transcript.get_extent_sequence(left_buffer=left_buffer, right_buffer=right_buffer)

    for length in lengths:
        A_site_offset = positions.A_site_offsets["yeast"][length]

        # Add up the counts for this length across all BAM files.
        length_counts = reduce(operator.add, [counts[length] for counts in experiment_counts])
        # NOTE(review): `/` here is presumably Python 2 integer division
        # (assuming extent_length is an int) - confirm before porting.
        codon_numbers = np.arange(-codon_buffer, transcript.extent_length / 3 + codon_buffer)

        # frame_counts_list[i, j] will be the number of RPF's starting at frame i of
        # codon j
        frame_counts_list = np.zeros((3, len(codon_numbers)), int)
        start_codon_locations = [[] for _ in range(3)]
        stop_codon_locations = [[] for _ in range(3)]

        for c, codon_number in enumerate(codon_numbers):
            codon_start = 3 * codon_number
            for frame in range(3):
                # The count is looked up A_site_offset positions before the
                # codon position, keyed relative to "start".
                frame_counts_list[frame, c] = length_counts["start", codon_start + frame - A_site_offset]
                # The three bases beginning at this frame of this codon.
                codon = extent_sequence["start", codon_start + frame : codon_start + frame + 3]
                codon = "".join(codon)

                if codon == start_codon:
                    start_codon_locations[frame].append(codon_number)

                if codon in stop_codons:
                    stop_codon_locations[frame].append(codon_number)

        if show_fractions:
            # Top axis holds the cumulative fractions; the other three are
            # the per-frame count plots.
            fig, axs = plt.subplots(4, 1, sharex=True)
            cumulative_ax = axs[0]
            frame_axs = axs[1:]
        else:
            fig, frame_axs = plt.subplots(3, 1, sharex=True)

        for frame, (ax, frame_counts) in enumerate(zip(frame_axs, frame_counts_list)):
            # Plot only codons with a non-zero count in this frame.
            nonzero_codon_numbers = [c_n for c_n, f_c in zip(codon_numbers, frame_counts) if f_c != 0]
            nonzero_frame_counts = [f_c for c_n, f_c in zip(codon_numbers, frame_counts) if f_c != 0]
            ax.plot(nonzero_codon_numbers, nonzero_frame_counts, ".")
            # Same y-range on all three frame axes (max over every frame).
            ax.set_ylim(0, frame_counts_list.max() + 1)
            ax.set_xlim(codon_numbers[0], codon_numbers[-1])
            ax.set_title("Frame {0}".format(frame))
            ax.set_ylabel("Read counts")

            # Shade start codons found in this frame in green.
            for x in start_codon_locations[frame]:
                ax.axvspan(x - 0.5, x + 0.5, facecolor="green", edgecolor="none", alpha=0.2)

            # Shade stop codons found in this frame in red.
            for x in stop_codon_locations[frame]:
                ax.axvspan(x - 0.5, x + 0.5, facecolor="red", edgecolor="none", alpha=0.2)

        frame_axs[-1].set_xlabel("Codons from start codon")

        if show_fractions:
            # Running totals per frame from the left, divided by the running
            # total over all three frames (floored at 1 to avoid dividing by
            # zero).
            frames_so_far = frame_counts_list.cumsum(axis=1)
            fraction_frames_so_far = np.true_divide(frames_so_far, np.maximum(1, frames_so_far.sum(axis=0)))

            # Same computation accumulated from the right: reads at or after
            # each codon.
            frames_remaining = np.fliplr(np.fliplr(frame_counts_list).cumsum(axis=1))
            fraction_frames_remaining = np.true_divide(frames_remaining, np.maximum(1, frames_remaining.sum(axis=0)))

            for frame in [0, 1, 2]:
                so_far = fraction_frames_so_far[frame]
                remaining = fraction_frames_remaining[frame]
                # `colors` is not defined in this function; it must be
                # provided by the enclosing module.
                color = colors[frame]
                cumulative_ax.plot(codon_numbers, so_far, color=color, label="{0} so far".format(frame))
                cumulative_ax.plot(
                    codon_numbers, remaining, color=color, linestyle="--", label="{0} remaining".format(frame)
                )

            # Only a left limit is passed here.
            cumulative_ax.set_xlim(codon_numbers[0])
            cumulative_ax.set_ylim(-0.02, 1.02)
            cumulative_ax.set_ylabel("Fraction of reads in extent")

            cumulative_ax.legend(loc="upper right", framealpha=0.5)

        fig.suptitle("{2}\n{0}\nlength {1} fragments".format(gene_name, length, exp_name))

        # Returns during the first iteration of the loop over lengths.
        return fig