Exemplo n.º 1
0
def user_defined_exons(tmp_sg, line):
    """Compute isoform paths and counts when flanking exons are user-defined.

    Args:
        tmp_sg: splice graph object; it must expose ``get_graph()`` whose
            ``nodes()`` yields exon entries indexable at positions 0 and 1.
        line: indexable row read at the ``utils.TARGET``,
            ``utils.UPSTREAM_EXON``, ``utils.DOWNSTREAM_EXON`` and
            ``utils.PRIMER_COORD`` positions.

    Returns:
        The ``(paths, counts)`` pair unpacked from
        ``AllPaths.estimate_counts()``.
    """
    target_field = line[utils.TARGET]
    chrom = utils.get_chr(target_field)
    strand = target_field[0]  # the strand is the first character of the target

    # user-supplied flanking exons and primer coordinates
    upstream_exon = utils.get_pos(line[utils.UPSTREAM_EXON])
    downstream_exon = utils.get_pos(line[utils.DOWNSTREAM_EXON])
    primer_pair = utils.get_primer_coordinates(line[utils.PRIMER_COORD])
    first_primer, second_primer = primer_pair

    # Candidate exons for primer amplification: the graph nodes ordered by
    # their first two fields, sliced (inclusively) between the indices that
    # the utils helpers return for the two primers.
    ordered_exons = sorted(tmp_sg.get_graph().nodes(),
                           key=lambda node: (node[0], node[1]))
    start_ix = utils.find_first_exon(first_primer, ordered_exons)
    stop_ix = utils.find_last_exon(second_primer, ordered_exons)
    candidate_exons = ordered_exons[start_ix:stop_ix + 1]

    # Build the path enumerator restricted to the candidate exons.
    all_paths = algs.AllPaths(tmp_sg,
                              candidate_exons,
                              utils.get_pos(target_field),
                              chr=chrom,
                              strand=strand)

    # On the "+" strand the upstream exon is passed first; on any other
    # strand value the two flanking exons are swapped.
    if strand == "+":
        fexon, lexon = upstream_exon, downstream_exon
    else:
        fexon, lexon = downstream_exon, upstream_exon

    all_paths.trim_tx_paths_using_primers(first_primer, second_primer,
                                          fexon, lexon)
    all_paths.set_all_path_coordinates()

    # The original comment describes this call as running the EM algorithm.
    paths, counts = all_paths.estimate_counts()
    return paths, counts
Exemplo n.º 2
0
def user_defined_exons(tmp_sg, line):
    """Compute isoform paths and counts when flanking exons are user-defined.

    Args:
        tmp_sg: splice graph object; it must expose ``get_graph()`` whose
            ``nodes()`` yields exon entries indexable at positions 0 and 1.
        line: indexable row read at the ``utils.TARGET``,
            ``utils.UPSTREAM_EXON``, ``utils.DOWNSTREAM_EXON`` and
            ``utils.PRIMER_COORD`` positions.

    Returns:
        The ``(paths, counts)`` pair unpacked from
        ``AllPaths.estimate_counts()``.
    """
    target_field = line[utils.TARGET]
    chrom = utils.get_chr(target_field)
    strand = target_field[0]  # the strand is the first character of the target

    # user-supplied flanking exons and primer coordinates
    upstream_exon = utils.get_pos(line[utils.UPSTREAM_EXON])
    downstream_exon = utils.get_pos(line[utils.DOWNSTREAM_EXON])
    primer_pair = utils.get_primer_coordinates(line[utils.PRIMER_COORD])
    first_primer, second_primer = primer_pair

    # Candidate exons for primer amplification: the graph nodes ordered by
    # their first two fields, sliced (inclusively) between the indices that
    # the utils helpers return for the two primers.
    ordered_exons = sorted(tmp_sg.get_graph().nodes(),
                           key=lambda node: (node[0], node[1]))
    start_ix = utils.find_first_exon(first_primer, ordered_exons)
    stop_ix = utils.find_last_exon(second_primer, ordered_exons)
    candidate_exons = ordered_exons[start_ix:stop_ix + 1]

    # Build the path enumerator restricted to the candidate exons.
    all_paths = algs.AllPaths(tmp_sg,
                              candidate_exons,
                              utils.get_pos(target_field),
                              chr=chrom,
                              strand=strand)

    # On the "+" strand the upstream exon is passed first; on any other
    # strand value the two flanking exons are swapped.
    if strand == "+":
        fexon, lexon = upstream_exon, downstream_exon
    else:
        fexon, lexon = downstream_exon, upstream_exon

    all_paths.trim_tx_paths_using_primers(first_primer, second_primer,
                                          fexon, lexon)
    all_paths.set_all_path_coordinates()

    # The original comment describes this call as running the EM algorithm.
    paths, counts = all_paths.estimate_counts()
    return paths, counts
Exemplo n.º 3
0
def save_isforms_and_counts(line, options):
    """Save isoform paths and counts for one event, once per BAM splice graph.

    Args:
        line: indexable row whose first two items are the event ID and the
            target coordinate (strand character followed by the coordinate).
            It is also read at ``utils.PSI_UP`` and ``utils.PSI_DOWN`` and
            passed on to the path-finding helpers.
        options: mapping read for ``'rnaseq'``, ``'read_threshold'``,
            ``'min_jct_count'``, ``'both_flag'`` and ``'psi'``; it is also
            handed to ``retrieve_gene_information``.

    Returns:
        None. Results are written through ``utils.save_path_info`` under
        ``tmp/indiv_isoforms/``.
    """
    # unpack the row
    event_id, target_coordinate = line[:2]
    strand = target_coordinate[0]
    chrom = utils.get_chr(target_coordinate[1:])
    tmp_start, tmp_end = utils.get_pos(target_coordinate)
    logging.debug(
        'Saving isoform and count information for event %s . . .' % event_id)

    # gene information (the original comment says it comes from the GTF
    # annotation); the gene name is not needed here
    gene_dict, _gene_name = retrieve_gene_information(
        options, strand, chrom, tmp_start, tmp_end)

    # one set of edge weights per entry in options['rnaseq']
    edge_weights_list = []
    for sam_obj in options['rnaseq']:
        region_weights = sam_obj.extractSamRegion(
            chrom, gene_dict['start'], gene_dict['end'])
        edge_weights_list.append(region_weights)

    # output_type='list' requests one splice graph per BAM file
    bam_splice_graphs = sg.construct_splice_graph(
        edge_weights_list,
        gene_dict,
        chrom,
        strand,
        options['read_threshold'],
        options['min_jct_count'],
        output_type='list',
        both=options['both_flag'])

    for bam_ix, my_splice_graph in enumerate(bam_splice_graphs):
        # both psi fields equal to '-1' marks user-defined flanking exons
        user_defined = (line[utils.PSI_UP] == '-1'
                        and line[utils.PSI_DOWN] == '-1')
        if not user_defined:
            # automatic choice of flanking exons
            paths, counts = primerseq_defined_exons(
                my_splice_graph, line, options['psi'])
        else:
            paths, counts = user_defined_exons(my_splice_graph, line)

        output_label = '%s.%d' % (event_id, bam_ix)
        utils.save_path_info(output_label,
                             paths,
                             counts,
                             save_dir='tmp/indiv_isoforms/')

    logging.debug(
        'Finished saving isoform and count information for event %s.'
        % event_id)
Exemplo n.º 4
0
def save_isforms_and_counts(line, options):
    """Save isoform paths and counts for one event, once per BAM splice graph.

    Args:
        line: indexable row whose first two items are the event ID and the
            target coordinate (strand character followed by the coordinate).
            It is also read at ``utils.PSI_UP`` and ``utils.PSI_DOWN`` and
            passed on to the path-finding helpers.
        options: mapping read for ``'rnaseq'``, ``'read_threshold'``,
            ``'min_jct_count'``, ``'both_flag'`` and ``'psi'``; it is also
            handed to ``retrieve_gene_information``.

    Returns:
        None. Results are written through ``utils.save_path_info`` under
        ``tmp/indiv_isoforms/``.
    """
    # unpack the row
    event_id, target_coordinate = line[:2]
    strand = target_coordinate[0]
    chrom = utils.get_chr(target_coordinate[1:])
    tmp_start, tmp_end = utils.get_pos(target_coordinate)
    logging.debug(
        'Saving isoform and count information for event %s . . .' % event_id)

    # gene information (the original comment says it comes from the GTF
    # annotation); the gene name is not needed here
    gene_dict, _gene_name = retrieve_gene_information(
        options, strand, chrom, tmp_start, tmp_end)

    # one set of edge weights per entry in options['rnaseq']
    edge_weights_list = []
    for sam_obj in options['rnaseq']:
        region_weights = sam_obj.extractSamRegion(
            chrom, gene_dict['start'], gene_dict['end'])
        edge_weights_list.append(region_weights)

    # output_type='list' requests one splice graph per BAM file
    bam_splice_graphs = sg.construct_splice_graph(
        edge_weights_list,
        gene_dict,
        chrom,
        strand,
        options['read_threshold'],
        options['min_jct_count'],
        output_type='list',
        both=options['both_flag'])

    for bam_ix, my_splice_graph in enumerate(bam_splice_graphs):
        # both psi fields equal to '-1' marks user-defined flanking exons
        user_defined = (line[utils.PSI_UP] == '-1'
                        and line[utils.PSI_DOWN] == '-1')
        if not user_defined:
            # automatic choice of flanking exons
            paths, counts = primerseq_defined_exons(
                my_splice_graph, line, options['psi'])
        else:
            paths, counts = user_defined_exons(my_splice_graph, line)

        output_label = '%s.%d' % (event_id, bam_ix)
        utils.save_path_info(output_label,
                             paths,
                             counts,
                             save_dir='tmp/indiv_isoforms/')

    logging.debug(
        'Finished saving isoform and count information for event %s.'
        % event_id)
Exemplo n.º 5
0
def read_depth_plot(my_bigwigs, output, options):
    """Draw one read-depth subplot per bigwig file and save the figure.

    Args:
        my_bigwigs: comma-separated string of bigwig file paths; one subplot
            is drawn per path.
        output: destination passed to ``plt.savefig``.
        options: mapping read for ``'position'`` (a single coordinate string
            or a list of them), ``'size'`` (figure height per subplot) and
            ``'gene'`` (label prefix); it is also forwarded to
            ``generate_plot``.

    Returns:
        None. The figure is written to ``output``.
    """
    position = options['position']
    if isinstance(position, list):
        # several regions: the chromosome comes from the first one and the
        # start/stop values are gathered into parallel tuples
        chrom = utils.get_chr(position[0])
        start, stop = zip(*[utils.get_pos(pos) for pos in position])
    else:
        chrom = utils.get_chr(position)
        start, stop = utils.get_pos(position)

    bigwigs = my_bigwigs.split(',')
    num_subplots = len(bigwigs)  # one subplot per bigwig file
    fig, axes = plt.subplots(num_subplots,
                             1,
                             sharex=True,
                             sharey=True,
                             figsize=(6, options['size'] * num_subplots))
    gray = (0.9, 0.9, 0.9)

    # plt.subplots hands back a bare Axes (not an array) for a single subplot
    if num_subplots == 1:
        iterable = [axes]
    else:
        iterable = axes.flat

    max_count_holder = 0
    for i, (ax, bigwig) in enumerate(zip(iterable, bigwigs)):
        ax.yaxis.set_label_text('')

        # set bg
        ax.patch.set_facecolor(gray)
        ax.patch.set_edgecolor(gray)
        ax.grid()

        # plot/label; generate_plot does the actual drawing
        max_count, real_start, real_stop = generate_plot(
            ax, bigwig, chrom, start, stop, options)
        track_name = os.path.splitext(os.path.basename(bigwig))[0]
        draw_text(ax, '%s -- ' % options['gene'] + track_name)

        # format options
        ax.xaxis.grid(color='white', linestyle='--', linewidth=1.5)
        ax.yaxis.grid(color='white', linestyle='--', linewidth=1.5)
        ax.xaxis.set_major_formatter(DropFormatter())
        ax.yaxis.set_major_formatter(DropFormatter())
        ax.set_axisbelow(True)

        # hide the tick lines by painting them in the background colour
        for axis in (ax.xaxis, ax.yaxis):
            for tick_line in axis.get_ticklines():
                tick_line.set_color(gray)

        # The subplots share their y axis (sharey=True), so the limits are
        # only rescaled when this track exceeds the largest maximum so far.
        if max_count > max_count_holder:
            ax.set_ylim(0, 1.5 * max_count)
            ax.set_yticks(
                [0] + [int(frac * max_count)
                       for frac in (.375, .75, 1.125, 1.5)])
            max_count_holder = max_count

        # set x-axis options
        ax.set_xlim(real_start, real_stop)
        ax.set_xticks([real_start, real_stop])  # explicitly set ticks
        # pass a concrete list: on Python 3 ``map`` would be a lazy iterator
        ax.xaxis.set_ticklabels(
            [addCommas(real_start), addCommas(real_stop)])
        ax.get_xticklabels()[0].set_horizontalalignment('left')
        ax.get_xticklabels()[1].set_horizontalalignment('right')

        # chromosome text box goes on the last subplot only
        if i == num_subplots - 1:
            offset_text(ax, '%s:' % chrom, 3, (-.15, -.17))

    # Adjust spacing and save once, after every subplot is drawn.
    # str.split always yields at least one entry, so the loop above has run
    # at least once and the figure is still always written.
    fig.subplots_adjust(wspace=0.05, hspace=0.05, bottom=.12)
    plt.savefig(output)
Exemplo n.º 6
0
def main(options, args_output='tmp/debug.json'):
    """
    The gtf main function is the function designed to be called from other
    scripts. It iterates through each target exon and returns the necessary
    information for primer design.

    Args:
        options: mapping read for ``'fasta'``, ``'gtf'``, ``'target'``,
            ``'rnaseq'``, ``'no_gene_id'``, ``'read_threshold'``,
            ``'min_jct_count'``, ``'both_flag'`` and ``'psi'``. Each item of
            ``options['target']`` is a ``(name, fields)`` pair where
            ``fields[0]`` is the target coordinate prefixed by its strand
            character; a three-item ``fields`` also carries the user's
            upstream and downstream exons.
        args_output: not used by this function; kept so existing callers
            that pass it continue to work.

    Returns:
        A list with one entry per target. An entry is either the result of
        the selected primer-design routine (for results longer than one item
        the target psi at index 2 is recomputed and the gene name appended)
        or a one-element list holding the message of a
        ``utils.PrimerSeqError`` raised for that target.
    """
    genome, args_gtf, args_target = (options['fasta'], options['gtf'],
                                     options['target'])

    # the sam object interfaces with the user specified BAM/SAM file!!!
    sam_obj_list = options['rnaseq']

    # iterate through each target exon
    output = []  # output from program
    for name, fields in args_target:
        tgt = fields[0]
        strand = tgt[0]
        tmp_start, tmp_end = get_pos(tgt)
        chrom = get_chr(tgt[1:])  # [1:] since strand is first character

        # exactly three fields means the user supplied both flanking exons
        if len(fields) == 3:
            up_exon = utils.get_pos(fields[1])  # user's upstream exon
            down_exon = utils.get_pos(fields[2])  # user's downstream exon
        else:
            up_exon = None  # user did not provide upstream exon
            down_exon = None  # user did not provide downstream exon

        # This try block is to catch assertions made about the graph. If a
        # PrimerSeqError is raised it only impacts a single target for primer
        # design so complete exiting of the program is not warranted.
        try:
            # if the gtf doesn't have a valid gene_id attribute then use
            # the first method otherwise use the second method.
            if options['no_gene_id']:
                gene_dict, gene_name = get_weakly_connected_tx(
                    args_gtf, strand, chrom, tmp_start, tmp_end)
            else:
                gene_dict, gene_name = get_from_gtf_using_gene_name(
                    args_gtf, strand, chrom, tmp_start, tmp_end)

            # extract all edge weights only once
            edge_weights_list = [
                sam_obj.extractSamRegion(chrom, gene_dict['start'],
                                         gene_dict['end'])
                for sam_obj in sam_obj_list
            ]

            # options['both_flag'] determines how the splice graph is
            # constructed: from annotation junctions only when it is False,
            # or from RNA-Seq + annotation junctions when it is True.
            # Both graph constructions share the same positional arguments.
            graph_args = (edge_weights_list,
                          gene_dict,
                          chrom,
                          strand,
                          options['read_threshold'],
                          options['min_jct_count'])

            # single pooled count data splice graph
            splice_graph = construct_splice_graph(
                *graph_args, output_type='single', both=options['both_flag'])
            # Second, get a splice graph for each BAM file
            single_bam_splice_graphs = construct_splice_graph(
                *graph_args, output_type='list', both=options['both_flag'])

            ### Logic for choosing methodology of primer design ###
            # user-defined flanking exon case
            if up_exon and down_exon:
                known_exons = gene_dict['exons']
                if gene_dict['target'] not in known_exons:
                    raise utils.PrimerSeqError(
                        'Error: target exon was not found in gtf annotation')
                if up_exon not in known_exons:
                    raise utils.PrimerSeqError(
                        'Error: upstream exon not in gtf annotation')
                if down_exon not in known_exons:
                    raise utils.PrimerSeqError(
                        'Error: downstream exon not in gtf annotation')
                tmp = predefined_exons_case(
                    name,  # ID for exon (need to save as json)
                    gene_dict['target'],  # target exon tuple (start, end)
                    splice_graph,  # SpliceGraph object
                    genome,  # pygr genome variable
                    up_exon,  # upstream flanking exon
                    down_exon)  # downstream flanking exon
            # always included case
            elif options['psi'] > .9999:
                # note this function ignores edge weights
                tmp = get_flanking_biconnected_exons(tgt, gene_dict['target'],
                                                     splice_graph, genome)
            # user specified a sufficient psi value to call constitutive exons
            else:
                # note, this function utilizes edge weights
                tmp = get_sufficient_psi_exons(tgt, gene_dict['target'],
                                               splice_graph, genome, name,
                                               options['psi'], up_exon,
                                               down_exon)
            ### End methodology specific primer design ###

            # Error msgs are of length one, so only do psi calculations for
            # non-error msgs
            if len(tmp) > 1:
                # CAREFUL: the index of the AllPaths object (-4) and of the
                # target psi (2) may change if the result layout changes
                tmp_all_paths = tmp[-4]
                tmp[2] = calculate_target_psi(gene_dict['target'],
                                              single_bam_splice_graphs,
                                              tmp_all_paths.component,
                                              up_exon=None,
                                              down_exon=None)
                tmp.append(gene_name)

            # append result to output list
            output.append(tmp)
        except utils.PrimerSeqError as err:
            # just record the assertion msg for this target and carry on
            output.append([str(err)])

    return output
Exemplo n.º 7
0
def main(options, args_output='tmp/debug.json'):
    """
    The gtf main function is the function designed to be called from other
    scripts. It iterates through each target exon and returns the necessary
    information for primer design.

    Args:
        options: mapping read for ``'fasta'``, ``'gtf'``, ``'target'``,
            ``'rnaseq'``, ``'no_gene_id'``, ``'read_threshold'``,
            ``'min_jct_count'``, ``'both_flag'`` and ``'psi'``. Each item of
            ``options['target']`` is a ``(name, fields)`` pair where
            ``fields[0]`` is the target coordinate prefixed by its strand
            character; a three-item ``fields`` also carries the user's
            upstream and downstream exons.
        args_output: not used by this function; kept so existing callers
            that pass it continue to work.

    Returns:
        A list with one entry per target. An entry is either the result of
        the selected primer-design routine (for results longer than one item
        the target psi at index 2 is recomputed and the gene name appended)
        or a one-element list holding the message of a
        ``utils.PrimerSeqError`` raised for that target.
    """
    genome, args_gtf, args_target = (options['fasta'], options['gtf'],
                                     options['target'])

    # the sam object interfaces with the user specified BAM/SAM file!!!
    sam_obj_list = options['rnaseq']

    # iterate through each target exon
    output = []  # output from program
    for name, fields in args_target:
        tgt = fields[0]
        strand = tgt[0]
        tmp_start, tmp_end = get_pos(tgt)
        chrom = get_chr(tgt[1:])  # [1:] since strand is first character

        # exactly three fields means the user supplied both flanking exons
        if len(fields) == 3:
            up_exon = utils.get_pos(fields[1])  # user's upstream exon
            down_exon = utils.get_pos(fields[2])  # user's downstream exon
        else:
            up_exon = None  # user did not provide upstream exon
            down_exon = None  # user did not provide downstream exon

        # This try block is to catch assertions made about the graph. If a
        # PrimerSeqError is raised it only impacts a single target for primer
        # design so complete exiting of the program is not warranted.
        try:
            # if the gtf doesn't have a valid gene_id attribute then use
            # the first method otherwise use the second method.
            if options['no_gene_id']:
                gene_dict, gene_name = get_weakly_connected_tx(
                    args_gtf, strand, chrom, tmp_start, tmp_end)
            else:
                gene_dict, gene_name = get_from_gtf_using_gene_name(
                    args_gtf, strand, chrom, tmp_start, tmp_end)

            # extract all edge weights only once
            edge_weights_list = [
                sam_obj.extractSamRegion(chrom, gene_dict['start'],
                                         gene_dict['end'])
                for sam_obj in sam_obj_list
            ]

            # options['both_flag'] determines how the splice graph is
            # constructed: from annotation junctions only when it is False,
            # or from RNA-Seq + annotation junctions when it is True.
            # Both graph constructions share the same positional arguments.
            graph_args = (edge_weights_list,
                          gene_dict,
                          chrom,
                          strand,
                          options['read_threshold'],
                          options['min_jct_count'])

            # single pooled count data splice graph
            splice_graph = construct_splice_graph(
                *graph_args, output_type='single', both=options['both_flag'])
            # Second, get a splice graph for each BAM file
            single_bam_splice_graphs = construct_splice_graph(
                *graph_args, output_type='list', both=options['both_flag'])

            ### Logic for choosing methodology of primer design ###
            # user-defined flanking exon case
            if up_exon and down_exon:
                known_exons = gene_dict['exons']
                if gene_dict['target'] not in known_exons:
                    raise utils.PrimerSeqError(
                        'Error: target exon was not found in gtf annotation')
                if up_exon not in known_exons:
                    raise utils.PrimerSeqError(
                        'Error: upstream exon not in gtf annotation')
                if down_exon not in known_exons:
                    raise utils.PrimerSeqError(
                        'Error: downstream exon not in gtf annotation')
                tmp = predefined_exons_case(
                    name,  # ID for exon (need to save as json)
                    gene_dict['target'],  # target exon tuple (start, end)
                    splice_graph,  # SpliceGraph object
                    genome,  # pygr genome variable
                    up_exon,  # upstream flanking exon
                    down_exon)  # downstream flanking exon
            # always included case
            elif options['psi'] > .9999:
                # note this function ignores edge weights
                tmp = get_flanking_biconnected_exons(tgt, gene_dict['target'],
                                                     splice_graph, genome)
            # user specified a sufficient psi value to call constitutive exons
            else:
                # note, this function utilizes edge weights
                tmp = get_sufficient_psi_exons(tgt, gene_dict['target'],
                                               splice_graph, genome, name,
                                               options['psi'], up_exon,
                                               down_exon)
            ### End methodology specific primer design ###

            # Error msgs are of length one, so only do psi calculations for
            # non-error msgs
            if len(tmp) > 1:
                # CAREFUL: the index of the AllPaths object (-4) and of the
                # target psi (2) may change if the result layout changes
                tmp_all_paths = tmp[-4]
                tmp[2] = calculate_target_psi(gene_dict['target'],
                                              single_bam_splice_graphs,
                                              tmp_all_paths.component,
                                              up_exon=None,
                                              down_exon=None)
                tmp.append(gene_name)

            # append result to output list
            output.append(tmp)
        except utils.PrimerSeqError as err:
            # just record the assertion msg for this target and carry on
            output.append([str(err)])

    return output
Exemplo n.º 8
0
def read_depth_plot(my_bigwigs, output, options):
    """Draw one read-depth subplot per bigwig file and save the figure.

    Args:
        my_bigwigs: comma-separated string of bigwig file paths; one subplot
            is drawn per path.
        output: destination passed to ``plt.savefig``.
        options: mapping read for ``'position'`` (a single coordinate string
            or a list of them), ``'size'`` (figure height per subplot) and
            ``'gene'`` (label prefix); it is also forwarded to
            ``generate_plot``.

    Returns:
        None. The figure is written to ``output``.
    """
    position = options['position']
    if isinstance(position, list):
        # several regions: the chromosome comes from the first one and the
        # start/stop values are gathered into parallel tuples
        chrom = utils.get_chr(position[0])
        start, stop = zip(*[utils.get_pos(pos) for pos in position])
    else:
        chrom = utils.get_chr(position)
        start, stop = utils.get_pos(position)

    bigwigs = my_bigwigs.split(',')
    num_subplots = len(bigwigs)  # one subplot per bigwig file
    fig, axes = plt.subplots(num_subplots,
                             1,
                             sharex=True,
                             sharey=True,
                             figsize=(6, options['size'] * num_subplots))
    gray = (0.9, 0.9, 0.9)

    # plt.subplots hands back a bare Axes (not an array) for a single subplot
    if num_subplots == 1:
        iterable = [axes]
    else:
        iterable = axes.flat

    max_count_holder = 0
    for i, (ax, bigwig) in enumerate(zip(iterable, bigwigs)):
        ax.yaxis.set_label_text('')

        # set bg
        ax.patch.set_facecolor(gray)
        ax.patch.set_edgecolor(gray)
        ax.grid()

        # plot/label; generate_plot does the actual drawing
        max_count, real_start, real_stop = generate_plot(
            ax, bigwig, chrom, start, stop, options)
        track_name = os.path.splitext(os.path.basename(bigwig))[0]
        draw_text(ax, '%s -- ' % options['gene'] + track_name)

        # format options
        ax.xaxis.grid(color='white', linestyle='--', linewidth=1.5)
        ax.yaxis.grid(color='white', linestyle='--', linewidth=1.5)
        ax.xaxis.set_major_formatter(DropFormatter())
        ax.yaxis.set_major_formatter(DropFormatter())
        ax.set_axisbelow(True)

        # hide the tick lines by painting them in the background colour
        for axis in (ax.xaxis, ax.yaxis):
            for tick_line in axis.get_ticklines():
                tick_line.set_color(gray)

        # The subplots share their y axis (sharey=True), so the limits are
        # only rescaled when this track exceeds the largest maximum so far.
        if max_count > max_count_holder:
            ax.set_ylim(0, 1.5 * max_count)
            ax.set_yticks(
                [0] + [int(frac * max_count)
                       for frac in (.375, .75, 1.125, 1.5)])
            max_count_holder = max_count

        # set x-axis options
        ax.set_xlim(real_start, real_stop)
        ax.set_xticks([real_start, real_stop])  # explicitly set ticks
        # pass a concrete list: on Python 3 ``map`` would be a lazy iterator
        ax.xaxis.set_ticklabels(
            [addCommas(real_start), addCommas(real_stop)])
        ax.get_xticklabels()[0].set_horizontalalignment('left')
        ax.get_xticklabels()[1].set_horizontalalignment('right')

        # chromosome text box goes on the last subplot only
        if i == num_subplots - 1:
            offset_text(ax, '%s:' % chrom, 3, (-.15, -.17))

    # Adjust spacing and save once, after every subplot is drawn.
    # str.split always yields at least one entry, so the loop above has run
    # at least once and the figure is still always written.
    fig.subplots_adjust(wspace=0.05, hspace=0.05, bottom=.12)
    plt.savefig(output)