def user_defined_exons(tmp_sg, line):
    """Return isoform paths and estimated counts for one event.

    This is the variant used when the flanking exons are read from the
    input row instead of being chosen automatically.

    tmp_sg -- splice graph object; ``tmp_sg.get_graph().nodes()`` supplies
              the candidate exons.
    line   -- row describing the event, indexed with the ``utils.TARGET``,
              ``utils.UPSTREAM_EXON``, ``utils.DOWNSTREAM_EXON`` and
              ``utils.PRIMER_COORD`` column constants.

    Returns the ``(paths, counts)`` pair produced by
    ``AllPaths.estimate_counts``.
    """
    target = line[utils.TARGET]
    chrom = utils.get_chr(target)
    strand = target[0]  # strand is the leading character of the target field

    # flanking exons supplied by the user
    upstream_exon = utils.get_pos(line[utils.UPSTREAM_EXON])
    downstream_exon = utils.get_pos(line[utils.DOWNSTREAM_EXON])
    first_primer, second_primer = utils.get_primer_coordinates(
        line[utils.PRIMER_COORD])

    # exons that can lie between the two primers, ordered by (start, end)
    ordered_exons = list(tmp_sg.get_graph().nodes())
    ordered_exons.sort(key=lambda exon: (exon[0], exon[1]))
    lo_ix = utils.find_first_exon(first_primer, ordered_exons)
    hi_ix = utils.find_last_exon(second_primer, ordered_exons)
    amplified_exons = ordered_exons[lo_ix:hi_ix + 1]

    # restrict to the relevant transcripts and estimate counts/psi
    all_paths = algs.AllPaths(tmp_sg,
                              amplified_exons,
                              utils.get_pos(target),  # tuple (start, end)
                              chr=chrom,
                              strand=strand)

    # the upstream/downstream roles swap for anything but the '+' strand
    if strand == "+":
        first_flank, last_flank = upstream_exon, downstream_exon
    else:
        first_flank, last_flank = downstream_exon, upstream_exon

    all_paths.trim_tx_paths_using_primers(first_primer, second_primer,
                                          first_flank, last_flank)
    all_paths.set_all_path_coordinates()
    paths, counts = all_paths.estimate_counts()  # run EM algorithm
    return paths, counts
def user_defined_exons(tmp_sg, line):
    """Return isoform paths and estimated counts for one event.

    This is the variant used when the flanking exons are read from the
    input row instead of being chosen automatically.

    tmp_sg -- splice graph object; ``tmp_sg.get_graph().nodes()`` supplies
              the candidate exons.
    line   -- row describing the event, indexed with the ``utils.TARGET``,
              ``utils.UPSTREAM_EXON``, ``utils.DOWNSTREAM_EXON`` and
              ``utils.PRIMER_COORD`` column constants.

    Returns the ``(paths, counts)`` pair produced by
    ``AllPaths.estimate_counts``.
    """
    target = line[utils.TARGET]
    chrom = utils.get_chr(target)
    strand = target[0]  # strand is the leading character of the target field

    # flanking exons supplied by the user
    upstream_exon = utils.get_pos(line[utils.UPSTREAM_EXON])
    downstream_exon = utils.get_pos(line[utils.DOWNSTREAM_EXON])
    first_primer, second_primer = utils.get_primer_coordinates(
        line[utils.PRIMER_COORD])

    # exons that can lie between the two primers, ordered by (start, end)
    ordered_exons = list(tmp_sg.get_graph().nodes())
    ordered_exons.sort(key=lambda exon: (exon[0], exon[1]))
    lo_ix = utils.find_first_exon(first_primer, ordered_exons)
    hi_ix = utils.find_last_exon(second_primer, ordered_exons)
    amplified_exons = ordered_exons[lo_ix:hi_ix + 1]

    # restrict to the relevant transcripts and estimate counts/psi
    all_paths = algs.AllPaths(tmp_sg,
                              amplified_exons,
                              utils.get_pos(target),  # tuple (start, end)
                              chr=chrom,
                              strand=strand)

    # the upstream/downstream roles swap for anything but the '+' strand
    if strand == "+":
        first_flank, last_flank = upstream_exon, downstream_exon
    else:
        first_flank, last_flank = downstream_exon, upstream_exon

    all_paths.trim_tx_paths_using_primers(first_primer, second_primer,
                                          first_flank, last_flank)
    all_paths.set_all_path_coordinates()
    paths, counts = all_paths.estimate_counts()  # run EM algorithm
    return paths, counts
def save_isforms_and_counts(line, options):
    """Save isoform paths and counts for one event, once per BAM file.

    line    -- row for the event; its first two fields are the event ID and
               the target coordinate (strand character followed by the
               position string). ``utils.PSI_UP`` / ``utils.PSI_DOWN``
               fields equal to '-1' select the user-defined flanking-exon
               code path.
    options -- mapping read for 'rnaseq', 'read_threshold',
               'min_jct_count', 'both_flag' and 'psi'; it is also handed to
               ``retrieve_gene_information``.

    Results are written through ``utils.save_path_info`` under
    'tmp/indiv_isoforms/' using the name '<ID>.<bam index>'.
    """
    # unpack the row
    ID, target_coordinate = line[:2]
    strand = target_coordinate[0]
    chrom = utils.get_chr(target_coordinate[1:])
    tmp_start, tmp_end = utils.get_pos(target_coordinate)
    logging.debug('Saving isoform and count information for event %s . . .' % ID)

    # gene structure from the GTF annotation
    gene_dict, gene_name = retrieve_gene_information(
        options, strand, chrom, tmp_start, tmp_end)

    # edge weights, one entry per RNA-Seq input
    edge_weights_list = [
        rnaseq.extractSamRegion(chrom, gene_dict['start'], gene_dict['end'])
        for rnaseq in options['rnaseq']
    ]

    # one splice graph per BAM file
    per_bam_graphs = sg.construct_splice_graph(
        edge_weights_list,
        gene_dict,
        chrom,
        strand,
        options['read_threshold'],
        options['min_jct_count'],
        output_type='list',
        both=options['both_flag'])

    for bam_ix, splice_graph in enumerate(per_bam_graphs):
        if line[utils.PSI_UP] != '-1' or line[utils.PSI_DOWN] != '-1':
            # flanking exons are chosen automatically
            paths, counts = primerseq_defined_exons(splice_graph, line,
                                                    options['psi'])
        else:
            # both psi fields are '-1': flanking exons were user-defined
            paths, counts = user_defined_exons(splice_graph, line)

        utils.save_path_info('%s.%d' % (ID, bam_ix),
                             paths,
                             counts,
                             save_dir='tmp/indiv_isoforms/')

    logging.debug('Finished saving isoform and count information for event %s.' % ID)
def save_isforms_and_counts(line, options):
    """Save isoform paths and counts for one event, once per BAM file.

    line    -- row for the event; its first two fields are the event ID and
               the target coordinate (strand character followed by the
               position string). ``utils.PSI_UP`` / ``utils.PSI_DOWN``
               fields equal to '-1' select the user-defined flanking-exon
               code path.
    options -- mapping read for 'rnaseq', 'read_threshold',
               'min_jct_count', 'both_flag' and 'psi'; it is also handed to
               ``retrieve_gene_information``.

    Results are written through ``utils.save_path_info`` under
    'tmp/indiv_isoforms/' using the name '<ID>.<bam index>'.
    """
    # unpack the row
    ID, target_coordinate = line[:2]
    strand = target_coordinate[0]
    chrom = utils.get_chr(target_coordinate[1:])
    tmp_start, tmp_end = utils.get_pos(target_coordinate)
    logging.debug('Saving isoform and count information for event %s . . .' % ID)

    # gene structure from the GTF annotation
    gene_dict, gene_name = retrieve_gene_information(
        options, strand, chrom, tmp_start, tmp_end)

    # edge weights, one entry per RNA-Seq input
    edge_weights_list = [
        rnaseq.extractSamRegion(chrom, gene_dict['start'], gene_dict['end'])
        for rnaseq in options['rnaseq']
    ]

    # one splice graph per BAM file
    per_bam_graphs = sg.construct_splice_graph(
        edge_weights_list,
        gene_dict,
        chrom,
        strand,
        options['read_threshold'],
        options['min_jct_count'],
        output_type='list',
        both=options['both_flag'])

    for bam_ix, splice_graph in enumerate(per_bam_graphs):
        if line[utils.PSI_UP] != '-1' or line[utils.PSI_DOWN] != '-1':
            # flanking exons are chosen automatically
            paths, counts = primerseq_defined_exons(splice_graph, line,
                                                    options['psi'])
        else:
            # both psi fields are '-1': flanking exons were user-defined
            paths, counts = user_defined_exons(splice_graph, line)

        utils.save_path_info('%s.%d' % (ID, bam_ix),
                             paths,
                             counts,
                             save_dir='tmp/indiv_isoforms/')

    logging.debug('Finished saving isoform and count information for event %s.' % ID)
def read_depth_plot(my_bigwigs, output, options):
    """Draw one read-depth subplot per bigwig file and save the figure.

    my_bigwigs -- comma-separated string of bigwig file paths; one subplot
                  is created per entry.
    output     -- destination passed to ``plt.savefig``.
    options    -- mapping read for 'position' (a single position string or
                  a list of them), 'size' (height per subplot) and 'gene'
                  (label prefix); it is also passed on to
                  ``generate_plot``.
    """
    position = options['position']
    # isinstance instead of an exact type comparison so that list
    # subclasses are treated as lists too
    if isinstance(position, list):
        chrom = utils.get_chr(position[0])
        start, stop = zip(*[utils.get_pos(pos) for pos in position])
    else:
        chrom = utils.get_chr(position)
        start, stop = utils.get_pos(position)

    bigwigs = my_bigwigs.split(',')
    num_subplots = len(bigwigs)  # one subplot per bigwig file
    fig, axes = plt.subplots(num_subplots,
                             1,
                             sharex=True,
                             sharey=True,
                             figsize=(6, options['size'] * num_subplots))
    gray = (0.9, 0.9, 0.9)

    # plt.subplots returns a bare Axes object when only one subplot is made
    subplot_axes = [axes] if num_subplots == 1 else axes.flat

    max_count_holder = 0  # largest depth seen so far across subplots
    for i, (ax, bigwig) in enumerate(zip(subplot_axes, bigwigs)):
        ax.yaxis.set_label_text('')

        # background
        ax.patch.set_facecolor(gray)
        ax.patch.set_edgecolor(gray)
        ax.grid()

        # plot/label
        max_count, real_start, real_stop = generate_plot(
            ax, bigwig, chrom, start, stop, options)  # does the actual work
        draw_text(
            ax,
            '%s -- ' % options['gene'] +
            os.path.splitext(os.path.basename(bigwig))[0])

        # grid and tick formatting
        ax.xaxis.grid(color='white', linestyle='--', linewidth=1.5)
        ax.yaxis.grid(color='white', linestyle='--', linewidth=1.5)
        ax.xaxis.set_major_formatter(DropFormatter())
        ax.yaxis.set_major_formatter(DropFormatter())
        ax.set_axisbelow(True)

        # blend the tick marks into the background
        for tick_line in (list(ax.xaxis.get_ticklines()) +
                          list(ax.yaxis.get_ticklines())):
            tick_line.set_color(gray)

        # the subplots are created with sharey=True, so the y-range is only
        # touched when this file has a higher maximum than any before it
        if max_count > max_count_holder:
            ax.set_ylim(0, 1.5 * max_count)
            ax.set_yticks([int(frac * max_count)
                           for frac in (0, .375, .75, 1.125, 1.5)])
            max_count_holder = max_count

        # x-axis: ticks only at the two ends of the plotted region
        ax.set_xlim(real_start, real_stop)
        ax.set_xticks([real_start, real_stop])
        # a real list rather than a bare map(), which is a one-shot
        # iterator on Python 3
        ax.xaxis.set_ticklabels(
            [addCommas(real_start), addCommas(real_stop)])
        ax.get_xticklabels()[0].set_horizontalalignment('left')
        ax.get_xticklabels()[1].set_horizontalalignment('right')

        # chromosome label goes on the bottom subplot only
        if i == num_subplots - 1:
            offset_text(ax, '%s:' % chrom, 3, (-.15, -.17))

    # adjust spacing between subplots
    fig.subplots_adjust(wspace=0.05, hspace=0.05, bottom=.12)

    # save figure
    plt.savefig(output)
def main(options, args_output='tmp/debug.json'):
    """Collect primer-design information for every target exon.

    Intended to be called from other scripts.

    options     -- mapping read for 'fasta', 'gtf', 'target', 'rnaseq',
                   'no_gene_id', 'read_threshold', 'min_jct_count',
                   'both_flag' and 'psi'. Each item of options['target']
                   is a ``(name, fields)`` pair; ``fields[0]`` is the
                   target (strand character followed by the position) and,
                   when ``fields`` has exactly three entries, ``fields[1]``
                   and ``fields[2]`` are the user's upstream and downstream
                   exons.
    args_output -- not referenced inside this function.

    Returns a list with one entry per target: the result of the selected
    flanking-exon routine, or a one-element list holding the message of a
    ``utils.PrimerSeqError`` raised while processing that target.
    """
    genome = options['fasta']
    args_gtf = options['gtf']
    args_target = options['target']

    # objects interfacing with the user specified BAM/SAM files
    sam_obj_list = options['rnaseq']

    output = []  # one entry per target
    for name, fields in args_target:
        tgt = fields[0]
        strand = tgt[0]
        tmp_start, tmp_end = get_pos(tgt)
        chrom = get_chr(tgt[1:])  # [1:] since strand is first character

        if len(fields) == 3:
            # the user supplied both flanking exons
            up_exon = utils.get_pos(fields[1])
            down_exon = utils.get_pos(fields[2])
        else:
            up_exon = down_exon = None

        # A PrimerSeqError only affects the current target, so it is
        # recorded in the output instead of aborting the whole run.
        try:
            # without a valid gene_id attribute in the gtf, fall back to
            # weakly connected transcripts
            lookup_gene = (get_weakly_connected_tx if options['no_gene_id']
                           else get_from_gtf_using_gene_name)
            gene_dict, gene_name = lookup_gene(args_gtf, strand, chrom,
                                               tmp_start, tmp_end)

            # extract all edge weights only once
            edge_weights_list = [
                sam_obj.extractSamRegion(chrom, gene_dict['start'],
                                         gene_dict['end'])
                for sam_obj in sam_obj_list
            ]

            # options['both_flag'] selects annotation-only junctions
            # (False) or RNA-Seq + annotation junctions (True).
            # First, a single graph built from the pooled count data ...
            splice_graph = construct_splice_graph(
                edge_weights_list,
                gene_dict,
                chrom,
                strand,
                options['read_threshold'],
                options['min_jct_count'],
                output_type='single',
                both=options['both_flag'])
            # ... then one graph per BAM file
            single_bam_splice_graphs = construct_splice_graph(
                edge_weights_list,
                gene_dict,
                chrom,
                strand,
                options['read_threshold'],
                options['min_jct_count'],
                output_type='list',
                both=options['both_flag'])

            # --- choose the primer design methodology ---
            if up_exon and down_exon:
                # user-defined flanking exons: all three exons must be
                # present in the annotation
                annotated_exons = gene_dict['exons']
                if gene_dict['target'] not in annotated_exons:
                    raise utils.PrimerSeqError('Error: target exon was not found in gtf annotation')
                if up_exon not in annotated_exons:
                    raise utils.PrimerSeqError('Error: upstream exon not in gtf annotation')
                if down_exon not in annotated_exons:
                    raise utils.PrimerSeqError('Error: downstream exon not in gtf annotation')
                result = predefined_exons_case(
                    name,                 # ID for exon
                    gene_dict['target'],  # target exon tuple (start, end)
                    splice_graph,
                    genome,
                    up_exon,
                    down_exon)
            elif options['psi'] > .9999:
                # always-included case; this routine ignores edge weights
                result = get_flanking_biconnected_exons(
                    tgt, gene_dict['target'], splice_graph, genome)
            else:
                # psi threshold decides which exons count as constitutive;
                # this routine uses edge weights
                result = get_sufficient_psi_exons(
                    tgt, gene_dict['target'], splice_graph, genome, name,
                    options['psi'], up_exon, down_exon)

            # error results have length one; only real results get the
            # target psi recomputed and the gene name attached
            if len(result) > 1:
                # CAREFUL: both positional indices below are fragile
                all_paths = result[-4]
                result[2] = calculate_target_psi(gene_dict['target'],
                                                 single_bam_splice_graphs,
                                                 all_paths.component,
                                                 up_exon=None,
                                                 down_exon=None)
                result.append(gene_name)

            output.append(result)
        except utils.PrimerSeqError as err:
            output.append([str(err)])  # keep only the message

    return output
def main(options, args_output='tmp/debug.json'):
    """Collect primer-design information for every target exon.

    Intended to be called from other scripts.

    options     -- mapping read for 'fasta', 'gtf', 'target', 'rnaseq',
                   'no_gene_id', 'read_threshold', 'min_jct_count',
                   'both_flag' and 'psi'. Each item of options['target']
                   is a ``(name, fields)`` pair; ``fields[0]`` is the
                   target (strand character followed by the position) and,
                   when ``fields`` has exactly three entries, ``fields[1]``
                   and ``fields[2]`` are the user's upstream and downstream
                   exons.
    args_output -- not referenced inside this function.

    Returns a list with one entry per target: the result of the selected
    flanking-exon routine, or a one-element list holding the message of a
    ``utils.PrimerSeqError`` raised while processing that target.
    """
    genome = options['fasta']
    args_gtf = options['gtf']
    args_target = options['target']

    # objects interfacing with the user specified BAM/SAM files
    sam_obj_list = options['rnaseq']

    output = []  # one entry per target
    for name, fields in args_target:
        tgt = fields[0]
        strand = tgt[0]
        tmp_start, tmp_end = get_pos(tgt)
        chrom = get_chr(tgt[1:])  # [1:] since strand is first character

        if len(fields) == 3:
            # the user supplied both flanking exons
            up_exon = utils.get_pos(fields[1])
            down_exon = utils.get_pos(fields[2])
        else:
            up_exon = down_exon = None

        # A PrimerSeqError only affects the current target, so it is
        # recorded in the output instead of aborting the whole run.
        try:
            # without a valid gene_id attribute in the gtf, fall back to
            # weakly connected transcripts
            lookup_gene = (get_weakly_connected_tx if options['no_gene_id']
                           else get_from_gtf_using_gene_name)
            gene_dict, gene_name = lookup_gene(args_gtf, strand, chrom,
                                               tmp_start, tmp_end)

            # extract all edge weights only once
            edge_weights_list = [
                sam_obj.extractSamRegion(chrom, gene_dict['start'],
                                         gene_dict['end'])
                for sam_obj in sam_obj_list
            ]

            # options['both_flag'] selects annotation-only junctions
            # (False) or RNA-Seq + annotation junctions (True).
            # First, a single graph built from the pooled count data ...
            splice_graph = construct_splice_graph(
                edge_weights_list,
                gene_dict,
                chrom,
                strand,
                options['read_threshold'],
                options['min_jct_count'],
                output_type='single',
                both=options['both_flag'])
            # ... then one graph per BAM file
            single_bam_splice_graphs = construct_splice_graph(
                edge_weights_list,
                gene_dict,
                chrom,
                strand,
                options['read_threshold'],
                options['min_jct_count'],
                output_type='list',
                both=options['both_flag'])

            # --- choose the primer design methodology ---
            if up_exon and down_exon:
                # user-defined flanking exons: all three exons must be
                # present in the annotation
                annotated_exons = gene_dict['exons']
                if gene_dict['target'] not in annotated_exons:
                    raise utils.PrimerSeqError('Error: target exon was not found in gtf annotation')
                if up_exon not in annotated_exons:
                    raise utils.PrimerSeqError('Error: upstream exon not in gtf annotation')
                if down_exon not in annotated_exons:
                    raise utils.PrimerSeqError('Error: downstream exon not in gtf annotation')
                result = predefined_exons_case(
                    name,                 # ID for exon
                    gene_dict['target'],  # target exon tuple (start, end)
                    splice_graph,
                    genome,
                    up_exon,
                    down_exon)
            elif options['psi'] > .9999:
                # always-included case; this routine ignores edge weights
                result = get_flanking_biconnected_exons(
                    tgt, gene_dict['target'], splice_graph, genome)
            else:
                # psi threshold decides which exons count as constitutive;
                # this routine uses edge weights
                result = get_sufficient_psi_exons(
                    tgt, gene_dict['target'], splice_graph, genome, name,
                    options['psi'], up_exon, down_exon)

            # error results have length one; only real results get the
            # target psi recomputed and the gene name attached
            if len(result) > 1:
                # CAREFUL: both positional indices below are fragile
                all_paths = result[-4]
                result[2] = calculate_target_psi(gene_dict['target'],
                                                 single_bam_splice_graphs,
                                                 all_paths.component,
                                                 up_exon=None,
                                                 down_exon=None)
                result.append(gene_name)

            output.append(result)
        except utils.PrimerSeqError as err:
            output.append([str(err)])  # keep only the message

    return output
def read_depth_plot(my_bigwigs, output, options):
    """Draw one read-depth subplot per bigwig file and save the figure.

    my_bigwigs -- comma-separated string of bigwig file paths; one subplot
                  is created per entry.
    output     -- destination passed to ``plt.savefig``.
    options    -- mapping read for 'position' (a single position string or
                  a list of them), 'size' (height per subplot) and 'gene'
                  (label prefix); it is also passed on to
                  ``generate_plot``.
    """
    position = options['position']
    # isinstance instead of an exact type comparison so that list
    # subclasses are treated as lists too
    if isinstance(position, list):
        chrom = utils.get_chr(position[0])
        start, stop = zip(*[utils.get_pos(pos) for pos in position])
    else:
        chrom = utils.get_chr(position)
        start, stop = utils.get_pos(position)

    bigwigs = my_bigwigs.split(',')
    num_subplots = len(bigwigs)  # one subplot per bigwig file
    fig, axes = plt.subplots(num_subplots,
                             1,
                             sharex=True,
                             sharey=True,
                             figsize=(6, options['size'] * num_subplots))
    gray = (0.9, 0.9, 0.9)

    # plt.subplots returns a bare Axes object when only one subplot is made
    subplot_axes = [axes] if num_subplots == 1 else axes.flat

    max_count_holder = 0  # largest depth seen so far across subplots
    for i, (ax, bigwig) in enumerate(zip(subplot_axes, bigwigs)):
        ax.yaxis.set_label_text('')

        # background
        ax.patch.set_facecolor(gray)
        ax.patch.set_edgecolor(gray)
        ax.grid()

        # plot/label
        max_count, real_start, real_stop = generate_plot(
            ax, bigwig, chrom, start, stop, options)  # does the actual work
        draw_text(
            ax,
            '%s -- ' % options['gene'] +
            os.path.splitext(os.path.basename(bigwig))[0])

        # grid and tick formatting
        ax.xaxis.grid(color='white', linestyle='--', linewidth=1.5)
        ax.yaxis.grid(color='white', linestyle='--', linewidth=1.5)
        ax.xaxis.set_major_formatter(DropFormatter())
        ax.yaxis.set_major_formatter(DropFormatter())
        ax.set_axisbelow(True)

        # blend the tick marks into the background
        for tick_line in (list(ax.xaxis.get_ticklines()) +
                          list(ax.yaxis.get_ticklines())):
            tick_line.set_color(gray)

        # the subplots are created with sharey=True, so the y-range is only
        # touched when this file has a higher maximum than any before it
        if max_count > max_count_holder:
            ax.set_ylim(0, 1.5 * max_count)
            ax.set_yticks([int(frac * max_count)
                           for frac in (0, .375, .75, 1.125, 1.5)])
            max_count_holder = max_count

        # x-axis: ticks only at the two ends of the plotted region
        ax.set_xlim(real_start, real_stop)
        ax.set_xticks([real_start, real_stop])
        # a real list rather than a bare map(), which is a one-shot
        # iterator on Python 3
        ax.xaxis.set_ticklabels(
            [addCommas(real_start), addCommas(real_stop)])
        ax.get_xticklabels()[0].set_horizontalalignment('left')
        ax.get_xticklabels()[1].set_horizontalalignment('right')

        # chromosome label goes on the bottom subplot only
        if i == num_subplots - 1:
            offset_text(ax, '%s:' % chrom, 3, (-.15, -.17))

    # adjust spacing between subplots
    fig.subplots_adjust(wspace=0.05, hspace=0.05, bottom=.12)

    # save figure
    plt.savefig(output)