Exemplo n.º 1
0
def plot_distributions(meme_gerp_scores, null_gerp_scores, mylabels, mytitle):
    """Draw the MEME and null score sets as labelled density curves.

    Each score set is paired positionally with an entry of ``mylabels`` and
    handed to ``plot_functions.plot_density`` together with ``mytitle``.
    Afterwards ``plt.legend()`` and ``plt.show()`` are called.  Pairing is
    done with ``zip``, so a score set without a matching label is skipped.
    """
    score_sets = (meme_gerp_scores, null_gerp_scores)
    for scores, label in zip(score_sets, mylabels):
        plot_functions.plot_density(scores, mytitle, label)
    plt.legend()
    plt.show()
def plot_distributions(meme_gerp_scores, null_gerp_scores, mylabels, mytitle):
    """Draw the MEME and null score sets as labelled density curves.

    Each score set is paired positionally with an entry of ``mylabels`` and
    handed to ``plot_functions.plot_density`` together with ``mytitle``.
    Afterwards ``plt.legend()`` and ``plt.show()`` are called.  Pairing is
    done with ``zip``, so a score set without a matching label is skipped.
    """
    score_sets = (meme_gerp_scores, null_gerp_scores)
    for scores, label in zip(score_sets, mylabels):
        plot_functions.plot_density(scores, mytitle, label)
    plt.legend()
    plt.show()
Exemplo n.º 3
0
def main():
    """Print length statistics for two directories of .summary files and plot them.

    Expects exactly two positional command-line arguments: the directory of
    interest and the control directory.  Every file ending in ``.summary``
    in each directory is passed to ``get_aa_length_from_anchor_file`` and
    the returned lengths are pooled per directory.  For each pool the mean,
    median and [min,max] are printed, then both pools are handed to
    ``plot_functions.plot_density``.

    Exits with status 1 when the argument count is wrong or when a
    directory yields no lengths at all.
    """
    usage = 'usage: %prog [opt] directory1 directory2'\
        '\nTwo arguments must be specified in command line:\n'\
        '1) Directory of interest containing .summary files\n'\
        '2) Directory to which to compare containing .summary files (control)\n'
    parser = OptionParser(usage=usage)
    (_, args) = parser.parse_args()

    if len(args) != 2:
        print('Requires 2 arguments to be specified in command line')
        print(usage)
        # Non-zero status so calling scripts can detect the usage error.
        sys.exit(1)
    dir0, dir1 = args

    # get all files containing *.summary in each directory. Those are
    # anchor input files.
    ext = '.summary'
    anchor_files0 = [os.path.join(dir0, f)
                     for f in os.listdir(dir0) if f.endswith(ext)]
    anchor_files1 = [os.path.join(dir1, f)
                     for f in os.listdir(dir1) if f.endswith(ext)]

    # Read each file, pooling the lengths it reports per directory.
    aa_lengths0 = []
    aa_lengths1 = []
    for file0 in anchor_files0:
        aa_lengths0 += get_aa_length_from_anchor_file(file0)
    for file1 in anchor_files1:
        aa_lengths1 += get_aa_length_from_anchor_file(file1)

    # The statistics below divide by and index into each pool, so an empty
    # pool is reported explicitly instead of surfacing as a traceback.
    for directory, aa in zip([dir0, dir1], [aa_lengths0, aa_lengths1]):
        if not aa:
            print('No lengths found in %s files under %s' % (ext, directory))
            sys.exit(1)

    for label, aa in zip(['xeno', 'control'], [aa_lengths0, aa_lengths1]):
        print('Mean for %s' % label)
        print(sum(aa) / float(len(aa)))
        print('Median for %s' % label)
        # NOTE(review): for an even number of values this is the upper of
        # the two middle values, not their average.
        print(sorted(aa)[len(aa) // 2])
        print('[min,max] for %s' % label)
        print('[%s,%s]' % (min(aa), max(aa)))
    plot_functions.plot_density(
        [aa_lengths0, aa_lengths1],
        mytitle='Density plot of exon lengths',
        xlabel='Nucleotide length',
        ylabel='Density',
        labels_lists=['Cassette Exons', 'Constitutive Exons'],
        smoothness=0.1,
        legend_pos=1,
        ymin=0,
        ymax=0.025,
        xmin=0,
        xmax=200)
def main():
    """Print length statistics for two directories of .summary files and plot them.

    Expects exactly two positional command-line arguments: the directory of
    interest and the control directory.  Every file ending in ``.summary``
    in each directory is passed to ``get_aa_length_from_anchor_file`` and
    the returned lengths are pooled per directory.  For each pool the mean,
    median and [min,max] are printed, then both pools are handed to
    ``plot_functions.plot_density``.

    Exits with status 1 when the argument count is wrong or when a
    directory yields no lengths at all.
    """
    usage = 'usage: %prog [opt] directory1 directory2'\
        '\nTwo arguments must be specified in command line:\n'\
        '1) Directory of interest containing .summary files\n'\
        '2) Directory to which to compare containing .summary files (control)\n'
    parser = OptionParser(usage=usage)
    (_, args) = parser.parse_args()

    if len(args) != 2:
        print('Requires 2 arguments to be specified in command line')
        print(usage)
        # Non-zero status so calling scripts can detect the usage error.
        sys.exit(1)
    dir0, dir1 = args

    # get all files containing *.summary in each directory. Those are
    # anchor input files.
    ext = '.summary'
    anchor_files0 = [os.path.join(dir0, f)
                     for f in os.listdir(dir0) if f.endswith(ext)]
    anchor_files1 = [os.path.join(dir1, f)
                     for f in os.listdir(dir1) if f.endswith(ext)]

    # Read each file, pooling the lengths it reports per directory.
    aa_lengths0 = []
    aa_lengths1 = []
    for file0 in anchor_files0:
        aa_lengths0 += get_aa_length_from_anchor_file(file0)
    for file1 in anchor_files1:
        aa_lengths1 += get_aa_length_from_anchor_file(file1)

    # The statistics below divide by and index into each pool, so an empty
    # pool is reported explicitly instead of surfacing as a traceback.
    for directory, aa in zip([dir0, dir1], [aa_lengths0, aa_lengths1]):
        if not aa:
            print('No lengths found in %s files under %s' % (ext, directory))
            sys.exit(1)

    for label, aa in zip(['xeno', 'control'], [aa_lengths0, aa_lengths1]):
        print('Mean for %s' % label)
        print(sum(aa) / float(len(aa)))
        print('Median for %s' % label)
        # NOTE(review): for an even number of values this is the upper of
        # the two middle values, not their average.
        print(sorted(aa)[len(aa) // 2])
        print('[min,max] for %s' % label)
        print('[%s,%s]' % (min(aa), max(aa)))
    plot_functions.plot_density(
        [aa_lengths0, aa_lengths1],
        mytitle='Density plot of exon lengths',
        xlabel='Nucleotide length',
        ylabel='Density',
        labels_lists=['Cassette Exons', 'Constitutive Exons'],
        smoothness=0.1,
        legend_pos=1,
        ymin=0,
        ymax=0.025,
        xmin=0,
        xmax=200)
def main():
    """Plot the positions of MEME motifs over a schematic of exons and introns.

    Takes one positional command-line argument, the path of a pickle that is
    loaded with ``get_dic_from_pklpath``.  The loaded dictionary is read as
    ``meme_dic[event][region][field][0]`` for the fields
    ``motif_relative_start``, ``motif_relative_end`` and ``motif_number``;
    every region name must be a key of the plot settings defined below.

    With ``-r/--plot_raw_locations`` set to True the motifs are drawn with
    ``plot_functions.plot_hline_segments``.  Otherwise (the default) motif
    start positions are grouped per "region:Motif n" label and drawn with
    ``plot_functions.plot_density`` on top of rectangles (exons) and lines
    (introns).

    Exits with status 1 on a wrong argument count or an unrecognised
    ``--plot_raw_locations`` value.
    """
    usage = 'usage: %prog meme_gerp_summary_pkl\n'\
        'Requires one argument:\n'\
        '1) pkl file from summarize_meme_results'
    parser = OptionParser(usage=usage)
    parser.add_option('-r', '--plot_raw_locations', dest='raw_locations',
                      default=False,
                      help='Boolean value. True=horizontal '\
                        'line segment plot. False=density plot')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        print('Requires 1 argument to be specified in commandline')
        print(usage)
        # Non-zero status so calling scripts can detect the usage error.
        sys.exit(1)
    pklpath = args[0]

    # The option arrives as a string from the command line, or as the
    # boolean default when it was not given.
    if options.raw_locations in ['True', 'TRUE', True]:
        raw_locations = True
        print('Plotting raw locations...')
    elif options.raw_locations in ['False', 'FALSE', False]:
        raw_locations = False
        print('Plotting density plot...')
    else:
        print('--plot_raw_locations option must be '\
            'True or False. %s found.' % options.raw_locations)
        sys.exit(1)

    # get dics from pkl
    meme_dic = get_dic_from_pklpath(pklpath)
    print(meme_dic)

    event_count = 0  # used as y-axis locater...
    # x positions at which the regions (and the final exon) start
    offsets = [0, 110, 220, 330, 440]
    # Set Motif 1 to #CC6666, Motif 2 to #33CCCC Motif 3 to "green".
    # The colors you want will depend on the discovered meme motif number;
    # each region's list is indexed by (motif number - 1).
    plot_settings_dic = {
        'intron_1_5p': {
            'offset': offsets[0],
            'color': ['#CC6666', '#CC6666', '#CC6666'],
        },
        'intron_1_3p': {
            'offset': offsets[1],
            'color': ['green', 'black', 'yellow'],
        },
        'intron_2_5p': {
            'offset': offsets[2],
            'color': ['red'],
        },
        'intron_2_3p': {
            'offset': offsets[3],
            'color': ['#33CCCC', 'black'],
        },
    }

    # collect plot information: start, end, color, y position
    # into a plot dic.
    plot_dic = {
        'start': [],
        'end': [],
        'color': [],
        'ypos': [],
        'motif_number': [],
    }
    for event in meme_dic:
        for region in meme_dic[event]:
            motif = meme_dic[event][region]
            motif_number = motif['motif_number'][0]
            settings = plot_settings_dic[region]
            # offset start and end depending on region
            start = motif['motif_relative_start'][0] + settings['offset']
            end = motif['motif_relative_end'][0] + settings['offset']
            try:
                # subtract motif number by 1 to get 0-based numbering
                color = settings['color'][motif_number - 1]
            except IndexError:
                print('%s %s' % (region, motif_number))
                print('Ran out of colors, using yellow as default.')
                color = 'yellow'
            plot_dic['start'].append(start)
            plot_dic['end'].append(end)
            plot_dic['color'].append(color)
            plot_dic['ypos'].append(event_count / 10.0)
            plot_dic['motif_number'].append(
                '%s:Motif %s' % (region, motif_number))
            event_count += 1

    if raw_locations:
        plot_functions.plot_hline_segments(starts=plot_dic['start'],
                                           stops=plot_dic['end'],
                                           ypos=plot_dic['ypos'],
                                           colors=plot_dic['color'],
                                           labels=plot_dic['motif_number'])
        return

    # begin: get lists of starts, colors for density plot
    density_plot_dic = {}
    for motif_key, color, start in zip(plot_dic['motif_number'],
                                       plot_dic['color'],
                                       plot_dic['start']):
        group = density_plot_dic.setdefault(motif_key,
                                            {'densitystarts': []})
        group['densitycolors'] = color
        group['densitystarts'].append(start)
    # Walk the groups in sorted key order so the sequential "Motif n"
    # labels below land on the same group every run instead of following
    # arbitrary dict ordering.
    starts_list = []
    colors_list = []
    for motif_key in sorted(density_plot_dic):
        colors_list.append(density_plot_dic[motif_key]['densitycolors'])
        starts_list.append(density_plot_dic[motif_key]['densitystarts'])
    # add number of sites in labels
    # NOTE(review): labels are numbered by position in starts_list, not by
    # the MEME motif number stored in the group key -- confirm intent.
    labels_with_nsites = [
        '%s (%s sites)' % ('Motif %s' % n, len(startlist))
        for n, startlist in enumerate(starts_list, 1)
    ]
    for startlist in starts_list:
        print('Number of guys: %s' % len(startlist))
        print('Min/Max: %s/%s' % (min(startlist), max(startlist)))
    # end: get lists of starts, colors, labels for density plot

    fig = plt.figure()
    ax = fig.add_subplot(111)

    # add rectangles representing exons
    rect_height = 0.002
    rect_length = 10
    rectstarts = [offsets[0], offsets[2], offsets[4]]
    rcolors = ['cyan', 'yellow', 'cyan']
    for start, color in zip(rectstarts, rcolors):
        patch = add_rectangles(start,
                               height=rect_height,
                               length=rect_length,
                               color=color)
        ax.add_patch(patch)
    # draw intron lines connecting exons
    istarts = offsets[:4]
    iends = [offset - rect_length for offset in offsets[1:]]
    for start, end in zip(istarts, iends):
        plt.hlines(y=-rect_height / 2., xmin=start, xmax=end,
                   color='black', linewidths=1.5)
    # draw vertical lines representing break in intron
    breakstarts = [iends[0], istarts[1], iends[2], istarts[3]]
    for bstart in breakstarts:
        plt.vlines(bstart, ymin=-rect_height, ymax=0,
                   color='black', linewidths=1)
    plot_functions.plot_density(
        values_lists=starts_list,
        labels_lists=labels_with_nsites,
        colors_list=colors_list,
        mytitle='Intronic distribution of MEME motifs',
        xlabel='Genic region',
        ylabel='Density',
        xmin=-20,
        xmax=450,
        smoothness=0.1,
        legend_pos=2,
        ymin=-0.01,
        ymax=0.075,
        showplot=False)
    # dont show xaxis
    plt.setp(ax.get_xticklabels(), visible=False)
    plt.show()
def main():
    """Plot the positions of MEME motifs over a schematic of exons and introns.

    Takes one positional command-line argument, the path of a pickle that is
    loaded with ``get_dic_from_pklpath``.  The loaded dictionary is read as
    ``meme_dic[event][region][field][0]`` for the fields
    ``motif_relative_start``, ``motif_relative_end`` and ``motif_number``;
    every region name must be a key of the plot settings defined below.

    With ``-r/--plot_raw_locations`` set to True the motifs are drawn with
    ``plot_functions.plot_hline_segments``.  Otherwise (the default) motif
    start positions are grouped per "region:Motif n" label and drawn with
    ``plot_functions.plot_density`` on top of rectangles (exons) and lines
    (introns).

    Exits with status 1 on a wrong argument count or an unrecognised
    ``--plot_raw_locations`` value.
    """
    usage = 'usage: %prog meme_gerp_summary_pkl\n'\
        'Requires one argument:\n'\
        '1) pkl file from summarize_meme_results'
    parser = OptionParser(usage=usage)
    parser.add_option('-r', '--plot_raw_locations', dest='raw_locations',
                      default=False,
                      help='Boolean value. True=horizontal '\
                        'line segment plot. False=density plot')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        print('Requires 1 argument to be specified in commandline')
        print(usage)
        # Non-zero status so calling scripts can detect the usage error.
        sys.exit(1)
    pklpath = args[0]

    # The option arrives as a string from the command line, or as the
    # boolean default when it was not given.
    if options.raw_locations in ['True', 'TRUE', True]:
        raw_locations = True
        print('Plotting raw locations...')
    elif options.raw_locations in ['False', 'FALSE', False]:
        raw_locations = False
        print('Plotting density plot...')
    else:
        print('--plot_raw_locations option must be '\
            'True or False. %s found.' % options.raw_locations)
        sys.exit(1)

    # get dics from pkl
    meme_dic = get_dic_from_pklpath(pklpath)
    print(meme_dic)

    event_count = 0  # used as y-axis locater...
    # x positions at which the regions (and the final exon) start
    offsets = [0, 110, 220, 330, 440]
    # Set Motif 1 to #CC6666, Motif 2 to #33CCCC Motif 3 to "green".
    # The colors you want will depend on the discovered meme motif number;
    # each region's list is indexed by (motif number - 1).
    plot_settings_dic = {
        'intron_1_5p': {
            'offset': offsets[0],
            'color': ['#CC6666', '#CC6666', '#CC6666'],
        },
        'intron_1_3p': {
            'offset': offsets[1],
            'color': ['green', 'black', 'yellow'],
        },
        'intron_2_5p': {
            'offset': offsets[2],
            'color': ['red'],
        },
        'intron_2_3p': {
            'offset': offsets[3],
            'color': ['#33CCCC', 'black'],
        },
    }

    # collect plot information: start, end, color, y position
    # into a plot dic.
    plot_dic = {
        'start': [],
        'end': [],
        'color': [],
        'ypos': [],
        'motif_number': [],
    }
    for event in meme_dic:
        for region in meme_dic[event]:
            motif = meme_dic[event][region]
            motif_number = motif['motif_number'][0]
            settings = plot_settings_dic[region]
            # offset start and end depending on region
            start = motif['motif_relative_start'][0] + settings['offset']
            end = motif['motif_relative_end'][0] + settings['offset']
            try:
                # subtract motif number by 1 to get 0-based numbering
                color = settings['color'][motif_number - 1]
            except IndexError:
                print('%s %s' % (region, motif_number))
                print('Ran out of colors, using yellow as default.')
                color = 'yellow'
            plot_dic['start'].append(start)
            plot_dic['end'].append(end)
            plot_dic['color'].append(color)
            plot_dic['ypos'].append(event_count / 10.0)
            plot_dic['motif_number'].append(
                '%s:Motif %s' % (region, motif_number))
            event_count += 1

    if raw_locations:
        plot_functions.plot_hline_segments(starts=plot_dic['start'],
                                           stops=plot_dic['end'],
                                           ypos=plot_dic['ypos'],
                                           colors=plot_dic['color'],
                                           labels=plot_dic['motif_number'])
        return

    # begin: get lists of starts, colors for density plot
    density_plot_dic = {}
    for motif_key, color, start in zip(plot_dic['motif_number'],
                                       plot_dic['color'],
                                       plot_dic['start']):
        group = density_plot_dic.setdefault(motif_key,
                                            {'densitystarts': []})
        group['densitycolors'] = color
        group['densitystarts'].append(start)
    # Walk the groups in sorted key order so the sequential "Motif n"
    # labels below land on the same group every run instead of following
    # arbitrary dict ordering.
    starts_list = []
    colors_list = []
    for motif_key in sorted(density_plot_dic):
        colors_list.append(density_plot_dic[motif_key]['densitycolors'])
        starts_list.append(density_plot_dic[motif_key]['densitystarts'])
    # add number of sites in labels
    # NOTE(review): labels are numbered by position in starts_list, not by
    # the MEME motif number stored in the group key -- confirm intent.
    labels_with_nsites = [
        '%s (%s sites)' % ('Motif %s' % n, len(startlist))
        for n, startlist in enumerate(starts_list, 1)
    ]
    for startlist in starts_list:
        print('Number of guys: %s' % len(startlist))
        print('Min/Max: %s/%s' % (min(startlist), max(startlist)))
    # end: get lists of starts, colors, labels for density plot

    fig = plt.figure()
    ax = fig.add_subplot(111)

    # add rectangles representing exons
    rect_height = 0.002
    rect_length = 10
    rectstarts = [offsets[0], offsets[2], offsets[4]]
    rcolors = ['cyan', 'yellow', 'cyan']
    for start, color in zip(rectstarts, rcolors):
        patch = add_rectangles(start,
                               height=rect_height,
                               length=rect_length,
                               color=color)
        ax.add_patch(patch)
    # draw intron lines connecting exons
    istarts = offsets[:4]
    iends = [offset - rect_length for offset in offsets[1:]]
    for start, end in zip(istarts, iends):
        plt.hlines(y=-rect_height / 2., xmin=start, xmax=end,
                   color='black', linewidths=1.5)
    # draw vertical lines representing break in intron
    breakstarts = [iends[0], istarts[1], iends[2], istarts[3]]
    for bstart in breakstarts:
        plt.vlines(bstart, ymin=-rect_height, ymax=0,
                   color='black', linewidths=1)
    plot_functions.plot_density(
        values_lists=starts_list,
        labels_lists=labels_with_nsites,
        colors_list=colors_list,
        mytitle='Intronic distribution of MEME motifs',
        xlabel='Genic region',
        ylabel='Density',
        xmin=-20,
        xmax=450,
        smoothness=0.1,
        legend_pos=2,
        ymin=-0.01,
        ymax=0.075,
        showplot=False)
    # dont show xaxis
    plt.setp(ax.get_xticklabels(), visible=False)
    plt.show()
Exemplo n.º 7
0
def main():
    """Compare GERP scores of motifs with and without tomtom (RBP) hits.

    Expects four positional command-line arguments: the tab-delimited
    summary text file, the inclusion fasta file, the exclusion fasta file
    and the MEME results directory.  For every row and every region found
    by ``get_regions(header)``, the non-empty ``<region>:avg_rs_score``
    value is collected under the key ``"Motif <number> <incl_or_excl>"``.
    Scores are then split according to whether
    ``create_tomtom_key(motif_id, region)`` is present in the dictionary
    returned by ``miso_events.get_tomtom_hits`` and each group is drawn
    with ``plot_functions.plot_density``.

    Exits with status 1 when fewer than four arguments are given.
    """
    usage = 'usage: %prog meme_summary_filepath inclusion_fasta '\
        'exclusion_fasta meme_dir\n'\
        'Requires four input arguments:\n'\
        '1) textfile output from '\
            'summarize_meme_results_with_gerp_scores\n'\
        '2) inclusion fasta file\n'\
        '3) exclusion fasta file\n'\
        '4) meme dir containing meme results'
    parser = OptionParser(usage=usage)
    (_, args) = parser.parse_args()

    # Only args[0..3] are consumed, so four arguments are sufficient.
    if len(args) < 4:
        print('Four arguments need to be specified in command line.\n')
        print(usage)
        # Non-zero status so calling scripts can detect the usage error.
        sys.exit(1)
    meme_summarypath = args[0]
    incl_fasta = args[1]
    excl_fasta = args[2]
    meme_dir = args[3]

    # define column name suffix (string after colon in colname)
    gerp_str = 'avg_rs_score'
    motif_numb_str = 'motif_number'
    # miso event has no col name suffix, this is entire colname
    miso_colname = 'miso_event'
    # define rel path to tomtom files from meme dir
    rel_path = os.path.join('rbp_matches', 'candidate_rbps.txt')
    # define plot title
    mytitle = 'GERP Score Comparison: Hits vs Non-Hits'

    # get dictionary containing inclusion and exclusion for miso event
    incl_excl_dic = miso_events.get_inclusion_exclusion(incl_file=incl_fasta,
                                                        excl_file=excl_fasta)
    tomtom_dic = miso_events.get_tomtom_hits(meme_dir, rel_path)
    region_gerp_scores = {}  # gerp scores, indexed by region then motif id.
    # 'rb' is the mode the Python 2 csv module expects.
    with open(meme_summarypath, 'rb') as readfile:
        myreader = csv.reader(readfile, delimiter='\t')
        header = next(myreader)
        regions = get_regions(header)

        # Resolve column positions once instead of searching the header
        # for every row and region.
        miso_idx = header.index(miso_colname)
        region_columns = []
        for region in regions:
            region_gerp_scores[region] = {}
            region_columns.append(
                (region,
                 header.index(':'.join([region, gerp_str])),
                 header.index(':'.join([region, motif_numb_str]))))

        for row in myreader:
            incl_or_excl = incl_excl_dic[row[miso_idx]]
            for region, gerp_idx, motif_idx in region_columns:
                gerp_score = row[gerp_idx]
                # beware of empty values: compare by value, not identity.
                if gerp_score == '':
                    continue
                motif_id = ' '.join(['Motif', row[motif_idx], incl_or_excl])
                region_gerp_scores[region].setdefault(
                    motif_id, []).append(float(gerp_score))

    # Split the collected scores by whether the motif has a tomtom hit.
    avg_scores_in_tomtom = []
    avg_scores_not_in_tomtom = []
    for region in region_gerp_scores:
        for motif_id in region_gerp_scores[region]:
            tomtom_key = create_tomtom_key(motif_id, region)
            if tomtom_key in tomtom_dic:
                avg_scores_in_tomtom += region_gerp_scores[region][motif_id]
            else:
                avg_scores_not_in_tomtom += \
                    region_gerp_scores[region][motif_id]

    for avg_scores, mylabel in zip(
            [avg_scores_in_tomtom, avg_scores_not_in_tomtom],
            ['Motif with matching RBPs', 'Motif without matching RBPs']):
        plot_functions.plot_density(avg_scores,
                                    mytitle=mytitle,
                                    mylabel=mylabel)
    plt.legend()
    plt.show()
def main():
    """Compare conservation of MEME motifs against control regions.

    Expects two positional command-line arguments: the non-null pickle and
    the null-mode pickle.  Both are loaded with ``get_dic_from_pklpath`` and
    reduced to score collections by ``get_gerp_scores`` (key
    ``avg_rs_score``).  The two collections are drawn with
    ``plot_functions.plot_density``, the number of conserved entries in
    each is obtained from ``gerp_utilities.conserved_regions`` at the
    ``-t/--threshold`` value, a Fisher's exact test is run on the resulting
    2x2 table, and the conserved fractions are drawn with
    ``plot_functions.plot_barplot``.

    Exits with status 1 when fewer than two arguments are given or when
    either score collection is empty; an unparsable option value is
    rejected by ``optparse`` itself.
    """
    usage = 'usage: %prog non_null_pklpath null_pklpath\n'\
        'Requires two input arguments:\n'\
        '1) pkl file from summarize_meme_results: non-null\n'\
        '2) pkl file from summarize_meme_results: null-mode\n'
    parser = OptionParser(usage=usage)
    # type='float' lets optparse reject a non-numeric threshold with a
    # usage error instead of a ValueError traceback later on.
    parser.add_option('-t', '--threshold', dest='score_threshold',
                      type='float',
                      default=2.0,
                      help='Float, threshold for what one considers conserved.')
    parser.add_option('-y', '--ymax', dest='ymax',
                      type='float',
                      default=0.03,
                      help='Y max for density plot')
    (options, args) = parser.parse_args()
    if len(args) < 2:
        print('Two arguments need to be specified in command line.\n')
        print(usage)
        # Non-zero status so calling scripts can detect the usage error.
        sys.exit(1)
    non_null_pklpath = args[0]
    null_pklpath = args[1]
    score_threshold = options.score_threshold

    # get dics from pkl
    non_null_dic = get_dic_from_pklpath(non_null_pklpath)
    null_dic = get_dic_from_pklpath(null_pklpath)

    non_null_gerp_scores = get_gerp_scores(non_null_dic,
                                           gerpkey='avg_rs_score')
    null_gerp_scores = get_gerp_scores(null_dic, gerpkey='avg_rs_score')

    plot_functions.plot_density([non_null_gerp_scores, null_gerp_scores],
                                mytitle='Density plot of conservation scores',
                                labels_lists=['MEME motifs', 'Controls'],
                                xlabel='GERP conservation score',
                                ylabel='Density',
                                xmin=-4, xmax=4,
                                ymax=options.ymax,
                                smoothness=0.15,
                                drawvline=score_threshold)

    # find how many conserved regions are in each.
    n_conserved_in_meme = gerp_utilities.conserved_regions(
        non_null_gerp_scores, fraction=False, threshold=score_threshold)
    n_conserved_in_null = gerp_utilities.conserved_regions(
        null_gerp_scores, fraction=False, threshold=score_threshold)
    n_total_in_meme = len(non_null_gerp_scores)
    n_total_in_null = len(null_gerp_scores)
    n_not_conserved_in_meme = n_total_in_meme - n_conserved_in_meme
    n_not_conserved_in_null = n_total_in_null - n_conserved_in_null

    print('Threshold: %s' % score_threshold)
    print('Number of conserved elements: %s' % n_conserved_in_meme)
    print('Number of conserved elements found in control: %s'
          % n_conserved_in_null)

    # Perform fisher's exact test
    oddsratio, pvalue = fisher_exact(
        [[n_conserved_in_meme, n_conserved_in_null],
         [n_not_conserved_in_meme, n_not_conserved_in_null]])
    print('Fishers Exact Test, Oddsratio: %s. Pvalue: %s'
          % (oddsratio, pvalue))

    # The fractions below divide by the totals; report an empty input
    # explicitly rather than failing with ZeroDivisionError.
    if n_total_in_meme == 0 or n_total_in_null == 0:
        print('Cannot compute conserved fractions: '
              'no GERP scores found in at least one input.')
        sys.exit(1)

    # plot distributions
    mylabels = ['Meme motifs', 'Control region']
    mytitle = 'Fraction of elements conserved compared to control region'
    # Plot bargraphs
    frac_conserved_meme = float(n_conserved_in_meme) / n_total_in_meme
    frac_conserved_null = float(n_conserved_in_null) / n_total_in_null
    myvals = [frac_conserved_meme, frac_conserved_null]
    plot_functions.plot_barplot(
        myvals, mytitle, mylabels,
        ylabel='Fraction of elements conserved',
        mytext1="%i/%i" % (n_conserved_in_meme, n_total_in_meme),
        mytext2='%i/%i' % (n_conserved_in_null, n_total_in_null),
        mytext3="*Fisher's Exact Test\nP-value=%.2e" % pvalue,
        ymin=0,
        ymax=1,
        width=0.5)
    plt.show()
def main():
    """Compare GERP scores of motifs with and without tomtom (RBP) hits.

    Expects four positional command-line arguments: the tab-delimited
    summary text file, the inclusion fasta file, the exclusion fasta file
    and the MEME results directory.  For every row and every region found
    by ``get_regions(header)``, the non-empty ``<region>:avg_rs_score``
    value is collected under the key ``"Motif <number> <incl_or_excl>"``.
    Scores are then split according to whether
    ``create_tomtom_key(motif_id, region)`` is present in the dictionary
    returned by ``miso_events.get_tomtom_hits`` and each group is drawn
    with ``plot_functions.plot_density``.

    Exits with status 1 when fewer than four arguments are given.
    """
    usage = 'usage: %prog meme_summary_filepath inclusion_fasta '\
        'exclusion_fasta meme_dir\n'\
        'Requires four input arguments:\n'\
        '1) textfile output from '\
            'summarize_meme_results_with_gerp_scores\n'\
        '2) inclusion fasta file\n'\
        '3) exclusion fasta file\n'\
        '4) meme dir containing meme results'
    parser = OptionParser(usage=usage)
    (_, args) = parser.parse_args()

    # Only args[0..3] are consumed, so four arguments are sufficient.
    if len(args) < 4:
        print('Four arguments need to be specified in command line.\n')
        print(usage)
        # Non-zero status so calling scripts can detect the usage error.
        sys.exit(1)
    meme_summarypath = args[0]
    incl_fasta = args[1]
    excl_fasta = args[2]
    meme_dir = args[3]

    # define column name suffix (string after colon in colname)
    gerp_str = 'avg_rs_score'
    motif_numb_str = 'motif_number'
    # miso event has no col name suffix, this is entire colname
    miso_colname = 'miso_event'
    # define rel path to tomtom files from meme dir
    rel_path = os.path.join('rbp_matches', 'candidate_rbps.txt')
    # define plot title
    mytitle = 'GERP Score Comparison: Hits vs Non-Hits'

    # get dictionary containing inclusion and exclusion for miso event
    incl_excl_dic = miso_events.get_inclusion_exclusion(incl_file=incl_fasta,
                                                        excl_file=excl_fasta)
    tomtom_dic = miso_events.get_tomtom_hits(meme_dir, rel_path)
    region_gerp_scores = {}  # gerp scores, indexed by region then motif id.
    # 'rb' is the mode the Python 2 csv module expects.
    with open(meme_summarypath, 'rb') as readfile:
        myreader = csv.reader(readfile, delimiter='\t')
        header = next(myreader)
        regions = get_regions(header)

        # Resolve column positions once instead of searching the header
        # for every row and region.
        miso_idx = header.index(miso_colname)
        region_columns = []
        for region in regions:
            region_gerp_scores[region] = {}
            region_columns.append(
                (region,
                 header.index(':'.join([region, gerp_str])),
                 header.index(':'.join([region, motif_numb_str]))))

        for row in myreader:
            incl_or_excl = incl_excl_dic[row[miso_idx]]
            for region, gerp_idx, motif_idx in region_columns:
                gerp_score = row[gerp_idx]
                # beware of empty values: compare by value, not identity.
                if gerp_score == '':
                    continue
                motif_id = ' '.join(['Motif', row[motif_idx], incl_or_excl])
                region_gerp_scores[region].setdefault(
                    motif_id, []).append(float(gerp_score))

    # Split the collected scores by whether the motif has a tomtom hit.
    avg_scores_in_tomtom = []
    avg_scores_not_in_tomtom = []
    for region in region_gerp_scores:
        for motif_id in region_gerp_scores[region]:
            tomtom_key = create_tomtom_key(motif_id, region)
            if tomtom_key in tomtom_dic:
                avg_scores_in_tomtom += region_gerp_scores[region][motif_id]
            else:
                avg_scores_not_in_tomtom += \
                    region_gerp_scores[region][motif_id]

    for avg_scores, mylabel in zip(
            [avg_scores_in_tomtom, avg_scores_not_in_tomtom],
            ['Motif with matching RBPs', 'Motif without matching RBPs']):
        plot_functions.plot_density(avg_scores,
                                    mytitle=mytitle,
                                    mylabel=mylabel)
    plt.legend()
    plt.show()
Exemplo n.º 10
0
def main():
    """Compare conservation of MEME motifs against control regions.

    Expects two positional command-line arguments: the non-null pickle and
    the null-mode pickle.  Both are loaded with ``get_dic_from_pklpath`` and
    reduced to score collections by ``get_gerp_scores`` (key
    ``avg_rs_score``).  The two collections are drawn with
    ``plot_functions.plot_density``, the number of conserved entries in
    each is obtained from ``gerp_utilities.conserved_regions`` at the
    ``-t/--threshold`` value, a Fisher's exact test is run on the resulting
    2x2 table, and the conserved fractions are drawn with
    ``plot_functions.plot_barplot``.

    Exits with status 1 when fewer than two arguments are given or when
    either score collection is empty; an unparsable option value is
    rejected by ``optparse`` itself.
    """
    usage = 'usage: %prog non_null_pklpath null_pklpath\n'\
        'Requires two input arguments:\n'\
        '1) pkl file from summarize_meme_results: non-null\n'\
        '2) pkl file from summarize_meme_results: null-mode\n'
    parser = OptionParser(usage=usage)
    # type='float' lets optparse reject a non-numeric threshold with a
    # usage error instead of a ValueError traceback later on.
    parser.add_option('-t', '--threshold', dest='score_threshold',
                      type='float',
                      default=2.0,
                      help='Float, threshold for what one considers conserved.')
    parser.add_option('-y', '--ymax', dest='ymax',
                      type='float',
                      default=0.03,
                      help='Y max for density plot')
    (options, args) = parser.parse_args()
    if len(args) < 2:
        print('Two arguments need to be specified in command line.\n')
        print(usage)
        # Non-zero status so calling scripts can detect the usage error.
        sys.exit(1)
    non_null_pklpath = args[0]
    null_pklpath = args[1]
    score_threshold = options.score_threshold

    # get dics from pkl
    non_null_dic = get_dic_from_pklpath(non_null_pklpath)
    null_dic = get_dic_from_pklpath(null_pklpath)

    non_null_gerp_scores = get_gerp_scores(non_null_dic,
                                           gerpkey='avg_rs_score')
    null_gerp_scores = get_gerp_scores(null_dic, gerpkey='avg_rs_score')

    plot_functions.plot_density([non_null_gerp_scores, null_gerp_scores],
                                mytitle='Density plot of conservation scores',
                                labels_lists=['MEME motifs', 'Controls'],
                                xlabel='GERP conservation score',
                                ylabel='Density',
                                xmin=-4, xmax=4,
                                ymax=options.ymax,
                                smoothness=0.15,
                                drawvline=score_threshold)

    # find how many conserved regions are in each.
    n_conserved_in_meme = gerp_utilities.conserved_regions(
        non_null_gerp_scores, fraction=False, threshold=score_threshold)
    n_conserved_in_null = gerp_utilities.conserved_regions(
        null_gerp_scores, fraction=False, threshold=score_threshold)
    n_total_in_meme = len(non_null_gerp_scores)
    n_total_in_null = len(null_gerp_scores)
    n_not_conserved_in_meme = n_total_in_meme - n_conserved_in_meme
    n_not_conserved_in_null = n_total_in_null - n_conserved_in_null

    print('Threshold: %s' % score_threshold)
    print('Number of conserved elements: %s' % n_conserved_in_meme)
    print('Number of conserved elements found in control: %s'
          % n_conserved_in_null)

    # Perform fisher's exact test
    oddsratio, pvalue = fisher_exact(
        [[n_conserved_in_meme, n_conserved_in_null],
         [n_not_conserved_in_meme, n_not_conserved_in_null]])
    print('Fishers Exact Test, Oddsratio: %s. Pvalue: %s'
          % (oddsratio, pvalue))

    # The fractions below divide by the totals; report an empty input
    # explicitly rather than failing with ZeroDivisionError.
    if n_total_in_meme == 0 or n_total_in_null == 0:
        print('Cannot compute conserved fractions: '
              'no GERP scores found in at least one input.')
        sys.exit(1)

    # plot distributions
    mylabels = ['Meme motifs', 'Control region']
    mytitle = 'Fraction of elements conserved compared to control region'
    # Plot bargraphs
    frac_conserved_meme = float(n_conserved_in_meme) / n_total_in_meme
    frac_conserved_null = float(n_conserved_in_null) / n_total_in_null
    myvals = [frac_conserved_meme, frac_conserved_null]
    plot_functions.plot_barplot(
        myvals, mytitle, mylabels,
        ylabel='Fraction of elements conserved',
        mytext1="%i/%i" % (n_conserved_in_meme, n_total_in_meme),
        mytext2='%i/%i' % (n_conserved_in_null, n_total_in_null),
        mytext3="*Fisher's Exact Test\nP-value=%.2e" % pvalue,
        ymin=0,
        ymax=1,
        width=0.5)
    plt.show()