Пример #1
0
def get_t_p_values_dict_for_subset(samples_dict, otu_library, sample_map_file_path, ranks = None, real_abundance = False):
    """Order the OTUs of every rank for a subset of samples and attach t-test results.

    For every rank, each OTU listed in `otu_library` gets one value per sample:
    the raw abundance when `real_abundance` is true, otherwise the percent
    abundance relative to the sample's 'tr' entry (0.0 when 'tr' is 0 or when
    the sample does not contain the OTU). OTUs are then ordered, descending,
    by the largest per-group mean of those values. Storing this order helps
    to decide in what order dot plots should be shown to the researcher.

    OTUs that are present in the otu library but have no value within this
    particular subset of samples are discarded for that subset.

    Parameters:
        samples_dict: mapping of sample name to a dict that holds one
            {otu: abundance} dict per rank plus a 'tr' entry used as the
            denominator for percent abundance (presumably total reads --
            confirm against the code that builds it).
        otu_library: iterable of indexable entries; the entry position of a
            rank is its index in the *reversed* `ranks` list.
        sample_map_file_path: path handed to
            helper_functions.get_groups_colors_from_sample_map_file to learn
            which samples belong to which group.
        ranks: list of rank names. It is not modified by this function.
        real_abundance: use raw abundance values instead of percentages.

    Returns:
        dict mapping every rank to a list of (otu, fs_compatible_otu, t, p)
        tuples in display order. `t` and `p` are strings formatted with
        "%.2f" when there are exactly two groups and at least one of them
        has more than one sample; otherwise (or when the t-test yields
        None) they are None.

    Raises:
        ValueError: if `ranks` is None.
    """
    if ranks is None:
        raise ValueError("ranks must be a list of rank names, got None")

    # because the way we create otu_library, phylum is the first and genus is the last.
    # opposite of what we have in constants. Work on a reversed *copy*: reversing the
    # caller's list in place would flip the order back on every second call.
    ranks = list(reversed(ranks))

    # the sample map does not depend on the rank, read it only once.
    sample_groups, group_colors = helper_functions.get_groups_colors_from_sample_map_file(sample_map_file_path)

    # has more than one sample in at least one group (if every group has only one sample
    # t-test would fail). every sample contributes exactly one value to its group's
    # vector, so this can be derived from the group sizes alone.
    has_enough_samples = sum(len(members) - 1 for members in sample_groups.values()) > 0
    use_t_test = len(sample_groups) == 2 and has_enough_samples

    otu_t_p_tuples_dict = {}

    for rank in ranks:
        rank_index = ranks.index(rank)
        scored_otus = []

        for otu in set(entry[rank_index] for entry in otu_library):
            otu_vectors = {}
            for group in sample_groups.keys():
                vector = []
                for sample in sample_groups[group]:
                    sample_info = samples_dict[sample]
                    if otu not in sample_info[rank]:
                        vector.append(0.0)
                    elif real_abundance:
                        vector.append(sample_info[rank][otu])
                    elif sample_info['tr'] == 0:
                        vector.append(0.0)
                    else:
                        vector.append(sample_info[rank][otu] * 100.0 / sample_info['tr'])
                otu_vectors[group] = vector

            vectors = list(otu_vectors.values())

            # skip OTUs that are not present at least once in any group
            if not sum(sum(vector) for vector in vectors) > 0.0:
                continue

            sorting_assist = max(numpy.mean(vector) for vector in vectors)

            if use_t_test:
                # t-test fails when there is no variance. ex, t_test([5.0,5.0], [0.0,0.0]) is None, None.
                # nudging the first value of every vector by a very very small random number
                # seemed okay.
                for vector in vectors:
                    vector[0] += r.random() * 1e-6
                t, p = t_test(vectors[0], vectors[1])
                scored_otus.append((abs(sorting_assist), otu, t, p))
            else:
                # not exactly two groups (or too few samples), just return nothing for now
                # ANOVA could be used for the rest
                scored_otus.append((sorting_assist, otu, None, None))

        # sorting the list based on the temporary sorting assist value
        scored_otus.sort(reverse = True)

        rank_entries = []
        for sorting_assist, otu, t, p in scored_otus:
            otu_fs = helper_functions.get_fs_compatible_name(otu)
            # t_test may still hand back None; do not try to format that as a float
            t_text = "%.2f" % t if t is not None else None
            p_text = "%.2f" % p if p is not None else None
            rank_entries.append((otu, otu_fs, t_text, p_text))
        otu_t_p_tuples_dict[rank] = rank_entries

    return otu_t_p_tuples_dict
Пример #2
0
def otu_confidence_analysis(rdp_output_file, save_path, seperator, samples, rank = "genus"):
    """Draw one box plot figure per OTU showing its classification confidence in every sample.

    The tab separated file `rdp_output_file` (presumably RDP classifier
    output -- confirm) is read line by line. The sample name is the part of
    the first column before `seperator`; the OTU name is taken from a rank
    specific column and its confidence from two columns to the right of it.
    Lines with an empty OTU name, or from samples not listed in `samples`,
    are ignored.

    Parameters:
        rdp_output_file: path of the tab separated input file.
        save_path: directory for the figures and the ordering info. When it
            is empty/None figures are shown with pylab.show() instead and no
            ordering info is written.
        seperator: string that separates the sample name from the rest of
            the first column.
        samples: sample names to include; also the order of the boxes on
            the x axis.
        rank: one of "genus", "family", "order", "class", "phylum".

    Returns:
        None. With `save_path` set, "<rank>_<otu>_rdp_confidence.png" files
        are written and the (otu, figure file name) pairs, ordered by the
        summed confidence of the OTU (descending), are stored through
        write_samples_dictionary.

    Raises:
        KeyError: if `rank` is not one of the known ranks.
        IndexError, ValueError: if a relevant line has too few columns or a
            non numeric confidence value.
    """
    # silly magic numbers.. one day I'll be very sorry for not storing them in one location.
    # please don't hate me; just consider fixing it :p
    # (column index of the OTU name for every rank)
    otu_loc_in_rdp_output = {
        "genus": 20,
        "family": 17,
        "order": 14,
        "class": 11,
        "phylum": 8,
    }
    otu_loc = otu_loc_in_rdp_output[rank]

    known_samples = set(samples)
    otu_rdp_confidence_dict = {}

    # fill the information into a dictionary with one pass.
    with open(rdp_output_file) as rdp_output:
        for line in rdp_output:
            if not line.strip():
                # e.g. a trailing blank line
                continue

            s = line.split('\t')
            sample = s[0].split(seperator)[0]
            otu = s[otu_loc]

            if not otu or sample not in known_samples:
                continue

            # parse the confidence only for lines that are actually used, so an
            # unparsable value on an ignored line cannot abort the whole run.
            rdp_confidence = float(s[otu_loc + 2].strip())
            otu_rdp_confidence_dict.setdefault(otu, {}).setdefault(sample, []).append(rdp_confidence)

    # now, fix the dict: if a sample doesn't have any sequences identified as a particular OTU,
    # lets put a single 0 confidence for that OTU in the dict. it will make things a little
    # easier in a second.
    for confidences_by_sample in otu_rdp_confidence_dict.values():
        for sample in samples:
            confidences_by_sample.setdefault(sample, [0])

    otu_rdp_confidence_tuples_list = [] # list of tuples: (summed_confidence, otu_name, otu_image_fname)

    # figure geometry does not depend on the OTU
    max_val = 1.0
    max_val = max_val + max_val * 10 / 100
    width = min(max(len(samples) // 5, 5), 15)

    # now we're gonna create figures, and save them. also we'll use a semi-smart way to sort OTU's
    # to show them in a particular order.
    for otu in otu_rdp_confidence_dict:
        confidences_by_sample = otu_rdp_confidence_dict[otu]
        figure_file_name = rank + "_" + helper_functions.get_fs_compatible_name(otu) + '_rdp_confidence.png'

        # sum of all confidence values of the OTU; only used as the sorting key
        summed_confidence = sum(sum(values) for values in confidences_by_sample.values())
        otu_rdp_confidence_tuples_list.append((summed_confidence, otu, figure_file_name))

        values_for_boxplots = [confidences_by_sample[sample] for sample in samples]

        # we're ready to generate figures..
        fig = pylab.figure(figsize=(width, 4))

        pylab.rcParams['font.size'] = 8.0
        pylab.rcParams.update({'axes.linewidth' : 0, 'axes.axisbelow': False})
        pylab.rc('grid', color='0.50', linestyle='-', linewidth=0.1)
        pylab.grid(True)

        for position, confidences in enumerate(values_for_boxplots):
            b = pylab.boxplot(confidences, positions=[position], sym=',', widths=0.3)
            pylab.setp(b['medians'], color='black')
            for part in ('whiskers', 'boxes', 'fliers', 'caps'):
                pylab.setp(b[part], color='black', alpha=0.9)

        # set limits and ticks once, after the last boxplot call. limits are passed
        # positionally because the xmin/xmax/ymin/ymax keywords are gone in newer matplotlib.
        pylab.xlim(-0.75, len(samples) - 0.15)
        pylab.ylim(-max_val * 10 / 100, max_val)
        pylab.xticks(pylab.arange(len(samples)), samples, rotation=90)

        if save_path:
            pylab.savefig(os.path.join(save_path, figure_file_name))
        else:
            pylab.show()

        # clean memory (best effort)
        try:
            fig.clf()
        except Exception:
            pass
        pylab.close('all')

    otu_rdp_confidence_tuples_list.sort(reverse = True)

    if save_path:
        # get rid of the first entry -which was used for sorting- and store the ordering info
        # FIXME: could you please put data serializing and de-serializing function into the
        # helper_functions.py
        write_samples_dictionary(os.path.join(save_path, rank + "_rdp_confidence_ordering_info"), [(x[1], x[2]) for x in otu_rdp_confidence_tuples_list])
Пример #3
0
def generate(samples_dict, otu_t_p_tuples_dict, sample_map_file_path, rank = "genus", save_dir = None, is_transparent = False, real_abundance = False):
    """Draw a dot plot with box plots for every OTU of `rank`, one column per sample group.

    For every OTU listed in `otu_t_p_tuples_dict[rank]` the value of each
    sample is the raw abundance when `real_abundance` is true (plotted on a
    log scale), otherwise the percent abundance relative to the sample's
    'tr' entry. Samples that do not contain the OTU, or whose 'tr' is 0,
    count as 0.0. A line "<otu>,<presence of group 1>,..." is printed for
    every OTU, where presence is the percentage of samples in the group
    with a plotted value above 0.01.

    Parameters:
        samples_dict: mapping of sample name to a dict that holds one
            {otu: abundance} dict per rank plus a 'tr' entry.
        otu_t_p_tuples_dict: mapping of rank to a list of tuples whose first
            item is the OTU name; its order is the plotting order.
        sample_map_file_path: path handed to
            helper_functions.get_groups_colors_from_sample_map_file to get
            the sample groups and their colors.
        rank: rank to plot.
        save_dir: directory for the output files. When it is empty/None the
            figures are shown with pylab.show() instead.
        is_transparent: passed to pylab.savefig as `transparent`.
        real_abundance: plot raw abundance on a log scale instead of
            percentages.

    Returns:
        None. With `save_dir` set, a ".png" figure and a tab separated
        ".txt" file (group, sample, value) are written per OTU; both names
        get a "_real_abundance" suffix in real abundance mode.
    """
    sample_groups, group_colors = helper_functions.get_groups_colors_from_sample_map_file(sample_map_file_path)

    if real_abundance:
        # if we're gonna work with real abundance, we need to find out about the
        # ymax of the y axis. to do that, first, we learn the max abundance, then,
        # find out the smallest power of 10 that is larger than max_abundance..
        max_abundance = helper_functions.get_largest_abundance_number_in_all_samples(samples_dict)
        max_y = 1
        while max_y <= max_abundance:
            max_y *= 10

    for otu in [t[0] for t in otu_t_p_tuples_dict[rank]]:
        txt_lines = []
        plot_dict = {}
        for group in sample_groups.keys():
            points = plot_dict[group] = []
            for sample in sample_groups[group]:
                sample_info = samples_dict[sample]
                has_value = otu in sample_info[rank] and (real_abundance or sample_info['tr'] != 0)
                if has_value:
                    value = sample_info[rank][otu]
                    if not real_abundance:
                        value = value * 100.0 / sample_info['tr']
                    points.append([value, sample])
                    txt_lines.append('%s\t%s\t%f\n' % (group, sample, value))
                else:
                    # OTU missing from the sample, or nothing to normalize with ('tr' is 0):
                    # both are recorded as a zero point so every sample shows up in the plot.
                    points.append([0.0, sample])
                    txt_lines.append('%s\t%s\t0.0\n' % (group, sample))
        txt_output = ''.join(txt_lines)

        fig = pylab.figure(figsize=(3, 6))
        if real_abundance:
            ax = pylab.axes()

        pylab.rcParams['axes.titlesize'] = 12.0
        pylab.rcParams['font.size'] = 8.0

        pylab.rcParams.update({'axes.linewidth' : 0, 'axes.axisbelow': False})
        pylab.rc('grid', color='0.50', linestyle='-', linewidth=0.1)
        pylab.grid(True)
        pylab.title(otu)

        keys = helper_functions.sorted_copy(list(plot_dict.keys()))

        presence = []

        for i, key in enumerate(keys):
            points = plot_dict[key]
            if real_abundance:
                # if abundance is 0.0, make it 1 so it would look better on log scale
                for point in points:
                    if point[0] < 1:
                        point[0] = 1.0

            values = [point[0] for point in points]

            # NOTE(review): in real abundance mode the values were just clamped to >= 1,
            # so presence is always 100.000 there -- confirm whether that is intended.
            presence.append('%.3f' % (len([v for v in values if v > 0.01]) * 100.0 / len(values)))

            # scattering the samples in X axis, so it would be easier to see them when there are a bunch of them
            # at the same spot. instead of this, i * len(plot_dict[key]) could be used to plot them.
            x_positions = [((1 - (r.gauss(100, 3) / 100)) + i) for _ in values]

            pylab.plot(x_positions, values, 'o', color = group_colors[key], ms = 10, mew = 0.6, alpha = .5)

            b = pylab.boxplot(values, positions=[i + 0.35], sym=',', widths=0.2)
            pylab.setp(b['medians'], color=group_colors[key])
            for part in ('whiskers', 'boxes', 'fliers', 'caps'):
                pylab.setp(b[part], color='black', alpha=0.3)

        # limits are passed positionally: the xmin/xmax/ymin/ymax keywords are gone in
        # newer matplotlib while positional limits work everywhere.
        if real_abundance:
            ax.set_yscale('log')
            formatter = pylab.FuncFormatter(log_10_fix)
            ax.yaxis.set_major_formatter(formatter)

            pylab.xlim(-0.75, len(plot_dict) - 0.15)
            pylab.xticks(pylab.arange(len(plot_dict)), keys, rotation=90)
            pylab.ylim(1e-1, max_y)
        else:
            pylab.ylim(-5, 105)
            pylab.xlim(-0.75, len(plot_dict) - 0.15)
            pylab.xticks(pylab.arange(len(plot_dict)), keys, rotation=90)
            pylab.yticks(pylab.arange(0, 101, 10))

        # single parenthesized argument: prints the same text on python 2 and 3
        print('%s,%s' % (otu, ','.join(presence)))

        if not save_dir:
            pylab.show()
        else:
            base_name = rank + "_" + helper_functions.get_fs_compatible_name(otu)
            if real_abundance:
                base_name += '_real_abundance'
            pylab.savefig(os.path.join(save_dir, base_name + '.png'), transparent = is_transparent)
            with open(os.path.join(save_dir, base_name + '.txt'), 'w') as txt_file:
                txt_file.write(txt_output)

        # clean memory (best effort)
        try:
            fig.clf()
        except Exception:
            pass
        pylab.close('all')