Exemplo n.º 1
0
 def test_t_two_sample_switch(self):
     """t_two_sample should cope with a sample that holds a single item.

     The same (t, p) pair is expected whichever argument carries the
     single observation; two single-item samples give (None, None).
     """
     expected = (-1.5637254, 0.1929248)
     many = array([4.02, 3.88, 3.34, 3.87, 3.18])
     single = array([3.02])
     # the lone observation may be passed first or second
     self.assertFloatEqual(t_two_sample(single, many), expected)
     self.assertFloatEqual(t_two_sample(many, single), expected)
     # no test is possible when both samples have a single item
     self.assertEqual(t_two_sample(single, single), (None, None))
Exemplo n.º 2
0
 def test_t_two_sample_switch(self):
     """A one-item sample is accepted by t_two_sample in either position.

     Both argument orders must give the same (t, p) pair, and comparing
     the one-item sample with itself must give (None, None).
     """
     lone = array([3.02])
     group = array([4.02, 3.88, 3.34, 3.87, 3.18])
     t_and_p = (-1.5637254, 0.1929248)
     self.assertFloatEqual(t_two_sample(lone, group), t_and_p)
     self.assertFloatEqual(t_two_sample(group, lone), t_and_p)
     # with a single item on both sides the test cannot be carried out
     self.assertEqual(t_two_sample(lone, lone), (None, None))
Exemplo n.º 3
0
def oneTrial(n,f=np.random.normal):
    """Return the fraction of 500 t-tests on random samples with p < 0.05.

    All values are drawn with a single call f(loc=50, scale=3, size=...)
    and cut into 500 consecutive pairs of samples holding n values each.
    Every pair is compared with stats.t_two_sample.
    """
    trials = 500                   # number of individual tests
    total = n * 2 * trials         # two samples of n values per test
    values = f(loc=50, scale=3, size=total)
    significant = 0
    for start in range(0, total, n * 2):
        middle = start + n
        first = values[start:middle]
        second = values[middle:start + 2 * n]
        t, prob = stats.t_two_sample(first, second)
        if prob < 0.05:
            significant += 1
    # 1.0 * ... keeps the division a float division
    return 1.0 * significant / trials
def compare_alpha_diversities(rarefaction_lines, mapping_lines,
                              category, depth):
    """Compare alpha diversities between all pairs of values of a category.

    inputs:
        rarefaction_lines - lines of a rarefaction file (scores for
        various rarefactions and depths), handed to parse_rarefaction

        mapping_lines - lines of a mapping file (ID's and the categories
        the ID's fall in), handed to parse_mapping_file_to_dict

        category - the category to be compared, is a string

        depth - the depth of the rarefaction data to use, is an integer

    outputs:
        results - a nested dictionary which has the category as its only
        top level key; its value is a dictionary keyed by
        (str(value_a), str(value_b)) whose values are the results of
        t_two_sample for that pair of category values

    """
    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    value_pairs = make_value_pairs_from_category(mapping_data, category)

    category_values_Ids = make_category_values_Id_dict(mapping_data,
                                                       category)

    SampleId_pairs = map_category_value_pairs_to_Ids(value_pairs,
                                                    category_values_Ids)

    map_from_Id_to_col = make_SampleIds_rarefaction_columns_dict(
                                                       rarefaction_data)

    reduced_rarefaction_mtx = extract_rarefaction_scores_at_depth(depth,
                                                       rarefaction_data)

    comparisons = {}

    for idx, id_pair in enumerate(SampleId_pairs):
        # Flatten both score matrices: t_two_sample only operates on
        # non-nested sequences, a nested input gives the wrong degrees of
        # freedom.
        i = convert_SampleIds_to_rarefaction_mtx(id_pair[0],
                                                 reduced_rarefaction_mtx,
                                                 map_from_Id_to_col).flatten()

        j = convert_SampleIds_to_rarefaction_mtx(id_pair[1],
                                                 reduced_rarefaction_mtx,
                                                 map_from_Id_to_col).flatten()

        # value_pairs is indexed in step with SampleId_pairs
        values = value_pairs[idx]
        comparisons[(str(values[0]), str(values[1]))] = t_two_sample(i, j)

    return {category: comparisons}
Exemplo n.º 5
0
def t_test(nums1, nums2, col_num):
        """Run stats.t_two_sample on column col_num of two row collections.

        nums1, nums2 - sequences of rows; element col_num of every row is
                       used
        col_num      - index of the column to compare

        Prints 'fail' and returns None when the two collections differ in
        length. Otherwise prints and returns the (t, prob) pair.
        """
        if len(nums1) != len(nums2):
                print('fail')
                return
        w1 = [row[col_num] for row in nums1]
        w2 = [row[col_num] for row in nums2]

        t, prob = stats.t_two_sample(w1,w2)
        # single-argument print calls give the same output on Python 2 and 3
        print('results:')
        print('%s %s' % (t, prob))
        return t, prob
Exemplo n.º 6
0
    def assertSimilarMeans(self, observed, expected, pvalue=0.01, msg=None):
        """Fail unless the t-test p for observed vs expected exceeds pvalue.

        observed, expected - values to compare; both go through asarray
        pvalue - threshold; replaced by self._suite_pvalue when that is set
        msg - optional message for the raised failure

        When t_two_sample gives no usable p (None or a non-finite value),
        the first elements of observed and expected are compared instead
        and the assertion fails if they differ.

        Raises self.failureException on failure.
        """
        if self._suite_pvalue:
            pvalue = self._suite_pvalue

        observed, expected = asarray(observed), asarray(expected)

        t, p = t_two_sample(observed, expected)

        # None must be ruled out before the ordering comparison: None > x
        # is a TypeError on Python 3.
        if p is not None and p > pvalue:
            return

        if p is None or not isfinite(p): #handle case where all elements were the same
            # 0-d arrays cannot be indexed, give them a single axis first
            if not observed.shape:
                observed = observed.reshape((1,))
            if not expected.shape:
                expected = expected.reshape((1,))
            if observed[0] == expected[0]:
                return

        # Reached for a p at or below the threshold and also for an
        # undefined p with differing values, which used to pass silently.
        raise self.failureException(msg or 'p-value %s, t-test p %s' % (repr(pvalue), repr(p)))
Exemplo n.º 7
0
    def assertSimilarMeans(self, observed, expected, pvalue=0.01, msg=None):
        """Fail unless the t-test p for observed vs expected exceeds pvalue.

        observed, expected - values to compare; both go through asarray
        pvalue - threshold; replaced by self._suite_pvalue when that is set
        msg - optional message for the raised failure

        When t_two_sample gives no usable p (None or a non-finite value),
        the first elements of observed and expected are compared instead
        and the assertion fails if they differ.

        Raises self.failureException on failure.
        """
        if self._suite_pvalue:
            pvalue = self._suite_pvalue

        observed, expected = asarray(observed), asarray(expected)

        t, p = t_two_sample(observed, expected)

        # None must be ruled out before the ordering comparison: None > x
        # is a TypeError on Python 3.
        if p is not None and p > pvalue:
            return

        if p is None or not isfinite(p): #handle case where all elements were the same
            # 0-d arrays cannot be indexed, give them a single axis first
            if not observed.shape:
                observed = observed.reshape((1,))
            if not expected.shape:
                expected = expected.reshape((1,))
            if observed[0] == expected[0]:
                return

        # Reached for a p at or below the threshold and also for an
        # undefined p with differing values, which used to pass silently.
        # Call-style raise and repr() work on both Python 2 and Python 3.
        raise self.failureException(
            msg or 'p-value %s, t-test p %s' % (repr(pvalue), repr(p)))
Exemplo n.º 8
0
    fout.write(pstr + "\n")
    varcount += 1
fout.write("pdf(\"plot.bl_distribution.pdf" + "\", width=8, height=4)\n")
fout.write("plot(lengths0, proportions0, type='l',xlab=\"BLs, binned\", ylab=\"proportion\", col=\"" + colors[0].__str__() + "\", lwd='2', pch=" + pch[0].__str__() + ", main=\"BL Distribution\");\n")
for i in range(1, varcount):
    fout.write("points(lengths" + i.__str__() + ", proportions" + i.__str__() + ", type='l', col=\"" + colors[i].__str__() + "\", lwd='2', pch=" + pch[i].__str__() + ")\n")
fout.write("dev.off()\n")
fout.close()
os.system("r --no-save < " + scriptpath)
"""

#
# barplot
#
# NOTE(review): this excerpt starts mid-file; get_bin_count, paths, bins,
# barplot1, id, stats_about_bls and filepath_bls are defined outside the
# visible part.
#
# dataseries[binid][path] holds bins[path][binid], or 0.0 when bins[path]
# has no entry for that bin, so every bin has a value for every path.
dataseries = {}
for binid in range(0, get_bin_count()):
    dataseries[binid] = {}
    for path in paths:
        if bins[path].__contains__( binid ):
            dataseries[binid][path] = bins[path][binid]
        else:
            dataseries[binid][path] = 0.0
# NOTE(review): `id` is presumably a variable assigned earlier in the script;
# if it is not, this refers to the builtin function - confirm.
barplot1(dataseries, "BL bins", "proportion", "bl_distribution" + id.__str__())

# print stats
print "\n.\n. Stats about these ML branch length distributions:\n."
for path in paths:
    stats_about_bls( path, filepath_bls[path])
# two-sample t-test between the data of the first two paths only
[t, p] = stats.t_two_sample(filepath_bls[ paths[0] ], filepath_bls[ paths[1] ])
print "T = ", t, "P=", p
Exemplo n.º 9
0
 def test_t_two_sample_no_variance(self):
     """t_two_sample should give (None, None) for samples without variance"""
     ones = array([1, 1, 1])
     zeros = array([0, 0, 0])
     undefined = (None, None)
     # first against itself, then against a different constant sample
     for other in (ones, zeros):
         self.assertEqual(t_two_sample(ones, other), undefined)
Exemplo n.º 10
0
 def test_t_two_sample(self):
     """t_two_sample should reproduce the example on p.225 of Sokal and Rohlf"""
     first_group = array([7.2, 7.1, 9.1, 7.2, 7.3, 7.2, 7.5])
     second_group = array([8.8, 7.5, 7.7, 7.6, 7.4, 6.7, 7.2])
     expected = (-0.1184, 0.45385 * 2)
     # third positional argument of assertFloatEqual, as in the original call
     tolerance = 0.001
     observed = t_two_sample(first_group, second_group)
     self.assertFloatEqual(observed, expected, tolerance)
Exemplo n.º 11
0
def monte_carlo_group_distances_within_between(single_field, \
    paired_field, dmat, dir_prefix = '', \
    subdir_prefix='monte_carlo_group_distances',\
    num_iters=10):
    """Calculate Monte Carlo stats within and between fields.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    One tab-separated line per pair of groups is written to
    'group_distances_within_and_between.xls' in
    path.join(dir_prefix, subdir_prefix); that directory is created when it
    does not exist. Nothing is returned.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.
    """

    path_prefix = path.join(dir_prefix, subdir_prefix)
    #if dir doesn't exist
    if not path.isdir(path_prefix):
        # make directory
        mkdir(path_prefix)

    # every entry of real_dists is [comparison label, category, distances]
    real_dists = []
    within_category_distances = \
        within_category_distances_grouped(single_field,label_suffix='')
    real_dists.extend([['Within',field,distances] for field,\
        distances in within_category_distances.items()])

    between_category_distances = \
        between_category_distances_grouped(single_field,label_suffix='')
    real_dists.extend([['Between',field,distances] for field,\
        distances in between_category_distances.items()])

    within_and_between = \
        within_and_between_fields(paired_field)

    # keys are split once at the first '_' into comparison and category
    real_dists.extend([[field.split('_',1)[0],\
        field.split('_',1)[1],distances] for \
        field, distances in within_and_between.items()])

    out_path = path.join(path_prefix,
        'group_distances_within_and_between.xls')
    # the with-block closes (and flushes) the report even if a step raises
    with open(out_path, 'w') as outfile:
        outfile.write('\t'.join(['Comparison','Category_1','Avg',\
            'Comparison','Category_2','Avg','t','p',\
            'p_greater','p_less','Iterations\n']))

        rand_distances = get_random_dists(real_dists, dmat, num_iters)

        #iterate over the groups
        for i, (first_g1, second_g1, distances_g1) in \
            enumerate(real_dists[:-1]):
            real_dist_1 = average(distances_g1)
            rand_dists_1 = [rand_distances[n][i][-1] \
                for n in range(num_iters)]
            #then for each other pair (not including same group)
            for j in range(i + 1, len(real_dists)):
                first_g2, second_g2, distances_g2 = real_dists[j]
                real_dist_2 = average(distances_g2)
                rand_dists_2 = [rand_distances[n][j][-1] \
                    for n in range(num_iters)]
                ttests = [t_two_sample(rand_dists_1[n],rand_dists_2[n])[0] \
                    for n in range(num_iters)]
                real_ttest = t_two_sample(distances_g1, distances_g2)
                # build the array once; it is compared twice below
                rand_ts = array(ttests)
                curr_line = [first_g1, second_g1, real_dist_1, \
                    first_g2, second_g2, real_dist_2]
                # fractions of randomized t values above / below the real t
                curr_line.extend([real_ttest[0], real_ttest[1],\
                    (rand_ts>real_ttest[0]).sum()/float(num_iters), \
                    (rand_ts<real_ttest[0]).sum()/float(num_iters), \
                    num_iters])
                outfile.write('\t'.join(map(str, curr_line)))
                outfile.write('\n')
Exemplo n.º 12
0
def monte_carlo_group_distances(mapping_file, dmatrix_file, prefs, \
    dir_prefix = '', subdir_prefix='monte_carlo_group_distances',\
    default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    mapping_file and dmatrix_file are paths that are opened here. For every
    field in prefs['MONTE_CARLO_GROUP_DISTANCES'] a tab-separated file
    'group_distances_<field>.xls' is written to
    path.join(dir_prefix, subdir_prefix), which is created when missing.
    When prefs is None or lacks that key, prefs is built with
    build_monte_carlo_prefs(fields, default_iters); fields defaults to the
    first header entry of the mapping file. Nothing is returned.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.
    """
    # Input handles are closed as soon as parsing returns.
    # NOTE(review): assumes both parsers return materialized data rather
    # than lazy iterators over the handle - confirm.
    with open(mapping_file, 'U') as mapping_f:
        mapping, header, comments = parse_mapping_file(mapping_f)
    # prepend the header row so that mapping[0] is the header
    header = [header]
    header.extend(mapping)
    mapping = header

    with open(dmatrix_file,'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    path_prefix = path.join(dir_prefix, subdir_prefix)

    #if dir doesn't exist
    if not path.isdir(path_prefix):
        # make directory
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]

    if prefs is None:
        prefs = {}

    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields, default_iters)

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        # '&&' joins several fields that are grouped on together
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)
        out_path = path.join(path_prefix, 'group_distances_' + field + '.xls')
        # the with-block closes the report even if a step raises
        with open(out_path, 'w') as outfile:
            outfile.write('\t'.join(['Category_1a','Category_1b','Avg',\
                'Category_2a','Category_2b','Avg','t','p',\
                'p_greater','p_less','Iterations\n']))
            real_dists = distances_by_groups(distance_header, \
                distance_matrix, groups)

            #iterate over the groups
            for i, (first_g1, second_g1, distances_g1) in \
                enumerate(real_dists[:-1]):

                real_dist_1 = average(distances_g1)

                #then for each other pair (not including same group)
                for j in range(i + 1, len(real_dists)):
                    first_g2, second_g2, distances_g2 = real_dists[j]

                    real_dist_2 = average(distances_g2)

                    # permute distances just within these groups!
                    rand_dists_1, rand_dists_2 = \
                            permute_between_groups(distances_g1,
                                                   distances_g2,
                                                   num_iters)

                    ttests = [t_two_sample(rand_dists_1[n].flatten(), \
                        rand_dists_2[n].flatten())[0] \
                        for n in range(num_iters)]
                    real_ttest = t_two_sample(distances_g1.flatten(),
                                              distances_g2.flatten())
                    # build the array once; it is compared twice below
                    rand_ts = array(ttests)
                    curr_line = [first_g1, second_g1, real_dist_1, \
                        first_g2, second_g2, real_dist_2]
                    # fractions of permuted t values above / below the real t
                    curr_line.extend([real_ttest[0], real_ttest[1],\
                        (rand_ts>real_ttest[0]).sum()/float(num_iters), \
                        (rand_ts<real_ttest[0]).sum()/float(num_iters), \
                        num_iters])
                    outfile.write('\t'.join(map(str, curr_line)))
                    outfile.write('\n')
Exemplo n.º 13
0
def t_test(nums1, nums2):
    """Print the t statistic and the probability from stats.t_two_sample.

    Both values are printed on their own line; nothing is returned.
    """
    t, prob = stats.t_two_sample(nums1,nums2)
    # single-argument print calls give the same output on Python 2 and 3
    print(t)
    print(prob)
Exemplo n.º 14
0
 def test_t_two_sample(self):
     """t_two_sample should agree with the example on p.225 of Sokal and Rohlf"""
     sample_one = array([7.2, 7.1, 9.1, 7.2, 7.3, 7.2, 7.5])
     sample_two = array([8.8, 7.5, 7.7, 7.6, 7.4, 6.7, 7.2])
     wanted = (-0.1184, 0.45385 * 2)
     result = t_two_sample(sample_one, sample_two)
     # 0.001 is handed over positionally, exactly as before
     self.assertFloatEqual(result, wanted, 0.001)
Exemplo n.º 15
0
def monte_carlo_group_distances(mapping_file, dmatrix_file, prefs, \
    dir_prefix = '', subdir_prefix='monte_carlo_group_distances',\
    default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    mapping_file and dmatrix_file are paths that are opened here. For every
    field in prefs['MONTE_CARLO_GROUP_DISTANCES'] a tab-separated file
    'group_distances_<field>.txt' is written to
    path.join(dir_prefix, subdir_prefix), which is created when missing.
    When prefs is None or lacks that key, prefs is built with
    build_monte_carlo_prefs(fields, default_iters); fields defaults to the
    first header entry of the mapping file. Nothing is returned.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.
    """
    # Input handles are closed as soon as parsing returns.
    # NOTE(review): assumes both parsers return materialized data rather
    # than lazy iterators over the handle - confirm.
    with open(mapping_file,'U') as mapping_f:
        mapping, header, comments = parse_mapping_file(mapping_f)
    # prepend the header row so that mapping[0] is the header
    header = [header]
    header.extend(mapping)
    mapping=header

    with open(dmatrix_file,'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    path_prefix = path.join(dir_prefix,subdir_prefix)

    #if dir doesn't exist
    if not path.isdir(path_prefix):
        # make directory
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]

    if prefs is None:
        prefs = {}

    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields,default_iters)

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        # '&&' joins several fields that are grouped on together
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)
        out_path = path.join(path_prefix,
                             'group_distances_'+field+'.txt')
        # the with-block closes the report even if a step raises
        with open(out_path, 'w') as outfile:
            outfile.write('\t'.join(['Category_1a','Category_1b','Avg',\
                'Category_2a','Category_2b','Avg','t','p',\
                'p_greater','p_less','Iterations\n']))
            real_dists = distances_by_groups(distance_header, \
                distance_matrix, groups)

            #iterate over the groups
            for i, (first_g1, second_g1, distances_g1) in \
                enumerate(real_dists[:-1]):

                real_dist_1 = average(distances_g1)

                #then for each other pair (not including same group)
                for j in range(i+1,len(real_dists)):
                    first_g2, second_g2, distances_g2 = real_dists[j]

                    real_dist_2 = average(distances_g2)

                    # permute distances just within these groups!
                    rand_dists_1, rand_dists_2 = \
                            permute_between_groups(distances_g1,
                                                   distances_g2,
                                                   num_iters)

                    ttests = [t_two_sample(rand_dists_1[n].flatten(), \
                        rand_dists_2[n].flatten())[0] \
                        for n in range(num_iters)]
                    real_ttest = t_two_sample(distances_g1.flatten(),
                                              distances_g2.flatten())
                    # build the array once; it is compared twice below
                    rand_ts = array(ttests)
                    curr_line = [first_g1, second_g1, real_dist_1, \
                        first_g2, second_g2, real_dist_2]
                    # fractions of permuted t values above / below the real t
                    curr_line.extend([real_ttest[0], real_ttest[1],\
                        (rand_ts>real_ttest[0]).sum()/float(num_iters), \
                        (rand_ts<real_ttest[0]).sum()/float(num_iters), \
                        num_iters])
                    outfile.write('\t'.join(map(str, curr_line)))
                    outfile.write('\n')
Exemplo n.º 16
0
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category, depth,
    test_type='nonparametric', num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.
    Notes:
     Returns a dict which as keys has the pairs of treatments being
     compared, formatted as 'treatment_a,treatment_b', and as values
     (obs_t, p_val) tuples. The value is (None, None) when both treatments
     have a single sample, when a nan is found in the scores, or when the
     nonparametric test gives no p-value.
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use.
     test_type - str, the type of t-test to perform. Must be either
     'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
     test_type is 'nonparametric'.
    Raises:
     ValueError - if test_type is 'nonparametric' and num_permutations < 1,
     or if a comparison is reached with an unknown test_type.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
        rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    rare_mat = array([row for row in rarefaction_data[3] if row[0]==depth])

    # Average each col of the rarefaction mtx. Computing t test on averages
    # over all iterations. Avoids more comps which kills significance.
    rare_mat = (rare_mat.sum(0)/rare_mat.shape[0])[2:] #remove depth,iter cols
    sids = rarefaction_data[0][3:] # 0-2 are header strings
    results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        t_key = '%s,%s' % (treatment_pair[0], treatment_pair[1])
        # if there is only 1 sample for each treatment in a comparison, the
        # mc method will error (e.g. mc_t_two_sample([1],[1])).
        if len(sid_pair[0])==1 and len(sid_pair[1])==1:
            results[t_key] = (None, None)
            continue

        pair0_indices = [sids.index(i) for i in sid_pair[0]]
        pair1_indices = [sids.index(i) for i in sid_pair[1]]
        i = rare_mat.take(pair0_indices)
        j = rare_mat.take(pair1_indices)
        # found discussion of how to quickly check an array for nan here:
        # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
        if isnan(np_min(i)) or isnan(np_min(j)):
            results[t_key] = (None, None)
            continue

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i,j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(i,j,
                permutations=num_permutations)
            if p_val is None:
                # None will error in format_p_value_for_num_iters
                obs_t = None
            else:
                p_val = float(format_p_value_for_num_iters(p_val,
                    num_iters=num_permutations))
        else:
            raise ValueError("Invalid test type '%s'." % test_type)
        results[t_key] = (obs_t, p_val)
    return results
Exemplo n.º 17
0
def monte_carlo_group_distances(mapping_file, dmatrix_file, prefs, \
    dir_prefix = '', subdir_prefix='monte_carlo_group_distances',\
    default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    mapping_file and dmatrix_file are paths that are opened here. For every
    field in prefs['MONTE_CARLO_GROUP_DISTANCES'] a tab-separated file is
    written to path_prefix + 'group_distances_<field>.xls', where
    path_prefix comes from _make_path([dir_prefix, subdir_prefix]) and is
    created when it is not an existing directory. When prefs is None or
    lacks that key, prefs is built with
    build_monte_carlo_prefs(fields, default_iters); fields defaults to the
    first header entry of the mapping file. Nothing is returned.
    """
    # Input handles are closed as soon as parsing returns.
    # NOTE(review): assumes both parsers return materialized data rather
    # than lazy iterators over the handle - confirm.
    with open(mapping_file, 'U') as mapping_f:
        mapping, header, comments = parse_mapping_file(mapping_f)
    # prepend the header row so that mapping[0] is the header
    header = [header]
    header.extend(mapping)
    mapping = header

    with open(dmatrix_file,'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    path_prefix = _make_path([dir_prefix, subdir_prefix])

    #if dir doesn't exist
    if not path.isdir(path_prefix):
        # make directory
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]

    if prefs is None:
        prefs = {}

    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields, default_iters)

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        # '&&' joins several fields that are grouped on together
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)
        # NOTE(review): plain concatenation, so path_prefix presumably
        # already ends with a path separator - confirm against _make_path.
        out_path = path_prefix + 'group_distances_' + field + '.xls'
        # the with-block closes the report even if a step raises
        with open(out_path, 'w') as outfile:
            outfile.write('\t'.join(['Category_1a','Category_1b','Avg',\
                'Category_2a','Category_2b','Avg','t','p',\
                'p_greater','p_less','Iterations\n']))
            real_dists = distances_by_groups(distance_header, \
                distance_matrix, groups)
            # one set of grouped distances per permuted matrix
            rand_distances = [distances_by_groups(distance_header, \
                permute_for_monte_carlo(distance_matrix), groups) \
                for i in range(num_iters)]

            #iterate over the groups
            for i, (first_g1, second_g1, distances_g1) in \
                enumerate(real_dists[:-1]):

                real_dist_1 = average(distances_g1)
                rand_dists_1 = [rand_distances[n][i][-1] \
                    for n in range(num_iters)]
                #then for each other pair (not including same group)
                for j in range(i + 1, len(real_dists)):
                    first_g2, second_g2, distances_g2 = real_dists[j]

                    real_dist_2 = average(distances_g2)
                    rand_dists_2 = [rand_distances[n][j][-1] \
                        for n in range(num_iters)]
                    ttests = [t_two_sample(rand_dists_1[n], \
                        rand_dists_2[n])[0] for n in range(num_iters)]
                    real_ttest = t_two_sample(distances_g1, distances_g2)
                    # build the array once; it is compared twice below
                    rand_ts = array(ttests)
                    curr_line = [first_g1, second_g1, real_dist_1, \
                        first_g2, second_g2, real_dist_2]
                    # fractions of permuted t values above / below the real t
                    curr_line.extend([real_ttest[0], real_ttest[1],\
                        (rand_ts>real_ttest[0]).sum()/float(num_iters), \
                        (rand_ts<real_ttest[0]).sum()/float(num_iters), \
                        num_iters])
                    outfile.write('\t'.join(map(str, curr_line)))
                    outfile.write('\n')
Exemplo n.º 18
0
 def test_t_two_sample_no_variance(self):
     """t_two_sample should return (None, None) when the samples never vary"""
     all_ones = array([1, 1, 1])
     all_zeros = array([0, 0, 0])
     no_result = (None, None)
     # identical constant samples
     same = t_two_sample(all_ones, all_ones)
     self.assertEqual(same, no_result)
     # constant samples with different values
     different = t_two_sample(all_ones, all_zeros)
     self.assertEqual(different, no_result)
Exemplo n.º 19
0
def monte_carlo_group_distances_within_between(single_field, \
    paired_field, dmat, dir_prefix = '', \
    subdir_prefix='monte_carlo_group_distances',\
    num_iters=10):
    """Calculate Monte Carlo stats within and between fields.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    One tab-separated line per pair of groups is written to
    'group_distances_within_and_between.txt' in
    path.join(dir_prefix, subdir_prefix); that directory is created when it
    does not exist. Nothing is returned.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.
    """

    path_prefix = path.join(dir_prefix,subdir_prefix)
    #if dir doesn't exist
    if not path.isdir(path_prefix):
        # make directory
        mkdir(path_prefix)

    # every entry of real_dists is [comparison label, category, distances]
    real_dists = []
    within_category_distances = \
        within_category_distances_grouped(single_field,label_suffix='')
    real_dists.extend([['Within',field,distances] for field,\
        distances in within_category_distances.items()])

    between_category_distances = \
        between_category_distances_grouped(single_field,label_suffix='')
    real_dists.extend([['Between',field,distances] for field,\
        distances in between_category_distances.items()])

    within_and_between = \
        within_and_between_fields(paired_field)

    # keys are split once at the first '_' into comparison and category
    real_dists.extend([[field.split('_',1)[0],\
        field.split('_',1)[1],distances] for \
        field, distances in within_and_between.items()])

    out_path = path.join(path_prefix,
                         'group_distances_within_and_between.txt')
    # the with-block closes (and flushes) the report even if a step raises
    with open(out_path, 'w') as outfile:
        outfile.write('\t'.join(['Comparison','Category_1','Avg',\
            'Comparison','Category_2','Avg','t','p',\
            'p_greater','p_less','Iterations\n']))

        rand_distances = get_random_dists(real_dists, dmat, num_iters)

        #iterate over the groups
        for i, (first_g1, second_g1, distances_g1) in \
            enumerate(real_dists[:-1]):
            real_dist_1 = average(distances_g1)
            rand_dists_1 = [rand_distances[n][i][-1] \
                for n in range(num_iters)]
            #then for each other pair (not including same group)
            for j in range(i+1,len(real_dists)):
                first_g2, second_g2, distances_g2 = real_dists[j]
                real_dist_2 = average(distances_g2)
                rand_dists_2 = [rand_distances[n][j][-1] \
                    for n in range(num_iters)]
                ttests = [t_two_sample(rand_dists_1[n],rand_dists_2[n])[0] \
                    for n in range(num_iters)]
                real_ttest = t_two_sample(distances_g1, distances_g2)
                # build the array once; it is compared twice below
                rand_ts = array(ttests)
                curr_line = [first_g1, second_g1, real_dist_1, \
                    first_g2, second_g2, real_dist_2]
                # fractions of randomized t values above / below the real t
                curr_line.extend([real_ttest[0], real_ttest[1],\
                    (rand_ts>real_ttest[0]).sum()/float(num_iters), \
                    (rand_ts<real_ttest[0]).sum()/float(num_iters), \
                    num_iters])
                outfile.write('\t'.join(map(str, curr_line)))
                outfile.write('\n')
Exemplo n.º 20
0
def compare_alpha_diversities(rarefaction_lines, mapping_lines,
                              category, depth, test_type='nonparametric',
                              num_permutations=999):
    """compares alpha diversities

    inputs:
        rarefaction_lines - lines of a rarefaction file which gives scores
        for various rarefactions and depths

        mapping_lines - lines of a mapping file that has ID's and
        categories that the ID's fall in

        category - the category to be compared, is a string

        depth - the depth of the rarefaction data to use, is an integer

        test_type - the type of t-test to perform, is a string. Must be either
        'parametric' or 'nonparametric'

        num_permutations - the number of Monte Carlo permutations to use if
        test_type is 'nonparametric', is an integer

    outputs:
        results - a nested dictionary which specifies the category as
        the top level key, and as its value, a dictionary keyed by
        (str(value_a), str(value_b)) which gives the (obs_t, p_val) result
        of the chosen test for all unique pairs of values in the specified
        category. For the nonparametric test p_val is the output of
        format_p_value_for_num_iters, or None when the test gives no
        p-value.

    raises:
        ValueError - if test_type is 'nonparametric' and
        num_permutations < 1, or if a comparison is reached with an
        unknown test_type

    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    value_pairs = make_value_pairs_from_category(mapping_data, category)

    category_values_Ids = make_category_values_Id_dict(mapping_data,
                                                       category)

    SampleId_pairs = map_category_value_pairs_to_Ids(value_pairs,
                                                    category_values_Ids)

    map_from_Id_to_col = make_SampleIds_rarefaction_columns_dict(
                                                       rarefaction_data)

    reduced_rarefaction_mtx = extract_rarefaction_scores_at_depth(depth,
                                                       rarefaction_data)

    results = {category:{}}

    for idx, id_pair in enumerate(SampleId_pairs):
        # Must flatten the matrix because t_two_sample only operates on
        # non-nested sequences (otherwise we'll get the wrong degrees of
        # freedom).
        i = convert_SampleIds_to_rarefaction_mtx(id_pair[0],
                                                 reduced_rarefaction_mtx,
                                                 map_from_Id_to_col).flatten()

        j = convert_SampleIds_to_rarefaction_mtx(id_pair[1],
                                                 reduced_rarefaction_mtx,
                                                 map_from_Id_to_col).flatten()

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i,j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(i,j,
                                                 permutations=num_permutations)
            # a None p-value cannot be formatted, so it is passed through
            if p_val is not None:
                p_val = format_p_value_for_num_iters(p_val, num_permutations)
        else:
            raise ValueError("Invalid test type '%s'." % test_type)

        # value_pairs is indexed in step with SampleId_pairs
        values = value_pairs[idx]
        results[category][(str(values[0]), str(values[1]))] = obs_t, p_val
    return results