def rank_test(data, value="Yes", tails="high"):
    """Split observations by label and compare the two groups with a t-test.

    Inputs:
     data - iterable of indexable records. Item 0 of each record is the
      observation, item 1 is its label. The input is not modified.
     value - label that selects the first group; records carrying any other
      label are placed in the second group.
     tails - passed through unchanged to mc_t_two_sample.

    Returns:
     (t, parametric_p, non_parametric_p, v_dist, non_v_dist) where the first
     three come from mc_t_two_sample and v_dist / non_v_dist are the
     observations of each group, in sorted record order.
    """
    v_dist = []
    non_v_dist = []
    # sorted() iterates over a sorted copy, so the caller's sequence keeps its
    # original order (list.sort() used to reorder it in place as a hidden
    # side effect) and any iterable is accepted, not only lists.
    for d in sorted(data):
        if d[1] == value:
            v_dist.append(d[0])
        else:
            non_v_dist.append(d[0])
    # The third return value of mc_t_two_sample is not used here.
    t, parametric_p, _, non_parametric_p = mc_t_two_sample(v_dist, non_v_dist,
                                                           tails=tails)
    return t, parametric_p, non_parametric_p, v_dist, non_v_dist
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth, test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.

    Notes:
     Returns a dict whose keys are 'treatment1,treatment2' strings, one per
     pair of treatments being compared, and whose values are (obs_t, p_val)
     tuples. The value is (None, None) when the comparison is skipped: both
     treatments have exactly one sample, the averaged values for either
     treatment contain nan, or the nonparametric test yields no p-value.
    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use.
     test_type - str, the type of t-test to perform. Must be either
      'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
      test_type is 'nonparametric'.
    Raises:
     ValueError - if test_type is not one of the two supported values, if
      num_permutations is less than 1 for a nonparametric test, or if the
      rarefaction data has no rows at the requested depth.
    """
    # Validate arguments before any parsing so that a bad test_type is always
    # reported, even when every comparison would end up being skipped.
    if test_type not in ('parametric', 'nonparametric'):
        raise ValueError("Invalid test type '%s'." % test_type)
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    rows_at_depth = [row for row in rarefaction_data[3] if row[0] == depth]
    if not rows_at_depth:
        # Without this guard the averaging below divides by zero and then
        # fails with an unhelpful IndexError on a scalar.
        raise ValueError("No rarefaction data found at depth %s." % depth)
    rare_mat = array(rows_at_depth)

    # Average each col of the rarefaction mtx. Computing t test on averages
    # over all iterations. Avoids more comps which kills significance.
    rare_mat = (rare_mat.sum(0) / rare_mat.shape[0])[2:]  # remove depth,iter cols
    sids = rarefaction_data[0][3:]  # 0-2 are header strings

    results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        t_key = '%s,%s' % (treatment_pair[0], treatment_pair[1])

        # if there is only 1 sample for each treatment in a comparison, the
        # mc method will error (e.g. mc_t_two_sample([1],[1])), so skip it.
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            results[t_key] = (None, None)
            continue

        pair0_indices = [sids.index(i) for i in sid_pair[0]]
        pair1_indices = [sids.index(i) for i in sid_pair[1]]
        i = rare_mat.take(pair0_indices)
        j = rare_mat.take(pair1_indices)

        # found discussion of how to quickly check an array for nan here:
        # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
        if isnan(np_min(i)) or isnan(np_min(j)):
            results[t_key] = (None, None)
            continue

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        else:  # 'nonparametric' (validated above)
            obs_t, _, _, p_val = mc_t_two_sample(
                i, j, permutations=num_permutations)
            if p_val is None:
                # None will error in format_p_value_for_num_iters
                obs_t = None
            else:
                p_val = float(format_p_value_for_num_iters(
                    p_val, num_iters=num_permutations))
        results[t_key] = (obs_t, p_val)
    return results
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth, test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversities between all pairs of values in a category.

    inputs:
     rarefaction_lines - lines of a rarefaction file, which gives scores for
      various rarefactions and depths
     mapping_lines - lines of a mapping file, which has ID's and the
      categories that the ID's fall in
     category - the category to be compared, is a string
     depth - the depth of the rarefaction file to use, is an integer
     test_type - the type of t-test to perform, is a string. Must be either
      'parametric' or 'nonparametric'
     num_permutations - the number of Monte Carlo permutations to use if
      test_type is 'nonparametric', is an integer
    outputs:
     results - a nested dictionary which has the category as its only top
      level key. Its value is a dictionary mapping each
      (str(value1), str(value2)) pair of category values to the
      (obs_t, p_val) result of the selected test. For the nonparametric
      test, p_val is the output of format_p_value_for_num_iters.
    raises:
     ValueError - if test_type is not one of the two supported values, or if
      num_permutations is less than 1 for a nonparametric test.
    """
    # Validate arguments up front so an unsupported test_type is rejected
    # before any parsing, and even when there are no pairs to compare.
    if test_type not in ('parametric', 'nonparametric'):
        raise ValueError("Invalid test type '%s'." % test_type)
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    value_pairs = make_value_pairs_from_category(mapping_data, category)
    category_values_Ids = make_category_values_Id_dict(mapping_data, category)
    SampleId_pairs = map_category_value_pairs_to_Ids(value_pairs,
                                                     category_values_Ids)
    map_from_Id_to_col = make_SampleIds_rarefaction_columns_dict(
        rarefaction_data)
    reduced_rarefaction_mtx = extract_rarefaction_scores_at_depth(
        depth, rarefaction_data)

    pair_results = {}
    for idx, id_pair in enumerate(SampleId_pairs):
        # Must flatten the matrix because t_two_sample only operates on
        # non-nested sequences (otherwise we'll get the wrong degrees of
        # freedom).
        i = convert_SampleIds_to_rarefaction_mtx(
            id_pair[0], reduced_rarefaction_mtx, map_from_Id_to_col).flatten()
        j = convert_SampleIds_to_rarefaction_mtx(
            id_pair[1], reduced_rarefaction_mtx, map_from_Id_to_col).flatten()

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        else:  # 'nonparametric' (validated above)
            obs_t, _, _, p_val = mc_t_two_sample(
                i, j, permutations=num_permutations)
            p_val = format_p_value_for_num_iters(p_val, num_permutations)

        # value_pairs is indexed in step with SampleId_pairs.
        value_pair = value_pairs[idx]
        pair_results[(str(value_pair[0]), str(value_pair[1]))] = obs_t, p_val
    return {category: pair_results}