def setUp(self):
    """Build the rarefaction and mapping fixtures used by the tests.

    The fixture is rebuilt before every test; that is a little redundant,
    but there are few tests so the cost is not a concern.
    """
    # Collated alpha diversity lines: one header row, then one row per
    # (rarefaction file, depth, iteration) with a value for each sample.
    rarefaction_lines = [
        "\tsequences per sample\titeration\tSam1\tSam2\tSam3\tSam4\tSam5\tSam6",
        "rare480.txt\t480\t0\t2.52800404052\t2.3614611247\t2.59867416108\t3.56970811181\t3.44800265895\t1.9433560517",
        "rare480.txt\t480\t1\t2.06375457238\t3.32293450758\t3.4189896645\t3.35312890712\t3.10763472113\t2.78155253726",
        "rare480.txt\t480\t2\t2.44788730109\t3.42464996459\t2.24541787295\t2.491419231\t2.60106690099\t5.40828403581",
        "rare480.txt\t480\t3\t5.1846120153\t3.67022675065\t1.54879964908\t2.8055801405\t4.3086171269\t3.87761898868",
        "rare910.txt\t910\t0\t2.67580703282\t1.72405794627\t2.15312863498\t2.4300954476\t3.7753658185\t3.36198860355",
        "rare910.txt\t910\t1\t4.10226466956\t2.24587945345\t3.02932964779\t2.98218513619\t3.73316846484\t1.85879566537",
        "rare910.txt\t910\t2\t1.65800670063\t2.42281993323\t3.02400997565\t3.271608097\t2.99265263795\t3.68802382515",
        "rare910.txt\t910\t3\t2.50976021964\t2.43976761056\t3.32119905587\t2.47487750248\t1.901408525\t3.42883742207",
        "rare500.txt\t500\t0\t3.42225118215\tn/a\t4.03758268426\t2.35344629448\t2.26690085385\t1.80164570104",
        "rare850.txt\t850\t0\t4.2389858006\t4.97464230229\t1.53451087057\t3.35785261181\t1.91658777533\t2.32583475424",
        "rare850.txt\t850\t1\t2.81445883827\tn/a\t2.54767461948\t1.38835207925\t3.70018890199\t1.57359105209",
        "rare850.txt\t850\t2\t2.9340493412\t3.95897035158\tn/a\t2.07761860166\t3.42393336685\t2.6927305603",
    ]
    self.rarefaction_file = rarefaction_lines
    self.rarefaction_data = parse_rarefaction(rarefaction_lines)

    # Mapping file lines: header, one comment line, then one row per sample.
    mapping_lines = [
        "#SampleID\tDose\tLinkerPrimerSequence\tWeight\tTTD\tDescription",
        "#Comment Line",
        "Sam1\t1xDose\tATCG\tHigh\t31\ts1_desc",
        "Sam2\t1xDose\tACCG\tLow\t67\ts2_desc",
        "Sam3\t2xDose\tACGT\tMed\t21\ts3_desc",
        "Sam4\t2xDose\tAACG\tLow\t55\ts4_desc",
        "Sam5\tControl\tCGTC\tLow\t67\ts5_desc",
        "Sam6\t1xDose\tACCT\tLow\t55\ts6_desc",
    ]
    self.mapping_file = mapping_lines
    # Only the first element of the parser's result is kept.
    parsed_mapping = parse_mapping_file_to_dict(mapping_lines)
    self.mapping_data = parsed_mapping[0]
def generate_alpha_diversity_boxplots(rarefaction_lines,
                                      mapping_lines,
                                      category,
                                      depth=None):
    """Generate box plots of average alpha diversity per category value.

    Inputs:
     rarefaction_lines - lines of a collated alpha diversity file, passed to
      parse_rarefaction.
     mapping_lines - mapping file lines, used to group sample ids by their
      value in ``category``.
     category - str, the mapping file category to group samples by.
     depth - rarefaction depth forwarded to
      get_per_sample_average_diversities (defaults to None).

    Returns whatever generate_box_plots returns for the collected
    distributions. One distribution is produced per category value, ordered
    by category value, and each tick label has the form '<value> (n=<count>)'.
    """
    rarefaction_data = parse_rarefaction(rarefaction_lines)

    category_value_to_sample_ids = \
        get_category_value_to_sample_ids(mapping_lines, category)

    # NOTE(review): compare_alpha_diversities in this file calls
    # get_per_sample_average_diversities(rarefaction_data, depth) without the
    # category argument; confirm which call matches the helper's signature.
    per_sample_average_diversities = \
        get_per_sample_average_diversities(rarefaction_data,
                                           category,
                                           depth)

    per_category_value_average_diversities = \
        collapse_sample_diversities_by_category_value(
            category_value_to_sample_ids,
            per_sample_average_diversities)

    # Sort the data alphabetically by category value. sorted() is used rather
    # than items() followed by .sort(), which only works when items() returns
    # a list (Python 2); dict views on Python 3 have no sort method.
    sorted_per_category_value_average_diversities = \
        sorted(per_category_value_average_diversities.items())

    x_tick_labels = []
    distributions = []
    for cat, avg_diversities in sorted_per_category_value_average_diversities:
        x_tick_labels.append("%s (n=%d)" % (cat, len(avg_diversities)))
        distributions.append(avg_diversities)

    return generate_box_plots(distributions, x_tick_labels=x_tick_labels)
def compare_alpha_diversities(rarefaction_lines, mapping_lines,
                              category, depth):
    """Run a two-sample t-test for every pair of values in a category.

    inputs:
        rarefaction_lines - lines of a rarefaction file giving scores for
            various rarefactions and depths
        mapping_lines - lines of the mapping file that assigns sample ids to
            category values
        category - str, the category whose values are compared
        depth - int, the rarefaction depth whose scores are used
    outputs:
        results - nested dict: the category is the only top-level key and its
            value maps each (value_a, value_b) pair of strings to the result
            of t_two_sample for that pair
    """
    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]

    value_pairs = make_value_pairs_from_category(mapping_data, category)
    ids_by_value = make_category_values_Id_dict(mapping_data, category)
    id_pairs = map_category_value_pairs_to_Ids(value_pairs, ids_by_value)

    id_to_column = make_SampleIds_rarefaction_columns_dict(rarefaction_data)
    scores_at_depth = extract_rarefaction_scores_at_depth(depth,
                                                          rarefaction_data)

    per_pair = {}
    for position, id_pair in enumerate(id_pairs):
        first_scores = convert_SampleIds_to_rarefaction_mtx(
            id_pair[0], scores_at_depth, id_to_column)
        second_scores = convert_SampleIds_to_rarefaction_mtx(
            id_pair[1], scores_at_depth, id_to_column)
        outcome = t_two_sample(first_scores, second_scores)

        # value_pairs is index-aligned with id_pairs.
        labels = value_pairs[position]
        per_pair[(str(labels[0]), str(labels[1]))] = outcome

    return {category: per_pair}
def mean_alpha(alpha_dict, depth):
    """Average collated alpha diversity data at a given depth.

    Input:
    alpha_dict: dictionary where the values are the lines of collated alpha
    diversity data files and the keys are the names of each of these files
    with no extension; this name is usually the metric used to compute the
    alpha diversity.
    depth: int, the depth at which the alpha diversity values are averaged.

    Output:
    metrics: list of metric names, formatted as '<key>_even_<depth>'.
    sample_ids: list of sample identifiers represented.
    data: a list of lists with the mean of the alpha diversity data at the
    given depth; each column is a different metric.

    Raises:
    AssertionError: if alpha_dict is not a dict or depth is not a
    non-negative int.
    ValueError: if the files do not share the same sample ids in the same
    order, or if a file has no rows at the requested depth.
    """
    assert isinstance(alpha_dict, dict), "Input data must be a dictionary"
    # Check the type first so a non-numeric depth fails the assertion instead
    # of raising from the comparison.
    assert isinstance(depth, int) and depth >= 0, (
        "The specified depth must be a positive integer.")

    metrics = []
    sample_ids = []
    data = []

    # items() works for dicts on both Python 2 and Python 3.
    for key, value in alpha_dict.items():
        metrics.append('{0}_even_{1}'.format(key, depth))
        identifiers, _, _, rarefaction_data = parse_rarefaction(value)

        # check all the files have the same sample ids in the same order; the
        # first three identifiers are header columns, not sample ids
        if sample_ids:
            if sample_ids != identifiers[3:]:
                # Raise an exception instance; raising a tuple drops the
                # message (Python 2) or is itself a TypeError (Python 3).
                raise ValueError(
                    "Non-matching sample ids were found in the "
                    "collated alpha diversity files. Make sure all the files "
                    "contain data for the same samples.")
        else:
            sample_ids = identifiers[3:]

        # find all the data at the desired depth, removing the first two
        # elements ([depth, iteration]) as those are not needed
        rows_at_depth = [row[2:] for row in rarefaction_data
                         if row[0] == depth]
        if not rows_at_depth:
            # Averaging zero rows would silently yield NaN.
            raise ValueError(
                "The depth %d does not exist in the collated alpha "
                "diversity file for the metric: %s." % (depth, key))
        data.append(array(rows_at_depth).mean(axis=0))

    # transpose the data to match the formatting of non-collated alpha div data
    data = array(data).T.tolist()

    return metrics, sample_ids, data
def _collect_alpha_diversity_boxplot_data(rarefaction_f, metadata_map,
                                          rarefaction_depth, split_category,
                                          comparison_category):
    """Group alpha diversity values at one depth by two metadata categories.

    Returns (x_tick_labels, dists): labels of the form
    '<split value> (<comparison value>)' in sorted order, and the list of
    alpha diversity values belonging to each label. NaN values are skipped.

    Raises ValueError if no row of the rarefaction file is at
    rarefaction_depth.
    """
    parsed = parse_rarefaction(rarefaction_f)
    header, rows = parsed[0], parsed[3]

    # The first three header entries are not sample ids.
    sample_ids = header[3:]

    # Keep rows at the requested depth, dropping their leading depth and
    # iteration columns.
    rows_at_depth = []
    for row in rows:
        if row[0] == rarefaction_depth:
            rows_at_depth.append(row[2:])

    if len(rows_at_depth) == 0:
        raise ValueError("Rarefaction depth of %d could not be found in "
                         "collated alpha diversity file." % rarefaction_depth)

    # (split value, comparison value) -> list of alpha diversity values.
    grouped = defaultdict(list)
    for values in rows_at_depth:
        assert len(sample_ids) == len(values)
        for sample_id, adiv_val in zip(sample_ids, values):
            if isnan(adiv_val):
                continue
            split_cat_val = metadata_map.getCategoryValue(sample_id,
                                                          split_category)
            comp_cat_val = metadata_map.getCategoryValue(sample_id,
                                                         comparison_category)
            grouped[split_cat_val, comp_cat_val].append(adiv_val)

    # Turn each key into its tick label and order the pairs alphabetically.
    labelled = sorted(('%s (%s)' % (key[0], key[1]), dist)
                      for key, dist in grouped.items())

    x_tick_labels = [label for label, _ in labelled]
    dists = [dist for _, dist in labelled]
    return x_tick_labels, dists
def setUp(self):
    """Set up the shared fixture values for the tests in this case."""
    # Plot inputs for a single sample.
    self.data = {
        "xaxis": [10.0],
        "yvals": {"Sample1": [1.3276140000000001]},
        "err": {"Sample1": [.1]},
        "map": [["SampleID", "Day"], ["Sample1", "Day1"]],
    }
    self.sample_dict = {"Sample1": {10.00: [1.3276140000000001]}}
    self.xmax = 140
    self.ymax = 20
    self.std_type = "stddev"
    self.ops = ["Sample1"]
    self.mapping_category = "SampleID"
    self.imagetype = "png"
    self.resolution = 70
    self.mapping_lookup = {"SampleID-Sample1": "col_0_row_0"}
    self.color_prefs = {
        "SampleID": {
            "column": "SampleID",
            "color": {"Sample1": "#ff0000"},
        },
    }
    self.groups = {"Sample1": ["Sample1"]}
    self.background_color = "black"
    self.label_color = "white"
    self.labelname = "SampleID"
    self.rare_data = {
        "color": {"Sample1": "#ff0000"},
        "series": {"Sample1": [2.0515300000000001]},
        "headers": ["test.txt", "SampleID"],
        "xaxis": [10.0],
        "error": {"Sample1": [0.0]},
        "options": ["Sample1"],
    }

    # Output locations and cleanup bookkeeping.
    self.fpath = "/tmp/"
    self.output_dir = "/tmp/"
    self.metric_name = "test"
    self._paths_to_clean_up = []
    self._folders_to_cleanup = []

    self.rarefaction_file_data = [[10.0, 0.0, 1.0], [10.0, 1.0, 3.0]]

    # Colour fixtures.
    color_names = {"redtowhite3_0": "#7fff00", "redtowhite3_1": "#7fff00"}
    self.data_colors = color_dict_to_objects(color_names)
    self.colors = {"Sample1": "redtowhite3_0", "Sample2": "redtowhite3_1"}
    self.colors2 = {"Sample1": "redtowhite3_0"}

    # Raw mapping and rarefaction file lines.
    self.mappingfile = [
        "#SampleID\tSex\tAge",
        "123\tF\t32",
        "234\tM\t30",
        "345\tM\t32",
    ]
    self.rarefactionfile = [
        "\tsequences per sample\titeration\t123\t234\t345",
        "rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996",
        "rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055",
        "rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725",
        "rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474",
        "rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928",
        "rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642",
    ]
    self.rares = {
        "test.txt": (
            ["", "sequences per sample", "iteration", "Sample1"],
            [],
            ["rare1.txt", "rare2.txt"],
            [[10.0, 2.0, 7.0, 7.0, 9.0], [10.0, 2.0, 7.0, 7.0, 9.0]],
        ),
    }

    # Parsed forms of the rarefaction lines above.
    headers, comments, file_names, rows = \
        parse_rarefaction(self.rarefactionfile)
    self.col_headers = headers
    self.comments = comments
    self.rarefaction_fns = file_names
    self.rarefaction_data = rows

    matrix, seqs_per_samp, sample_ids = \
        get_rarefaction_data(self.rarefaction_data, self.col_headers)
    self.matrix = matrix
    self.seqs_per_samp = seqs_per_samp
    self.sampleIDs = sample_ids

    # Expected values.
    self.ave_seqs_per_sample1 = {
        "Sample1": [2.03172, 9.4417849999999994, 12.508435],
    }
    self.ave_seqs_per_sample = {
        "123": [2.03172, 9.4417849999999994, 12.508435],
        "234": [0.42876999999999998, 0.42876999999999998,
                0.42876999999999998],
        "345": [2.255255, 9.625995, 11.58785],
    }
    self.collapsed_ser_sex = {
        "M": [1.3420125000000001, 5.0273824999999999, 6.0083099999999998],
        "F": [2.03172, 9.4417849999999994, 12.508435],
    }
    self.err_ser_sex = {
        "M": [0.91324250000000007, 4.5986124999999998, 5.5795399999999997],
        "F": [0.0, 0.0, 0.0],
    }
    self.rarefaction_legend_mat_init = {"test": {"SampleID": {}}}
    self.col_headers2 = ["", "sequences per sample", "iteration",
                         "Sample1", "Sample2"]
    self.rarefaction_data_mat = {
        "SampleID": {
            "Sample1": {
                "test": {"ave": [" 7.000"], "err": [" nan"]},
            },
        },
    }
    self.rarefaction_legend_mat = {
        "test": {
            "samples": {
                "Sample1": {
                    "color": "#ff0000",
                    "link": "html_plots/testcol_0_row_0.png",
                },
            },
            "groups": {
                "SampleID": {
                    "Sample1": {
                        "groupcolor": "#ff0000",
                        "groupsamples": ["Sample1"],
                    },
                },
            },
        },
    }
    self.exp_err_series_ave = {
        "M": [1.571915, 6.49885, 8.1750183333333339],
    }
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth=None, test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.

    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use; forwarded to
      get_per_sample_average_diversities (defaults to None).
     test_type - str, the type of t-test to perform. Must be either
      'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
      test_type is 'nonparametric'.

    Returns:
     (ttest_results, ad_avgs) where
      ttest_results - dict keyed by treatment pair with (obs_t, p_val) as
       values. The value is (None, None) when both treatments have a single
       sample, when either group contains nan, or when the nonparametric
       test returns no p-value.
      ad_avgs - dict keyed by treatment value with (mean, std) of the
       per-sample average alpha diversities of that treatment's samples.

    Raises:
     ValueError - if test_type is 'nonparametric' and num_permutations < 1,
      or if an unknown test_type reaches the point where a test is run.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    # maps sample id -> average diversity (indexed by sample id below)
    ps_avg_div = get_per_sample_average_diversities(rarefaction_data, depth)

    ttest_results, ad_avgs = {}, {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison the
        # test cannot be run (e.g. mc_t_two_sample([1],[1]) errors).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
            # With a single sample per treatment the average is that sample's
            # value and the std is 0. Look the value up by sample id; storing
            # sid_pair[n][0] directly would record the sample id itself.
            ad_avgs[treatment_pair[0]] = (ps_avg_div[sid_pair[0][0]], 0.)
            ad_avgs[treatment_pair[1]] = (ps_avg_div[sid_pair[1][0]], 0.)
            continue

        i = array([ps_avg_div[x] for x in sid_pair[0]])
        j = array([ps_avg_div[x] for x in sid_pair[1]])
        # add alpha diversity averages and standard deviations.
        ad_avgs[treatment_pair[0]] = (i.mean(), i.std())
        ad_avgs[treatment_pair[1]] = (j.mean(), j.std())

        # the minimum of an array containing nan is nan, so this detects any
        # nan in either group
        if isnan(np_min(i)) or isnan(np_min(j)):
            ttest_results[treatment_pair] = (None, None)
            continue

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(i, j,
                                                 permutations=num_permutations)
            if p_val is not None:
                p_val = float(format_p_value_for_num_iters(
                    p_val, num_iters=num_permutations))
            else:
                # None would error in format_p_value_for_num_iters
                obs_t, p_val = None, None
        else:
            raise ValueError("Invalid test type '%s'." % test_type)
        ttest_results[treatment_pair] = (obs_t, p_val)

    return ttest_results, ad_avgs
def setUp(self):
    '''Set up the shared fixture values for the tests in this case.'''
    # Plot inputs for a single sample.
    self.data = {
        'xaxis': [10.0],
        'yvals': {'Sample1': [1.3276140000000001]},
        'err': {'Sample1': [0.1]},
        'map': [['SampleID', 'Day'], ['Sample1', 'Day1']],
    }
    self.sample_dict = {'Sample1': {10.00: [1.3276140000000001]}}
    self.xmax = 140
    self.ymax = 20
    self.std_type = 'stddev'
    self.ops = ['Sample1']
    self.mapping_category = 'SampleID'
    self.imagetype = 'png'
    self.resolution = 70
    self.mapping_lookup = {'SampleID-Sample1': 'col_0_row_0'}
    sample_colors = {'Sample1': '#ff0000'}
    self.color_prefs = {
        'SampleID': {'column': 'SampleID', 'color': sample_colors},
    }
    self.groups = {'Sample1': ['Sample1']}
    self.background_color = 'black'
    self.label_color = 'white'
    self.labelname = 'SampleID'
    self.rare_data = dict([
        ('color', {'Sample1': '#ff0000'}),
        ('series', {'Sample1': [2.0515300000000001]}),
        ('headers', ['test.txt', 'SampleID']),
        ('xaxis', [10.0]),
        ('error', {'Sample1': [0.0]}),
        ('options', ['Sample1']),
    ])

    # Output locations and cleanup bookkeeping.
    self.fpath = '/tmp/'
    self.output_dir = '/tmp/'
    self.metric_name = 'test'
    self._paths_to_clean_up = []
    self._folders_to_cleanup = []

    self.rarefaction_file_data = [[10.0, 0.0, 1.0], [10.0, 1.0, 3.0]]

    # Colour fixtures.
    color_names = {'redtowhite3_0': '#7fff00', 'redtowhite3_1': '#7fff00'}
    self.data_colors = color_dict_to_objects(color_names)
    self.colors = {'Sample1': 'redtowhite3_0', 'Sample2': 'redtowhite3_1'}
    self.colors2 = {'Sample1': 'redtowhite3_0'}

    # Raw mapping and rarefaction file lines.
    self.mappingfile = [
        '#SampleID\tSex\tAge',
        '123\tF\t32',
        '234\tM\t30',
        '345\tM\t32',
    ]
    self.rarefactionfile = [
        '\tsequences per sample\titeration\t123\t234\t345',
        'rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996',
        'rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055',
        'rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725',
        'rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474',
        'rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928',
        'rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642',
    ]
    repeated_row = [10.0, 2.0, 7.0, 7.0, 9.0]
    self.rares = {
        'test.txt': (
            ['', 'sequences per sample', 'iteration', 'Sample1'],
            [],
            ['rare1.txt', 'rare2.txt'],
            # Two equal but independent rows.
            [list(repeated_row), list(repeated_row)],
        ),
    }

    # Parsed forms of the rarefaction lines above.
    headers, comments, file_names, rows = \
        parse_rarefaction(self.rarefactionfile)
    self.col_headers = headers
    self.comments = comments
    self.rarefaction_fns = file_names
    self.rarefaction_data = rows

    matrix, seqs_per_samp, sample_ids = \
        get_rarefaction_data(self.rarefaction_data, self.col_headers)
    self.matrix = matrix
    self.seqs_per_samp = seqs_per_samp
    self.sampleIDs = sample_ids

    # Expected values.
    self.ave_seqs_per_sample1 = {
        'Sample1': [2.03172, 9.4417849999999994, 12.508435],
    }
    self.ave_seqs_per_sample = {
        '123': [2.03172, 9.4417849999999994, 12.508435],
        '234': [0.42876999999999998, 0.42876999999999998,
                0.42876999999999998],
        '345': [2.255255, 9.625995, 11.58785],
    }
    self.collapsed_ser_sex = {
        'M': [1.3420125000000001, 5.0273824999999999, 6.0083099999999998],
        'F': [2.03172, 9.4417849999999994, 12.508435],
    }
    self.err_ser_sex = {
        'M': [0.91324250000000007, 4.5986124999999998, 5.5795399999999997],
        'F': [0.0, 0.0, 0.0],
    }
    self.rarefaction_legend_mat_init = {'test': {'SampleID': {}}}
    self.col_headers2 = ['', 'sequences per sample', 'iteration',
                         'Sample1', 'Sample2']
    self.rarefaction_data_mat = {
        'SampleID': {
            'Sample1': {
                'test': {'ave': [' 7.000'], 'err': [' nan']},
            },
        },
    }
    sample_entry = {
        'color': '#ff0000',
        'link': 'html_plots/testcol_0_row_0.png',
    }
    group_entry = {
        'groupcolor': '#ff0000',
        'groupsamples': ['Sample1'],
    }
    self.rarefaction_legend_mat = {
        'test': {
            'samples': {'Sample1': sample_entry},
            'groups': {'SampleID': {'Sample1': group_entry}},
        },
    }
    self.exp_err_series_ave = {
        'M': [1.571915, 6.49885, 8.1750183333333339],
    }
def setUp(self):
    """Build the four-sample rarefaction/mapping fixture and expected values."""
    # Collated alpha diversity lines for samples 123, 234, 345 and 456.
    rarefaction_lines = [
        "\tsequences per sample\titeration\t123\t234\t345\t456",
        "rare10.txt\t10\t0\t1.99181\t5.42877\t2.13996\t0.002322",
        "rare10.txt\t10\t1\t2.07163\t1.42877\t2.37055\t0.01219",
        "rare310.txt\t310\t0\t8.83115\t6.42877\t11.00725\t0.18233",
        "rare310.txt\t310\t1\t10.05242\t9.42877\t8.24474\t0.99229",
        "rare810.txt\t810\t0\t12.03067\tn/a\t11.58928\t0.8993",
        "rare910.txt\t910\t1\t12.9862\t2.42877\t11.58642\t1.22563",
    ]
    self.rarefaction_file = rarefaction_lines
    self.rarefaction_data = parse_rarefaction(rarefaction_lines)

    # Mapping file lines: header, a comment line, then one row per sample.
    mapping_lines = [
        "#SampleID\tTreatment\tLinkerPrimerSequence\tDose\tTTD\tDescription",
        "#Comment Line",
        "123\tAAAA\tBBBB\tHigh\t31\tM_ID_123",
        "234\tCCCC\tDDDD\tLow\t67\tM_ID_234",
        "345\tAAAA\tFFFF\tMed\t21\tM_ID_345",
        "456\tAAAA\tGGGG\tLow\t67\tM_ID_456",
    ]
    self.mapping_file = mapping_lines
    self.mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]

    # Expected pairs of category values.
    self.value_pairs_Dose = [("Low", "Med"), ("Low", "High"),
                             ("Med", "High")]
    self.value_pairs_TTD = [("67", "21"), ("67", "31"), ("21", "31")]
    self.value_pairs_Treatment = [("CCCC", "AAAA")]

    # Expected category value -> sample ids.
    self.cat_val_Dose = {
        "High": ["123"],
        "Low": ["234", "456"],
        "Med": ["345"],
    }
    self.cat_val_TTD = {
        "21": ["345"],
        "31": ["123"],
        "67": ["234", "456"],
    }
    self.cat_val_Treatment = {
        "AAAA": ["345", "123", "456"],
        "CCCC": ["234"],
    }

    # Expected sample id pairs, aligned with the value pairs above.
    self.Id_pairs_Dose = [
        (["234", "456"], ["345"]),
        (["234", "456"], ["123"]),
        (["345"], ["123"]),
    ]
    self.Id_pairs_TTD = [
        (["234", "456"], ["345"]),
        (["234", "456"], ["123"]),
        (["345"], ["123"]),
    ]
    self.Id_pairs_Treatment = [(["234"], ["345", "123", "456"])]

    # Expected sample id -> column index.
    self.rarefaction_cols_dict = {"123": 0, "234": 1, "345": 2, "456": 3}

    # Expected score matrices at depths 10, 310 and 910.
    self.extracted_mtx_10 = array([
        [1.99181, 5.42877, 2.13996, 0.002322],
        [2.07163, 1.42877, 2.37055, 0.01219],
    ])
    self.extracted_mtx_310 = array([
        [8.83115, 6.42877, 11.00725, 0.18233],
        [10.05242, 9.42877, 8.24474, 0.99229],
    ])
    self.extracted_mtx_910 = array([[12.9862, 2.42877, 11.58642, 1.22563]])

    # One sample pair and the score columns expected for each side of it.
    self.sample_pair1 = (["234"], ["345", "123"])
    self.rarefaction_mtx_for_sample_pair1_0 = array([[5.42877], [1.42877]])
    self.rarefaction_mtx_for_sample_pair1_1 = array([
        [2.13996, 1.99181],
        [2.37055, 2.07163],
    ])

    # Expected comparison results for the TTD category.
    self.compared_alpha_diversities_TTD = {
        "TTD": {
            ("67", "21"): (-0.27929839680103463, 0.79386220041241184),
            ("21", "31"): (1.8321466933860993, 0.20839398129924847),
            ("67", "31"): (-0.16318504125427058, 0.87828549279958279),
        },
    }
def mean_alpha(alpha_dict, depth):
    """Average collated alpha diversity data at a given depth.

    Input:
    alpha_dict: dictionary where the values are the lines of collated alpha
    diversity data files and the keys are the names of each of these files
    with no extension; this name is usually the metric used to compute the
    alpha diversity.
    depth: int, the depth at which the alpha diversity values are averaged.
    If None is passed, the highest depth available in each file is used.

    Output:
    metrics: list of metric names, formatted as '<key>_even_<depth>'.
    sample_ids: list of sample identifiers represented.
    data: a list of lists with the mean of the alpha diversity data at the
    given depth; each column is a different metric.

    Raises:
    AssertionError: if alpha_dict is not a dict, or depth is neither None nor
    a non-negative int.
    ValueError: if a file has no rows at the requested depth, or the files do
    not share the same sample ids in the same order.
    """
    assert isinstance(alpha_dict, dict), "Input data must be a dictionary"
    assert depth is None or (isinstance(depth, int) and depth >= 0), (
        "The specified depth must be a positive integer.")

    metrics = []
    sample_ids = []
    data = []

    # items() works for dicts on both Python 2 and Python 3.
    for key, value in alpha_dict.items():
        identifiers, _, _, rarefaction_data = parse_rarefaction(value)

        # if depth is specified as None use the highest available, retrieve it
        # on a per file basis so you make sure the value exists for all files
        if depth is None:
            _depth = int(max([row[0] for row in rarefaction_data]))
        else:
            _depth = depth
        metrics.append('{0}_even_{1}'.format(key, _depth))

        # find all the data at the desired depth, removing the first two
        # elements ([depth, iteration]) as those are not needed
        rows_at_depth = [row[2:] for row in rarefaction_data
                         if row[0] == _depth]

        # check there are elements with the desired rarefaction depth
        if not rows_at_depth:
            # report the available rarefaction depths in sorted order
            available_rarefaction_depths = [
                str(d) for d in sorted(set(row[0] for row in rarefaction_data))]
            # The call form of raise is valid on both Python 2 and Python 3;
            # the "raise E, args" statement form is Python 2 only.
            raise ValueError(
                "The depth %d does not exist in the collated "
                "alpha diversity file for the metric: %s. The available depths "
                "are: %s." % (_depth, key,
                              ', '.join(available_rarefaction_depths)))

        # check all the files have the same sample ids in the same order; the
        # first three identifiers are header columns, not sample ids
        if sample_ids:
            if sample_ids != identifiers[3:]:
                raise ValueError(
                    "Non-matching sample ids were found in the "
                    "collated alpha diversity files. Make sure all the files "
                    "contain data for the same samples.")
        else:
            sample_ids = identifiers[3:]

        data.append(array(rows_at_depth).mean(axis=0))

    # transpose the data to match the formatting of non-collated alpha div data
    data = array(data).T.tolist()

    return metrics, sample_ids, data
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth=None, test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.

    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use
      the deepest available in the file.
     test_type - str, the type of t-test to perform. Must be either
      'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
      test_type is 'nonparametric'.

    Returns:
     dict keyed by the string '<treatment_a>,<treatment_b>' with
     (obs_t, p_val) as values. The value is (None, None) when both treatments
     have a single sample, when either group contains nan, or when the
     nonparametric test returns no p-value.

    Raises:
     ValueError - if test_type is 'nonparametric' and num_permutations < 1,
      or if an unknown test_type reaches the point where a test is run.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    # if depth is not given default to the deepest rarefaction available
    # rarefaction file is not guaranteed to be in order of rarefaction depth
    if depth is None:
        depth = array(rarefaction_data[3])[:, 0].max()

    rare_mat = array([row for row in rarefaction_data[3] if row[0] == depth])

    # Average each col of the rarefaction mtx. Computing t test on averages
    # over all iterations. Avoids more comps which kills significance.
    rare_mat = (rare_mat.sum(0) / rare_mat.shape[0])[2:]  # remove depth,iter cols
    sids = rarefaction_data[0][3:]  # 0-2 are header strings

    results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        t_key = '%s,%s' % (treatment_pair[0], treatment_pair[1])

        # if there is only 1 sample for each treatment in a comparison the
        # test cannot be run (e.g. mc_t_two_sample([1],[1]) errors).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            results[t_key] = (None, None)
            continue

        pair0_indices = [sids.index(i) for i in sid_pair[0]]
        pair1_indices = [sids.index(i) for i in sid_pair[1]]
        i = rare_mat.take(pair0_indices)
        j = rare_mat.take(pair1_indices)

        # the minimum of an array containing nan is nan, so this detects any
        # nan in either group; see
        # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
        if isnan(np_min(i)) or isnan(np_min(j)):
            results[t_key] = (None, None)
            continue

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(i, j,
                                                 permutations=num_permutations)
            if p_val is not None:
                p_val = float(format_p_value_for_num_iters(
                    p_val, num_iters=num_permutations))
            else:
                # None would error in format_p_value_for_num_iters
                obs_t, p_val = None, None
        else:
            raise ValueError("Invalid test type '%s'." % test_type)
        results[t_key] = (obs_t, p_val)

    return results
def setUp(self):
    """Build the four-sample rarefaction/mapping fixture and expected values."""
    # Collated alpha diversity lines for samples 123, 234, 345 and 456.
    rarefaction_lines = [
        "\tsequences per sample\titeration\t123\t234\t345\t456",
        "rare10.txt\t10\t0\t1.99181\t5.42877\t2.13996\t0.002322",
        "rare10.txt\t10\t1\t2.07163\t1.42877\t2.37055\t0.01219",
        "rare310.txt\t310\t0\t8.83115\t6.42877\t11.00725\t0.18233",
        "rare310.txt\t310\t1\t10.05242\t9.42877\t8.24474\t0.99229",
        "rare810.txt\t810\t0\t12.03067\tn/a\t11.58928\t0.8993",
        "rare910.txt\t910\t1\t12.9862\t2.42877\t11.58642\t1.22563",
    ]
    self.rarefaction_file = rarefaction_lines
    self.rarefaction_data = parse_rarefaction(rarefaction_lines)

    # Mapping file lines: header, a comment line, then one row per sample.
    mapping_lines = [
        "#SampleID\tTreatment\tLinkerPrimerSequence\tDose\tTTD\tDescription",
        "#Comment Line",
        "123\tAAAA\tBBBB\tHigh\t31\tM_ID_123",
        "234\tCCCC\tDDDD\tLow\t67\tM_ID_234",
        "345\tAAAA\tFFFF\tMed\t21\tM_ID_345",
        "456\tAAAA\tGGGG\tLow\t67\tM_ID_456",
    ]
    self.mapping_file = mapping_lines
    self.mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]

    # Expected pairs of category values.
    self.value_pairs_Dose = [("Low", "Med"), ("Low", "High"),
                             ("Med", "High")]
    self.value_pairs_TTD = [("67", "21"), ("67", "31"), ("21", "31")]
    self.value_pairs_Treatment = [("CCCC", "AAAA")]

    # Expected category value -> sample ids.
    self.cat_val_Dose = {
        "High": ["123"],
        "Low": ["234", "456"],
        "Med": ["345"],
    }
    self.cat_val_TTD = {
        "21": ["345"],
        "31": ["123"],
        "67": ["234", "456"],
    }
    self.cat_val_Treatment = {
        "AAAA": ["345", "123", "456"],
        "CCCC": ["234"],
    }

    # Expected sample id pairs, aligned with the value pairs above.
    self.Id_pairs_Dose = [
        (["234", "456"], ["345"]),
        (["234", "456"], ["123"]),
        (["345"], ["123"]),
    ]
    self.Id_pairs_TTD = [
        (["234", "456"], ["345"]),
        (["234", "456"], ["123"]),
        (["345"], ["123"]),
    ]
    self.Id_pairs_Treatment = [(["234"], ["345", "123", "456"])]

    # Expected sample id -> column index.
    self.rarefaction_cols_dict = {"123": 0, "234": 1, "345": 2, "456": 3}

    # Expected score matrices at depths 10, 310 and 910.
    self.extracted_mtx_10 = array([
        [1.99181, 5.42877, 2.13996, 0.002322],
        [2.07163, 1.42877, 2.37055, 0.01219],
    ])
    self.extracted_mtx_310 = array([
        [8.83115, 6.42877, 11.00725, 0.18233],
        [10.05242, 9.42877, 8.24474, 0.99229],
    ])
    self.extracted_mtx_910 = array([[12.9862, 2.42877, 11.58642, 1.22563]])

    # One sample pair and the score columns expected for each side of it.
    self.sample_pair1 = (["234"], ["345", "123"])
    self.rarefaction_mtx_for_sample_pair1_0 = array([[5.42877], [1.42877]])
    self.rarefaction_mtx_for_sample_pair1_1 = array([
        [2.13996, 1.99181],
        [2.37055, 2.07163],
    ])

    # Expected comparison results for the TTD category.
    self.compared_alpha_diversities_TTD = {
        "TTD": {
            ("67", "21"): (-0.27929839680103463, 0.79386220041241184),
            ("21", "31"): (1.8321466933860993, 0.20839398129924847),
            ("67", "31"): (-0.16318504125427058, 0.87828549279958279),
        },
    }
def setUp(self):
    """Build the three-sample rarefaction/mapping fixture and expected values."""
    # Collated alpha diversity lines for samples 123, 234 and 345.
    rarefaction_lines = [
        "\tsequences per sample\titeration\t123\t234\t345",
        "rare10.txt\t10\t0\t1.99181\t5.42877\t2.13996",
        "rare10.txt\t10\t1\t2.07163\t1.42877\t2.37055",
        "rare310.txt\t310\t0\t8.83115\t6.42877\t11.00725",
        "rare310.txt\t310\t1\t10.05242\t9.42877\t8.24474",
        "rare810.txt\t810\t0\t12.03067\tn/a\t11.58928",
        "rare910.txt\t910\t1\t12.9862\t2.42877\t11.58642",
    ]
    self.rarefaction_file = rarefaction_lines
    self.rarefaction_data = parse_rarefaction(rarefaction_lines)

    # Mapping file lines: header, a comment line, then one row per sample.
    mapping_lines = [
        "#SampleID\tTreatment\tLinkerPrimerSequence\tDose\tTTD\tDescription",
        "#Comment Line",
        "123\tAAAA\tBBBB\tHigh\t31\tM_ID_123",
        "234\tCCCC\tDDDD\tLow\t67\tM_ID_234",
        "345\tAAAA\tFFFF\tMed\t21\tM_ID_345",
    ]
    self.mapping_file = mapping_lines
    self.mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]

    # Expected pairs of category values.
    self.value_pairs_Dose = [("Low", "Med"), ("Low", "High"),
                             ("Med", "High")]
    self.value_pairs_TTD = [("67", "21"), ("67", "31"), ("21", "31")]
    self.value_pairs_Treatment = [("CCCC", "AAAA")]

    # Expected category value -> sample ids.
    self.cat_val_Dose = {"High": ["123"], "Low": ["234"], "Med": ["345"]}
    self.cat_val_TTD = {"21": ["345"], "31": ["123"], "67": ["234"]}
    self.cat_val_Treatment = {"AAAA": ["345", "123"], "CCCC": ["234"]}

    # Expected sample id pairs, aligned with the value pairs above.
    self.Id_pairs_Dose = [
        (["234"], ["345"]),
        (["234"], ["123"]),
        (["345"], ["123"]),
    ]
    self.Id_pairs_TTD = [
        (["234"], ["345"]),
        (["234"], ["123"]),
        (["345"], ["123"]),
    ]
    self.Id_pairs_Treatment = [(["234"], ["345", "123"])]

    # Expected sample id -> column index.
    self.rarefaction_cols_dict = {"123": 0, "234": 1, "345": 2}

    # Expected score matrices at depths 10, 310 and 910.
    self.extracted_mtx_10 = array([
        [1.99181, 5.42877, 2.13996],
        [2.07163, 1.42877, 2.37055],
    ])
    self.extracted_mtx_310 = array([
        [8.83115, 6.42877, 11.00725],
        [10.05242, 9.42877, 8.24474],
    ])
    self.extracted_mtx_910 = array([[12.9862, 2.42877, 11.58642]])

    # One sample pair and the score columns expected for each side of it.
    self.sample_pair1 = (["234"], ["345", "123"])
    self.rarefaction_mtx_for_sample_pair1_0 = array([[5.42877], [1.42877]])
    self.rarefaction_mtx_for_sample_pair1_1 = array([
        [2.13996, 1.99181],
        [2.37055, 2.07163],
    ])

    # Expected comparison results for the TTD category.
    self.compared_alpha_diversities_TTD = {
        "TTD": {
            ("21", "31"): (1.8321466933860993, 0.20839398129924847),
            ("67", "21"): (0.58578495700890432, 0.61731739324369639),
            ("67", "31"): (0.69838596448703294, 0.55721515283248324),
        },
    }
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth=None, test_type='nonparametric',
                              num_permutations=999):
    """Compares alpha diversity values for differences per category treatment.

    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use. if None, then will use
      the deepest available in the file.
     test_type - str, the type of t-test to perform. Must be either
      'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
      test_type is 'nonparametric'.

    Returns:
     (ttest_results, alphadiv_avgs) where
      ttest_results - dict keyed by treatment pair with (obs_t, p_val) as
       values. The value is (None, None) when both treatments have a single
       sample, when either group contains nan, or when the nonparametric
       test returns no p-value.
      alphadiv_avgs - dict keyed by treatment value with (mean, std) of the
       iteration-averaged alpha diversities of that treatment's samples.

    Raises:
     ValueError - if test_type is 'nonparametric' and num_permutations < 1,
      or if an unknown test_type reaches the point where a test is run.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    # if depth is not given default to the deepest rarefaction available
    # rarefaction file is not guaranteed to be in order of rarefaction depth
    if depth is None:
        depth = array(rarefaction_data[3])[:, 0].max()

    rare_mat = array([row for row in rarefaction_data[3] if row[0] == depth])

    # Average each col of the rarefaction mtx. Computing t test on averages
    # over all iterations. Avoids more comps which kills significance.
    rare_mat = (rare_mat.sum(0) / rare_mat.shape[0])[2:]  # remove depth,iter cols
    sids = rarefaction_data[0][3:]  # 0-2 are header strings

    ttest_results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # if there is only 1 sample for each treatment in a comparison the
        # test cannot be run (e.g. mc_t_two_sample([1],[1]) errors).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            ttest_results[treatment_pair] = (None, None)
            continue

        pair0_indices = [sids.index(i) for i in sid_pair[0]]
        pair1_indices = [sids.index(i) for i in sid_pair[1]]
        i = rare_mat.take(pair0_indices)
        j = rare_mat.take(pair1_indices)

        # the minimum of an array containing nan is nan, so this detects any
        # nan in either group; see
        # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
        if isnan(np_min(i)) or isnan(np_min(j)):
            ttest_results[treatment_pair] = (None, None)
            continue

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(i, j,
                                                 permutations=num_permutations)
            if p_val is not None:
                p_val = float(format_p_value_for_num_iters(
                    p_val, num_iters=num_permutations))
            else:
                # None would error in format_p_value_for_num_iters
                obs_t, p_val = None, None
        else:
            raise ValueError("Invalid test type '%s'." % test_type)
        ttest_results[treatment_pair] = (obs_t, p_val)

    # create dict of average alpha diversity values
    alphadiv_avgs = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        # Looking only at the first treatment of each pair does not guarantee
        # that every treatment is covered, so both members are visited.
        for sid_list, treatment_str in zip(sid_pair, treatment_pair):
            # skip treatments that were already computed and added
            if treatment_str not in alphadiv_avgs:
                alphadiv_vals = \
                    rare_mat.take([sids.index(i) for i in sid_list])
                alphadiv_avgs[treatment_str] = (alphadiv_vals.mean(),
                                                alphadiv_vals.std())

    return ttest_results, alphadiv_avgs
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth, test_type='nonparametric',
                              num_permutations=999):
    """Compare alpha diversity between the treatments of a category.

    The rarefaction rows found at `depth` are averaged over all iterations,
    giving one value per sample. A two-sample t-test is then run between the
    samples of every pair of treatments produced by sampleId_pairs.

    Inputs:
     rarefaction_lines - list of lines, result of multiple rarefactions.
     mapping_lines - list of lines, mapping file lines.
     category - str, the category to be compared, eg 'Treatment' or 'Age'.
     depth - int, depth of the rarefaction file to use.
     test_type - str, the type of t-test to perform. Must be either
      'parametric' or 'nonparametric'.
     num_permutations - int, the number of Monte Carlo permutations to use if
      test_type is 'nonparametric'.

    Returns:
     dict keyed by the string 'treatmentA,treatmentB', with (obs_t, p_val)
     values. The value is (None, None) when both treatments have a single
     sample, when either group contains nan, or when the nonparametric test
     gives no p-value.

    Raises:
     ValueError - if num_permutations < 1 for a nonparametric test, if
      `depth` is not present in the rarefaction data, or if test_type is not
      recognised when a test has to be run.
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    # samid_pairs, treatment_pairs are in the same order
    samid_pairs, treatment_pairs = sampleId_pairs(mapping_data,
                                                  rarefaction_data, category)

    # extract only rows of the rarefaction data that are at the given depth
    rare_mat = array([row for row in rarefaction_data[3] if row[0] == depth])
    if rare_mat.shape[0] == 0:
        # Without this guard the averaging below fails with an opaque
        # IndexError on a scalar.
        raise ValueError("The depth %s does not exist in the rarefaction "
                         "data." % depth)

    # Average each col of the rarefaction mtx. Computing t test on averages
    # over all iterations. Avoids more comps which kills significance.
    # The [2:] slice removes the depth and iteration columns.
    rare_mat = (rare_mat.sum(0) / rare_mat.shape[0])[2:]
    sids = rarefaction_data[0][3:]  # 0-2 are header strings

    results = {}
    for sid_pair, treatment_pair in zip(samid_pairs, treatment_pairs):
        t_key = '%s,%s' % (treatment_pair[0], treatment_pair[1])

        # if there is only 1 sample for each treatment in a comparison, the
        # mc method will error (e.g. mc_t_two_sample([1],[1])).
        if len(sid_pair[0]) == 1 and len(sid_pair[1]) == 1:
            results[t_key] = (None, None)
            continue

        i = rare_mat.take([sids.index(sid) for sid in sid_pair[0]])
        j = rare_mat.take([sids.index(sid) for sid in sid_pair[1]])
        # min() of an array containing nan is nan, which gives a quick check
        # for nan anywhere in the group:
        # http://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy
        if isnan(np_min(i)) or isnan(np_min(j)):
            results[t_key] = (None, None)
            continue

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(
                i, j, permutations=num_permutations)
            if p_val is not None:
                p_val = float(format_p_value_for_num_iters(
                    p_val, num_iters=num_permutations))
            else:
                # None will error in format_p_value_for_num_iters
                obs_t, p_val = None, None
        else:
            raise ValueError("Invalid test type '%s'." % test_type)
        results[t_key] = (obs_t, p_val)

    return results
def test_parse_rarefaction(self):
    """parse_rarefaction yields headers, comments, file names and data."""
    # Collated alpha diversity input: one header line, then two iterations
    # at each of three depths.
    self.rarefactionfile = [
        '\tsequences per sample\titeration\t123\t234\t345',
        'rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996',
        'rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055',
        'rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725',
        'rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474',
        'rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928',
        'rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642',
    ]

    # Expected pieces of the parsed result.
    self.col_headers = ['', 'sequences per sample', 'iteration',
                        '123', '234', '345']
    self.comments = []
    self.rarefaction_fns = ['rare10.txt', 'rare10.txt',
                            'rare310.txt', 'rare310.txt',
                            'rare610.txt', 'rare610.txt']
    self.rarefaction_data = [
        [10.0, 0.0, 1.9918100000000001, 0.42876999999999998,
         2.1399599999999999],
        [10.0, 1.0, 2.0716299999999999, 0.42876999999999998,
         2.3705500000000002],
        [310.0, 0.0, 8.8311499999999992, 0.42876999999999998,
         11.007250000000001],
        [310.0, 1.0, 10.05242, 0.42876999999999998, 8.2447400000000002],
        [610.0, 0.0, 12.030670000000001, 0.42876999999999998, 11.58928],
        [610.0, 1.0, 12.9862, 0.42876999999999998, 11.58642],
    ]

    parsed = parse_rarefaction(self.rarefactionfile)
    (test_col_headers, test_comments,
     test_rarefaction_fns, test_rarefaction_data) = parsed

    checks = (
        (test_col_headers, self.col_headers),
        (test_comments, self.comments),
        (test_rarefaction_fns, self.rarefaction_fns),
        (test_rarefaction_data, self.rarefaction_data),
    )
    for observed, expected in checks:
        self.assertEqual(observed, expected)
def main():
    """Build rarefaction plots from the files named on the command line.

    opts.input_dir is either a directory, in which case every entry not
    starting with '.' is parsed, or a comma-separated list of file paths.
    The parsed rarefaction data is handed to make_averages. If that returns
    html text, it is written to rarefaction_plots.html in the output
    directory.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_dir = opts.input_dir
    imagetype = opts.imagetype
    resolution = opts.resolution
    output_dir = opts.output_dir
    ymax = opts.ymax
    std_type = opts.std_type
    suppress_webpage = opts.suppress_html_output
    output_type = opts.output_type
    generate_per_sample_plots = opts.generate_per_sample_plots
    generate_average_tables = opts.generate_average_tables

    # Get the command-line options.
    prefs, data, background_color, label_color, ball_scale, arrow_colors = \
        sample_color_prefs_and_map_data_from_options(opts)

    # NOTE(review): option_parser.error presumably terminates the process
    # (as optparse does); the exit(0) calls after it are kept as a fallback.
    rares = {}
    if isdir(input_dir):
        rarenames = [r for r in listdir(input_dir) if not r.startswith('.')]
        for r in rarenames:
            try:
                # `with` closes the handle even if reading fails
                with open(path.join(input_dir, r), 'U') as rare_f:
                    rarefl = rare_f.readlines()
                rares[r] = parse_rarefaction(rarefl)
            except (IOError):
                option_parser.error('Problem with rarefaction file. %s' %
                                    exc_info()[1])
                exit(0)
    else:
        try:
            for input_fp in input_dir.split(','):
                # key the parsed data by file name, without its directory
                input_path = split(input_fp)[-1]
                with open(input_fp, 'U') as rare_f:
                    rarefl = rare_f.readlines()
                rares[input_path] = parse_rarefaction(rarefl)
        except (IOError):
            option_parser.error('Problem with rarefaction file. %s' %
                                exc_info()[1])
            exit(0)

    if imagetype not in ['png', 'svg', 'pdf']:
        option_parser.error('Supplied extension not supported.')
        exit(0)

    try:
        resolution = int(resolution)
    except (ValueError):
        option_parser.error('Invalid resolution.')
        exit(0)

    # output directory check
    if isinstance(output_dir, str) and output_dir != '.':
        if not exists(output_dir):
            try:
                create_dir(output_dir, False)
            except (ValueError):
                option_parser.error('Could not create output directory.')
                exit(0)
    else:
        # The first positional argument of tempfile.mkdtemp is `suffix`, so
        # the previous mkdtemp('./') did not create the directory under './'.
        # NOTE(review): assumes mkdtemp here is tempfile.mkdtemp - confirm.
        output_dir = mkdtemp(dir='./')

    # Generate the plots and html text
    html_output = make_averages(
        prefs, data, background_color, label_color, rares, output_dir,
        resolution, imagetype, ymax, suppress_webpage, std_type, output_type,
        generate_per_sample_plots=generate_per_sample_plots,
        generate_average_tables=generate_average_tables)

    if html_output:
        # Write the html file.
        html_fp = path.join(output_dir, 'rarefaction_plots.html')
        with open(html_fp, 'w') as outfile:
            outfile.write(html_output)
def test_parse_rarefaction(self):
    """Check every element returned by parse_rarefaction for a small file."""
    header_line = '\tsequences per sample\titeration\t123\t234\t345'
    data_lines = ['rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996',
                  'rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055',
                  'rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725',
                  'rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474',
                  'rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928',
                  'rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642']
    self.rarefactionfile = [header_line] + data_lines

    self.col_headers = ['', 'sequences per sample', 'iteration', '123',
                        '234', '345']
    self.comments = []
    self.rarefaction_fns = ['rare10.txt', 'rare10.txt', 'rare310.txt',
                            'rare310.txt', 'rare610.txt', 'rare610.txt']
    # Sample '234' has the same value in every row of the input.
    flat = 0.42876999999999998
    self.rarefaction_data = [
        [10.0, 0.0, 1.9918100000000001, flat, 2.1399599999999999],
        [10.0, 1.0, 2.0716299999999999, flat, 2.3705500000000002],
        [310.0, 0.0, 8.8311499999999992, flat, 11.007250000000001],
        [310.0, 1.0, 10.05242, flat, 8.2447400000000002],
        [610.0, 0.0, 12.030670000000001, flat, 11.58928],
        [610.0, 1.0, 12.9862, flat, 11.58642],
    ]

    obs_headers, obs_comments, obs_fns, obs_data = \
        parse_rarefaction(self.rarefactionfile)

    self.assertEqual(obs_headers, self.col_headers)
    self.assertEqual(obs_comments, self.comments)
    self.assertEqual(obs_fns, self.rarefaction_fns)
    self.assertEqual(obs_data, self.rarefaction_data)
def main():
    """Build rarefaction plots from the files named on the command line.

    options.input_dir is either a directory, in which case every entry not
    starting with '.' is parsed, or a comma-separated list of file paths.
    The parsed rarefaction data is handed to make_averages. If that returns
    html text, it is written to rarefaction_plots.html in the output
    directory.
    """
    option_parser, options, args = \
        parse_command_line_parameters(**script_info)

    input_dir = options.input_dir

    # NOTE(review): option_parser.error presumably terminates the process
    # (as optparse does); the exit(0) calls after it are kept as a fallback.
    rares = {}
    if isdir(input_dir):
        rarenames = [r for r in listdir(input_dir) if not r.startswith(".")]
        for r in rarenames:
            try:
                # `with` closes the handle even if reading fails
                with open(path.join(input_dir, r), "U") as rare_f:
                    rarefl = rare_f.readlines()
                rares[r] = parse_rarefaction(rarefl)
            except (IOError):
                option_parser.error("Problem with rarefaction file. %s" %
                                    exc_info()[1])
                exit(0)
    else:
        try:
            for input_fp in input_dir.split(","):
                # key the parsed data by file name, without its directory
                input_path = split(input_fp)[-1]
                with open(input_fp, "U") as rare_f:
                    rarefl = rare_f.readlines()
                rares[input_path] = parse_rarefaction(rarefl)
        except (IOError):
            option_parser.error("Problem with rarefaction file. %s" %
                                exc_info()[1])
            exit(0)

    if options.imagetype not in ["png", "svg", "pdf"]:
        option_parser.error("Supplied extension not supported.")
        exit(0)
    imagetype = options.imagetype

    try:
        resolution = int(options.resolution)
    except (ValueError):
        option_parser.error("Invalid resolution.")
        exit(0)

    # Get the command-line options.
    prefs, data, background_color, label_color, ball_scale, arrow_colors = \
        sample_color_prefs_and_map_data_from_options(options)

    # output directory check
    if isinstance(options.output_dir, str) and options.output_dir != ".":
        output_dir = options.output_dir
        if not exists(output_dir):
            try:
                create_dir(output_dir, False)
            except (ValueError):
                option_parser.error("Could not create output directory.")
                exit(0)
    else:
        output_dir = get_random_directory_name()

    # Generate the plots and html text
    ymax = options.ymax
    suppress_webpage = options.suppress_html_output
    html_output = make_averages(
        prefs, data, background_color, label_color, rares, output_dir,
        resolution, imagetype, ymax, suppress_webpage)

    if html_output:
        # Write the html file.
        html_fp = path.join(output_dir, "rarefaction_plots.html")
        with open(html_fp, "w") as outfile:
            outfile.write(html_output)
def setUp(self):
    """Create the fixtures shared by the tests of this class."""
    # Plot data for a single sample.
    self.data = {
        'xaxis': [10.0],
        'yvals': {'Sample1': [1.3276140000000001]},
        'err': {'Sample1': [.1]},
        'map': [['SampleID', 'Day'], ['Sample1', 'Day1']],
    }
    self.sample_dict = {'Sample1': {10.00: [1.3276140000000001]}}

    # Plotting parameters.
    self.xmax = 140
    self.ymax = 20
    self.std_type = 'stddev'
    self.ops = ['Sample1']
    self.mapping_category = 'SampleID'
    self.imagetype = 'png'
    self.resolution = 70
    self.mapping_lookup = {'SampleID-Sample1': 'col_0_row_0'}
    self.color_prefs = {
        'SampleID': {'column': 'SampleID',
                     'color': {'Sample1': '#ff0000'}},
    }
    self.groups = {'Sample1': ['Sample1']}
    self.background_color = 'black'
    self.label_color = 'white'
    self.labelname = 'SampleID'
    self.rare_data = {
        'color': {'Sample1': '#ff0000'},
        'series': {'Sample1': [2.0515300000000001]},
        'headers': ['test.txt', 'SampleID'],
        'xaxis': [10.0],
        'error': {'Sample1': [0.0]},
        'options': ['Sample1'],
    }

    # Output locations and clean-up bookkeeping.
    self.fpath = '/tmp/'
    self.output_dir = '/tmp/'
    self.metric_name = 'test'
    self._paths_to_clean_up = []
    self._folders_to_cleanup = []

    self.rarefaction_file_data = [[10.0, 0.0, 1.0], [10.0, 1.0, 3.0]]

    # Colour fixtures.
    color_hexes = {'redtowhite3_0': '#7fff00', 'redtowhite3_1': '#7fff00'}
    self.data_colors = color_dict_to_objects(color_hexes)
    self.colors = {'Sample1': 'redtowhite3_0', 'Sample2': 'redtowhite3_1'}
    self.colors2 = {'Sample1': 'redtowhite3_0'}

    # Raw mapping file and collated rarefaction file lines.
    self.mappingfile = ['#SampleID\tSex\tAge',
                        '123\tF\t32',
                        '234\tM\t30',
                        '345\tM\t32']
    self.rarefactionfile = [
        '\tsequences per sample\titeration\t123\t234\t345',
        'rare10.txt\t10\t0\t1.99181\t0.42877\t2.13996',
        'rare10.txt\t10\t1\t2.07163\t0.42877\t2.37055',
        'rare310.txt\t310\t0\t8.83115\t0.42877\t11.00725',
        'rare310.txt\t310\t1\t10.05242\t0.42877\t8.24474',
        'rare610.txt\t610\t0\t12.03067\t0.42877\t11.58928',
        'rare610.txt\t610\t1\t12.9862\t0.42877\t11.58642',
    ]

    # Same four-element layout that parse_rarefaction is unpacked into below.
    self.rares = {
        'test.txt': (
            ['', 'sequences per sample', 'iteration', 'Sample1'],
            [],
            ['rare1.txt', 'rare2.txt'],
            [[10.0, 2.0, 7.0, 7.0, 9.0], [10.0, 2.0, 7.0, 7.0, 9.0]],
        ),
    }

    # Parsed forms of the rarefaction file above.
    parsed = parse_rarefaction(self.rarefactionfile)
    (self.col_headers, self.comments,
     self.rarefaction_fns, self.rarefaction_data) = parsed
    extracted = get_rarefaction_data(self.rarefaction_data, self.col_headers)
    self.matrix, self.seqs_per_samp, self.sampleIDs = extracted

    # Expected values used by the individual tests.
    self.ave_seqs_per_sample1 = {
        'Sample1': [2.03172, 9.4417849999999994, 12.508435],
    }
    self.ave_seqs_per_sample = {
        '123': [2.03172, 9.4417849999999994, 12.508435],
        '234': [0.42876999999999998, 0.42876999999999998,
                0.42876999999999998],
        '345': [2.255255, 9.625995, 11.58785],
    }
    self.collapsed_ser_sex = {
        'M': [1.3420125000000001, 5.0273824999999999, 6.0083099999999998],
        'F': [2.03172, 9.4417849999999994, 12.508435],
    }
    self.err_ser_sex = {
        'M': [0.91324250000000007, 4.5986124999999998, 5.5795399999999997],
        'F': [0.0, 0.0, 0.0],
    }
    self.rarefaction_legend_mat_init = {'test': {'SampleID': {}}}
    self.col_headers2 = ['', 'sequences per sample', 'iteration',
                         'Sample1', 'Sample2']
    # NOTE(review): the leading whitespace of the 'ave'/'err' strings is
    # reproduced exactly as it appears in this file - confirm it matches the
    # width produced by the code under test.
    self.rarefaction_data_mat = {
        'SampleID': {
            'Sample1': {'test': {'ave': [' 7.000'], 'err': [' nan']}},
        },
    }
    self.rarefaction_legend_mat = {
        'test': {
            'samples': {
                'Sample1': {'color': '#ff0000',
                            'link': 'html_plots/testcol_0_row_0.png'},
            },
            'groups': {
                'SampleID': {
                    'Sample1': {'groupcolor': '#ff0000',
                                'groupsamples': ['Sample1']},
                },
            },
        },
    }
    self.exp_err_series_ave = {
        'M': [1.571915, 6.49885, 8.1750183333333339],
    }
def mean_alpha(alpha_dict, depth):
    """Mean collated alpha diversity data at a given depth.

    Input:
    alpha_dict: dictionary where the values are the lines of collated alpha
    diversity data files and the keys are the names of each of these files
    with no extension; this name is usually the metric used to compute the
    alpha diversity.
    depth: non-negative int, the depth at which to average the alpha
    diversity values. If None is passed, the highest depth found in each
    file is used.

    Output:
    metrics: list of metric names, formatted as '<key>_even_<depth>'
    sample_ids: list of sample identifiers represented
    data: a list of lists with the mean of alpha diversity data at the given
    depth for the different metrics; each column is a different metric.

    Raises:
    AssertionError: if alpha_dict is not a dictionary or depth is neither
    None nor a non-negative int.
    ValueError: if the requested depth is missing from a file, or if the
    files do not all contain the same sample ids in the same order.
    """
    assert isinstance(alpha_dict, dict), "Input data must be a dictionary"
    # the type is checked before the comparison so that a non-numeric depth
    # fails the assertion instead of erroring inside `>=`
    assert depth is None or (type(depth) == int and depth >= 0), (
        "The specified depth must be a positive integer.")

    metrics = []
    sample_ids = []
    data = []

    for key, value in alpha_dict.items():
        identifiers, _, _, rarefaction_data = parse_rarefaction(value)

        # if depth is specified as None use the highest available, retrieve
        # it on a per file basis so the value is known to exist for each file
        if depth is None:
            _depth = int(max(row[0] for row in rarefaction_data))
        else:
            _depth = depth
        metrics.append('{0}_even_{1}'.format(key, _depth))

        # keep the rows at the desired depth; the first two elements
        # ([depth, iteration]) of each row are not needed
        rows_at_depth = [row[2:] for row in rarefaction_data
                         if row[0] == _depth]

        # check there are elements with the desired rarefaction depth
        if not rows_at_depth:
            # sorted strings with the available rarefaction depths
            available_rarefaction_depths = [
                str(d) for d in sorted(set(row[0]
                                           for row in rarefaction_data))]
            raise ValueError(
                "The depth %d does not exist in the collated "
                "alpha diversity file for the metric: %s. The available depths "
                "are: %s." % (_depth, key,
                              ', '.join(available_rarefaction_depths)))

        # check all the files have the same sample ids in the same order
        if sample_ids:
            if not sample_ids == identifiers[3:]:
                raise ValueError(
                    "Non-matching sample ids were found in the "
                    "collated alpha diversity files. Make sure all the files "
                    "contain data for the same samples.")
        else:
            sample_ids = identifiers[3:]

        # mean over all the iterations found at the desired depth
        data.append(array(rows_at_depth).mean(axis=0))

    # transpose the data to match the formatting of non-collated alpha div
    # data
    data = array(data).T.tolist()

    return metrics, sample_ids, data
def main():
    """Build rarefaction plots from the files named on the command line.

    opts.input_dir is either a directory, in which case every entry not
    starting with '.' is parsed, or a comma-separated list of file paths.
    The parsed rarefaction data is handed to make_averages. If that returns
    html text, it is written to rarefaction_plots.html in the output
    directory.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_dir = opts.input_dir
    imagetype = opts.imagetype
    resolution = opts.resolution
    output_dir = opts.output_dir
    ymax = opts.ymax
    std_type = opts.std_type
    suppress_webpage = opts.suppress_html_output
    output_type = opts.output_type
    generate_per_sample_plots = opts.generate_per_sample_plots
    generate_average_tables = opts.generate_average_tables

    # Get the command-line options.
    prefs, data, background_color, label_color, ball_scale, arrow_colors = \
        sample_color_prefs_and_map_data_from_options(opts)

    # NOTE(review): option_parser.error presumably terminates the process
    # (as optparse does); the exit(0) calls after it are kept as a fallback.
    rares = {}
    if isdir(input_dir):
        rarenames = [r for r in listdir(input_dir) if not r.startswith('.')]
        for r in rarenames:
            try:
                # `with` closes the handle even if reading fails
                with open(path.join(input_dir, r), 'U') as rare_f:
                    rarefl = rare_f.readlines()
                rares[r] = parse_rarefaction(rarefl)
            except(IOError):
                option_parser.error('Problem with rarefaction file. %s' %
                                    exc_info()[1])
                exit(0)
    else:
        try:
            for input_fp in input_dir.split(','):
                # key the parsed data by file name, without its directory
                input_path = split(input_fp)[-1]
                with open(input_fp, 'U') as rare_f:
                    rarefl = rare_f.readlines()
                rares[input_path] = parse_rarefaction(rarefl)
        except(IOError):
            option_parser.error('Problem with rarefaction file. %s' %
                                exc_info()[1])
            exit(0)

    if imagetype not in ['png', 'svg', 'pdf']:
        option_parser.error('Supplied extension not supported.')
        exit(0)

    try:
        resolution = int(resolution)
    except(ValueError):
        option_parser.error('Invalid resolution.')
        exit(0)

    # output directory check
    if isinstance(output_dir, str) and output_dir != '.':
        if not exists(output_dir):
            try:
                create_dir(output_dir, False)
            except(ValueError):
                option_parser.error('Could not create output directory.')
                exit(0)
    else:
        output_dir = get_random_directory_name()

    # Generate the plots and html text
    html_output = make_averages(
        prefs, data, background_color, label_color, rares, output_dir,
        resolution, imagetype, ymax, suppress_webpage, std_type, output_type,
        generate_per_sample_plots=generate_per_sample_plots,
        generate_average_tables=generate_average_tables)

    if html_output:
        # Write the html file.
        html_fp = path.join(output_dir, 'rarefaction_plots.html')
        with open(html_fp, 'w') as outfile:
            outfile.write(html_output)
def compare_alpha_diversities(rarefaction_lines, mapping_lines, category,
                              depth, test_type='nonparametric',
                              num_permutations=999):
    """Compare alpha diversities between the values of a mapping category.

    inputs:
     rarefaction_lines - lines of a rarefaction file which gives scores for
                        various rarefactions and depths
     mapping_lines - lines of a mapping file that has ID's and the
                    categories that the ID's fall in
     category - the category to be compared, is a string
     depth - the depth of the rarefaction file to use, is an integer
     test_type - the type of t-test to perform, is a string. Must be either
                 'parametric' or 'nonparametric'
     num_permutations - the number of Monte Carlo permutations to use if
                        test_type is 'nonparametric', is an integer

    outputs:
     results - a nested dictionary which has the category as its only top
               level key. Its value is a dictionary keyed by
               (str(value1), str(value2)) for each pair of category values,
               holding the (obs_t, p_val) result of the t-test for that pair

    raises:
     ValueError - if num_permutations < 1 for a nonparametric test, or if
                  test_type is not recognised when a test has to be run
    """
    if test_type == 'nonparametric' and num_permutations < 1:
        raise ValueError("Invalid number of permutations: %d. Must be greater "
                         "than zero." % num_permutations)

    rarefaction_data = parse_rarefaction(rarefaction_lines)
    mapping_data = parse_mapping_file_to_dict(mapping_lines)[0]
    value_pairs = make_value_pairs_from_category(mapping_data, category)

    category_values_Ids = make_category_values_Id_dict(mapping_data, category)

    SampleId_pairs = map_category_value_pairs_to_Ids(value_pairs,
                                                     category_values_Ids)

    map_from_Id_to_col = make_SampleIds_rarefaction_columns_dict(
        rarefaction_data)

    reduced_rarefaction_mtx = extract_rarefaction_scores_at_depth(
        depth, rarefaction_data)

    results = {category: {}}
    # NOTE(review): SampleId_pairs is presumably in the same order as
    # value_pairs, since results are keyed by position - confirm.
    for idx, id_pair in enumerate(SampleId_pairs):
        # Must flatten the matrix because t_two_sample only operates on
        # non-nested sequences (otherwise we'll get the wrong degrees of
        # freedom).
        i = convert_SampleIds_to_rarefaction_mtx(
            id_pair[0], reduced_rarefaction_mtx,
            map_from_Id_to_col).flatten()
        j = convert_SampleIds_to_rarefaction_mtx(
            id_pair[1], reduced_rarefaction_mtx,
            map_from_Id_to_col).flatten()

        if test_type == 'parametric':
            obs_t, p_val = t_two_sample(i, j)
        elif test_type == 'nonparametric':
            obs_t, _, _, p_val = mc_t_two_sample(
                i, j, permutations=num_permutations)
            p_val = format_p_value_for_num_iters(p_val, num_permutations)
        else:
            raise ValueError("Invalid test type '%s'." % test_type)

        values = value_pairs[idx]
        results[category][(str(values[0]), str(values[1]))] = obs_t, p_val

    return results