def test_group_by_field(self):
    """group_by_field should group table by fields"""
    t = [['#sample', 'loc', 'age'],
         ['a', 'US', '5'],
         ['b', 'US', '10'],
         ['c', 'Mal', '5'],
         ['d', 'Mal', '10'],
         ['e', 'Ven', '5']]
    self.assertEqual(group_by_field(t, 'loc'),
                     {'US': ['a', 'b'], 'Mal': ['c', 'd'], 'Ven': ['e']})
    self.assertEqual(group_by_field(t, 'age'),
                     {'5': ['a', 'c', 'e'], '10': ['b', 'd']})

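# The group_by_field function itself is not shown in this section. A minimal
# sketch that satisfies the test above might look like the following; this is
# an illustrative assumption, not the QIIME implementation. The first row of
# the table is treated as the header, the first column as the sample ID, and
# sample IDs are grouped by their value in the requested column.
def group_by_field_sketch(table, name):
    """Group sample IDs (first column) by the value in the named column."""
    header, rows = table[0], table[1:]
    col = header.index(name)
    groups = {}
    for row in rows:
        groups.setdefault(row[col], []).append(row[0])
    return groups
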
def _collate_cluster_pcoa_plot_data(coords_f, map_f, category):
    pc_data = parse_coords(coords_f)
    coords_d = dict(zip(pc_data[0], pc_data[1]))

    map_data = parse_mapping_file(map_f)
    full_map_data = [map_data[1]]
    full_map_data.extend(map_data[0])

    sid_map = group_by_field(full_map_data, category)
    sorted_states = sorted(sid_map.keys())

    color_pool = get_color_pool()
    if len(sorted_states) > len(color_pool):
        raise ValueError("Not enough colors to uniquely color sample "
                         "groups.")

    results = []
    for state, color in zip(sorted_states, color_pool[:len(sorted_states)]):
        sids = sid_map[state]
        xs = [coords_d[sid][0] for sid in sids]
        ys = [coords_d[sid][1] for sid in sids]
        results.append((xs, ys, color, state))

    return results

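# Each tuple returned above is (xs, ys, color, state). A hedged sketch of how
# the collated data might be rendered as a 2-D PCoA scatter plot; matplotlib
# is assumed here purely for illustration, since the calling plot code is not
# shown in this section, and the function name and output path are
# hypothetical.
import matplotlib.pyplot as plt

def _plot_cluster_pcoa_sketch(plot_data, out_path):
    fig, ax = plt.subplots()
    for xs, ys, color, state in plot_data:
        # One scatter series per field state, labeled for the legend.
        ax.scatter(xs, ys, color=color, label=state)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.legend()
    fig.savefig(out_path)
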
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.
    """
    distance_groups = {}
    mapping, header, comments = parse_mapping_file(open(mapping_file, 'U'))
    header = [header]
    header.extend(mapping)
    mapping = header

    distance_header, distance_matrix = \
        parse_distmat(open(dmatrix_file, 'U'))

    if fields == []:
        raise ValueError(
            'Since no fields were defined and the values within your fields '
            'are either all the same or all unique, a field was not chosen '
            'for analysis. Please define a field to analyse.')

    single_field = defaultdict(dict)
    for i in range(len(fields)):
        field = fields[i]
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        field_name = field.replace('#', '')
        single_field[field_name] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    paired_field_for_writing = defaultdict(dict)
    for i in range(len(fields)):
        for j in range(i, len(fields)):
            fieldi = fields[i]
            fieldj = fields[j]
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            # Key both dicts by the pair of field names being compared.
            paired_field[fieldi + '_to_' + fieldj] = data
            paired_field_for_writing[fieldi + '_to_' + fieldj] = data

    write_distance_files(group_distance_dict=paired_field_for_writing,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix

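# A hedged usage sketch of group_distances; the file names and output
# directory below are hypothetical. The grouped distance tables are written
# under dir_prefix (in subdirectories suffixed '_single' and '_pairs') and
# are also returned for further use.
single, paired, dmat = group_distances(mapping_file='map.txt',
                                       dmatrix_file='dmat.txt',
                                       fields=['Treatment'],
                                       dir_prefix='out_dir')
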
def get_grouped_distances(dist_matrix_header, dist_matrix, mapping_header,
                          mapping, field, within=True,
                          suppress_symmetry_and_hollowness_check=False):
    """Returns a list of distance groupings for the specified field.

    The return value is a list that contains tuples of three elements: the
    first two elements are the field values being compared, and the third
    element is a list of the distances.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        - dist_matrix_header: The distance matrix header, obtained from
          parse.parse_distmat()
        - dist_matrix: The distance matrix, obtained from
          parse.parse_distmat().
        - mapping_header: The mapping file header, obtained from
          parse.parse_mapping_file()
        - mapping: The mapping file's contents, obtained from
          parse.parse_mapping_file()
        - field: A field in the mapping file to do the grouping on.
        - within: If True, distances are grouped within a field value. If
          False, distances are grouped between field values.
        - suppress_symmetry_and_hollowness_check: By default, the input
          distance matrix will be checked for symmetry and hollowness. It is
          recommended to leave this check in place for safety, as the check
          is fairly fast. However, if you *know* you have a symmetric and
          hollow distance matrix, you can disable this check for small
          performance gains on extremely large distance matrices
    """
    _validate_input(dist_matrix_header, dist_matrix, mapping_header, mapping,
                    field)
    mapping_data = [mapping_header]
    mapping_data.extend(mapping)
    groups = group_by_field(mapping_data, field)
    return _get_groupings(dist_matrix_header, dist_matrix, groups, within,
                          suppress_symmetry_and_hollowness_check)

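# A hedged usage sketch of get_grouped_distances, assuming a mapping file and
# distance matrix like those built in the tests further below; the file names
# and variable names are hypothetical.
map_data, map_header, _ = parse_mapping_file(open('map.txt', 'U'))
dm_header, dm = parse_distmat(open('dmat.txt', 'U'))

# Distances between samples that share a Treatment value ...
within = get_grouped_distances(dm_header, dm, map_header, map_data,
                               'Treatment', within=True)
# ... and distances between samples with different Treatment values.
between = get_grouped_distances(dm_header, dm, map_header, map_data,
                                'Treatment', within=False)
for state_1, state_2, distances in within:
    print(state_1, state_2, len(distances))
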
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups."""
    distance_groups = {}
    mapping, header, comments = parse_mapping_file(open(mapping_file, 'U'))
    header = [header]
    header.extend(mapping)
    mapping = header

    distance_header, distance_matrix = \
        parse_distmat(open(dmatrix_file, 'U'))

    if fields == []:
        raise ValueError, ('Since no fields were defined and the values '
                           'within your fields are either all the same or '
                           'all unique, a field was not chosen for analysis. '
                           'Please define a field to analyse.')

    single_field = defaultdict(dict)
    for i in range(len(fields)):
        field = fields[i]
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        field_name = field.replace('#', '')
        single_field[field_name] = data
    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    paired_field_for_writing = defaultdict(dict)
    for i in range(len(fields)):
        for j in range(i, len(fields)):
            fieldi = fields[i]
            fieldj = fields[j]
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            # Key both dicts by the pair of field names being compared.
            paired_field[fieldi + '_to_' + fieldj] = data
            paired_field_for_writing[fieldi + '_to_' + fieldj] = data
    write_distance_files(group_distance_dict=paired_field_for_writing,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix

def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups."""
    distance_groups = {}
    mapping, header, comments = parse_mapping_file(open(mapping_file, 'U'))
    header = [header]
    header.extend(mapping)
    mapping = header

    distance_header, distance_matrix = \
        parse_distmat(open(dmatrix_file, 'U'))

    if fields is None:
        fields = [mapping[0][0]]

    single_field = defaultdict(dict)
    for i in range(len(fields)):
        field = fields[i]
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        field_name = field.replace('#', '')
        single_field[field_name] = data
    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    for i in range(len(fields)):
        for j in range(i, len(fields)):
            fieldi = fields[i]
            fieldj = fields[j]
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            paired_field[fieldi + '_to_' + fieldj] = data
    write_distance_files(group_distance_dict=paired_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix

def get_field_state_comparisons(dist_matrix_header, dist_matrix,
                                mapping_header, mapping, field,
                                comparison_field_states,
                                suppress_symmetry_and_hollowness_check=False):
    """Returns a 2D dictionary relating distances between field states.

    The 2D dictionary is constructed such that each top-level key is a field
    state other than the field states in comparison_field_states. The
    second-level key is a field state from comparison_field_states, and the
    value at the (key, key) index is a list of distances between those two
    field states. Thus, given a field, this function will create comparisons
    between the specified comparison_field_states and all other field states.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        - dist_matrix_header: The distance matrix header, obtained from
          parse.parse_distmat()
        - dist_matrix: The distance matrix, obtained from
          parse.parse_distmat().
        - mapping_header: The mapping file header, obtained from
          parse.parse_mapping_file()
        - mapping: The mapping file's contents, obtained from
          parse.parse_mapping_file()
        - field: A field in the mapping file to do the comparisons on.
        - comparison_field_states: A list of strings specifying the field
          states to compare to all other field states. Cannot be an empty
          list.
        - suppress_symmetry_and_hollowness_check: By default, the input
          distance matrix will be checked for symmetry and hollowness. It is
          recommended to leave this check in place for safety, as the check
          is fairly fast. However, if you *know* you have a symmetric and
          hollow distance matrix, you can disable this check for small
          performance gains on extremely large distance matrices
    """
    _validate_input(dist_matrix_header, dist_matrix, mapping_header, mapping,
                    field)

    # Make sure each comparison group field state is in the specified field.
    if not comparison_field_states:
        raise ValueError("You must provide at least one field state to "
                         "compare to all of the other field states.")
    mapping_data = [mapping_header]
    mapping_data.extend(mapping)
    groups = group_by_field(mapping_data, field)
    for field_state in comparison_field_states:
        if field_state not in groups:
            raise ValueError("The comparison group field state '%s' is not "
                             "in the provided mapping file's field '%s'."
                             % (field_state, field))

    # Grab a list of all other field states (besides the ones in
    # comparison_field_states). These will be the field states that the
    # states in comparison_field_states will be compared against.
    field_states = [group for group in groups.keys()
                    if group not in comparison_field_states]

    # Get between distance groupings for the field of interest.
    between_groupings = get_grouped_distances(
        dist_matrix_header, dist_matrix, mapping_header, mapping, field,
        within=False,
        suppress_symmetry_and_hollowness_check=suppress_symmetry_and_hollowness_check)

    # Build up our 2D dictionary giving the distances between a field state
    # and a comparison group field state by filtering out the
    # between_groupings list to include only the comparisons that we want.
    result = {}
    for field_state in field_states:
        result[field_state] = {}
        for comp_field_state in comparison_field_states:
            result[field_state][comp_field_state] = []
            for group in between_groupings:
                if ((group[0] == field_state or group[1] == field_state) and
                    (group[0] == comp_field_state or
                     group[1] == comp_field_state)):
                    # We've found a group of distances between our comparison
                    # field state and the current field state, so keep the
                    # data.
                    result[field_state][comp_field_state] = group[2]

    return result

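# A hedged usage sketch of get_field_state_comparisons: compare every other
# Treatment state against 'Control'. The file names are hypothetical and the
# data are assumed to match the overview-tutorial fixtures used in the tests
# below.
map_data, map_header, _ = parse_mapping_file(open('map.txt', 'U'))
dm_header, dm = parse_distmat(open('dmat.txt', 'U'))
comparisons = get_field_state_comparisons(dm_header, dm, map_header,
                                           map_data, 'Treatment', ['Control'])
# comparisons['Fast']['Control'] then holds the list of between-group
# distances for Fast vs. Control samples.
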
def make_pie_chart(data, dir_path, level, color_data, prefs,
                   background_color, label_color, file_prefix=None, props={},
                   y_len=6.5, dpi=80, generate_eps=False, generate_pdf=True,
                   others_key="All Other Categories",
                   others_color="#eeeeee", should_capitalize=True):
    """Write interactive piechart

    data: [fraction:label,...]
    trunc_len: truncates labels after this many chars
    """
    if not data:
        raise ValueError, "No data available for pie chart."

    all_fracs = []
    all_labels = []
    colors = []
    for key in prefs.keys():
        if prefs[key]['column'] != str(level):
            continue
        col_name = 'Taxon'
        mapping = [['Taxon']]
        mapping.extend([[m] for m in color_data[1]])
        if 'colors' in prefs[key]:
            if isinstance(prefs[key]['colors'], dict):
                # copy so we can mutate
                pref_colors = prefs[key]['colors'].copy()
            else:
                pref_colors = prefs[key]['colors'][:]
        else:
            pref_colors = {}
        labelname = prefs[key]['column']

        # Define groups and associate appropriate colors to each group
        groups = group_by_field(mapping, col_name)
        pref_colors, data_colors, data_color_order = \
            get_group_colors(groups, pref_colors)

    # set up labels and colors for pie chart
    for color_ix, (c_label, c_frac) in enumerate(data):
        # commented out the following line, since the key becomes invalid
        # when replacing part of the string.
        # c_label = c_label.replace("_", " ")
        # we also want to color others category same every time
        if c_label == others_key:
            colors.append(others_color)
        else:
            colors.append(data_colors[pref_colors[c_label]].toHex())
        all_fracs.append(c_frac)
        if should_capitalize:
            capital = "%s (%.2f%%)" % (c_label.capitalize(), (c_frac * 100.0))
            all_labels.append(capital)
        else:
            all_labels.append("%s (%.2f%%)" % (c_label, (c_frac * 100.0)))

    rc('font', size='10')
    rc('text', color=label_color)
    rc('patch', linewidth=.1)
    rc('axes', linewidth=.5, edgecolor=label_color)
    rc('text', usetex=False)

    fig = figure(randrange(10000), figsize=(2 * y_len, y_len))
    fp = FontProperties()
    fp.set_size('8')
    if len(data) > 30:
        loc = 4
    else:
        loc = 5
    mtitle = "Pie Chart"
    if "title" in props:
        mtitle = props["title"]
    axis('off')
    title(mtitle, fontsize='10', color=label_color)
    ax = axes([0.0, 0.0, .5, 1])
    p1 = pie(all_fracs, shadow=False, colors=colors)
    flg = figlegend(p1[0], labels=all_labels, loc=loc, borderpad=0.3,
                    labelspacing=0.3, prop=fp)
    flg.legendPatch.set_alpha(0.0)

    # write out
    if file_prefix is None:
        img_name = make_img_name()
    else:
        img_name = file_prefix
    img_abs = os.path.join(dir_path, 'pie_charts', img_name)
    savefig(img_abs, dpi=dpi, facecolor=background_color)
    eps_link = ""
    eps_abs = ""

    if generate_pdf:
        if file_prefix is None:
            eps_img_name = make_img_name(file_ext=".pdf")
        else:
            eps_img_name = file_prefix + ".pdf"
        savefig(os.path.join(dir_path, 'pie_charts', eps_img_name),
                facecolor=background_color)
        eps_abs = os.path.join('pie_charts', eps_img_name)
        eps_link = DOWNLOAD_LINK % (
            (os.path.join('pie_charts', eps_img_name)),
            IMG_SRC % (os.path.join('pie_charts', img_name)))

    if generate_eps:
        if file_prefix is None:
            eps_img_name = make_img_name(file_ext=".eps")
        else:
            eps_img_name = file_prefix + ".eps"
        savefig(os.path.join(dir_path, 'pie_charts', eps_img_name),
                facecolor=background_color)
        strip_eps_font(os.path.join(dir_path, 'pie_charts', eps_img_name))
        out = getoutput("gzip " + os.path.join(dir_path, 'pie_charts',
                                               eps_img_name))
        eps_abs = os.path.join(dir_path, 'pie_charts', eps_img_name) + ".gz"
        eps_link = DOWNLOAD_LINK % (
            (os.path.join('pie_charts', eps_img_name) + ".gz"),
            IMG_SRC % (os.path.join('pie_charts', img_name)))

    close(fig)
    clf()
    return eps_link, IMG_SRC_2 % (os.path.join('pie_charts', img_name))

def monte_carlo_group_distances(mapping_file, dmatrix_file, prefs,
                                dir_prefix='',
                                subdir_prefix='monte_carlo_group_distances',
                                default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.
    """
    mapping, header, comments = parse_mapping_file(open(mapping_file, 'U'))
    header = [header]
    header.extend(mapping)
    mapping = header

    distance_header, distance_matrix = \
        parse_distmat(open(dmatrix_file, 'U'))
    orig_distance_matrix = distance_matrix.copy()

    path_prefix = path.join(dir_prefix, subdir_prefix)
    # if dir doesn't exist, make directory
    if not path.isdir(path_prefix):
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]

    if prefs is None:
        prefs = {}

    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields, default_iters)

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)
        outfile = open(
            path.join(path_prefix, 'group_distances_' + field + '.txt'), 'w')
        outfile.write('\t'.join(['Category_1a', 'Category_1b', 'Avg',
                                 'Category_2a', 'Category_2b', 'Avg', 't',
                                 'p', 'p_greater', 'p_less', 'Iterations\n']))
        real_dists = distances_by_groups(distance_header, distance_matrix,
                                         groups)

        # iterate over the groups
        for i, (first_g1, second_g1, distances_g1) in \
                enumerate(real_dists[:-1]):
            real_dist_1 = average(distances_g1)
            # then for each other pair (not including same group)
            for j in range(i + 1, len(real_dists)):
                first_g2, second_g2, distances_g2 = real_dists[j]
                real_dist_2 = average(distances_g2)

                # permute distances just within these groups!
                rand_dists_1, rand_dists_2 = \
                    permute_between_groups(distances_g1, distances_g2,
                                           num_iters)

                ttests = [t_two_sample(rand_dists_1[n].flatten(),
                                       rand_dists_2[n].flatten())[0]
                          for n in range(num_iters)]
                real_ttest = t_two_sample(distances_g1.flatten(),
                                          distances_g2.flatten())
                curr_line = [first_g1, second_g1, real_dist_1,
                             first_g2, second_g2, real_dist_2]
                curr_line.extend([real_ttest[0], real_ttest[1],
                                  (array(ttests) > real_ttest[0]).sum() /
                                  float(num_iters),
                                  (array(ttests) < real_ttest[0]).sum() /
                                  float(num_iters),
                                  num_iters])
                outfile.write('\t'.join(map(str, curr_line)))
                outfile.write('\n')

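# permute_between_groups is not shown in this section. A minimal sketch of the
# permutation idea it appears to implement, stated as an assumption for
# illustration only: pool the two groups' distances and randomly re-split the
# pool into two groups of the original sizes, num_iters times.
from numpy import concatenate
from numpy.random import permutation

def permute_between_groups_sketch(dists_1, dists_2, num_iters):
    pooled = concatenate([dists_1.flatten(), dists_2.flatten()])
    n1 = dists_1.size
    rand_1, rand_2 = [], []
    for _ in range(num_iters):
        shuffled = permutation(pooled)
        rand_1.append(shuffled[:n1])
        rand_2.append(shuffled[n1:])
    return rand_1, rand_2
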
def get_category_value_to_sample_ids(mapping_lines, category):
    mapping_data, headers, _ = parse_mapping_file(mapping_lines)
    return group_by_field([headers] + mapping_data, category)

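# A hedged usage sketch: given mapping file lines like those used in the tests
# below, the helper returns a dict such as {'Control': [...], 'Fast': [...]}
# keyed by category value. The file name is hypothetical.
value_to_sids = get_category_value_to_sample_ids(open('map.txt', 'U'),
                                                 'Treatment')
for value, sample_ids in value_to_sids.items():
    print(value, len(sample_ids))
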
def setUp(self):
    """setup data function for DistanceHistogramsTests."""
    self.working_dir = '/tmp/distance_histogram_tests/'
    try:
        mkdir(self.working_dir)
    except OSError:  # already exists
        pass

    self.histogram_dir = path.join(self.working_dir, 'histograms')
    try:
        mkdir(self.histogram_dir)
    except OSError:  # already exists; remove it and make a new one
        pass

    # Create distance matrix file
    self.dmat_file = self.working_dir + 'dmat.txt'
    dmat_out = open(self.dmat_file, 'w')
    dmat_out.write(DISTANCE_MATRIX_STRING)
    dmat_out.close()

    self.distance_header, self.dmat = \
        parse_distmat(open(self.dmat_file, 'U'))

    # Create mapping file
    self.map_file = self.working_dir + 'map.txt'
    map_out = open(self.map_file, 'w')
    map_out.write(MAPPING_STRING)
    map_out.close()

    mapping, header, comments = parse_mapping_file(open(self.map_file, 'U'))
    header[0] = '#' + header[0]
    header = [header]
    header.extend(mapping)
    self.mapping = header

    # Create prefs file
    self.prefs_file = self.working_dir + 'prefs.txt'
    prefs_out = open(self.prefs_file, 'w')
    prefs_out.write(str(PREFS))
    prefs_out.close()

    # Build single field dict for 'Treatment' field.
    self.single_field_treatment = defaultdict(dict)
    self.treatment_groups = group_by_field(self.mapping, 'Treatment')
    self.single_field_treatment['Treatment'] = \
        distances_by_groups(self.distance_header, self.dmat,
                            self.treatment_groups)
    self.paired_field_treatment = {'Treatment_to_Treatment': [
        [('Control', 'Control'), ('Fast', 'Fast'),
         array([[0.729, 0.8, 0.721, 0.765],
                [0.776, 0.744, 0.749, 0.677],
                [0.734, 0.777, 0.733, 0.724],
                [0.696, 0.675, 0.654, 0.696],
                [0.731, 0.758, 0.738, 0.737]])],
        [('Control', 'Control'), ('Control', 'Control'),
         array([0.625, 0.623, 0.61, 0.577, 0.615, 0.642, 0.673, 0.682,
                0.737, 0.704])],
        [('Fast', 'Fast'), ('Fast', 'Fast'),
         array([0.718, 0.666, 0.727, 0.6, 0.578, 0.623])]]}

    self.distances_file = self.working_dir + 'distances_out.txt'
    dist_out = open(self.distances_file, 'w')
    dist_out.write(DISTANCES_OUT)
    dist_out.close()

def monte_carlo_group_distances(mapping_file, dmatrix_file, prefs,
                                dir_prefix='',
                                subdir_prefix='monte_carlo_group_distances',
                                default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values
    """
    mapping, header, comments = parse_mapping_file(open(mapping_file, 'U'))
    header = [header]
    header.extend(mapping)
    mapping = header

    distance_header, distance_matrix = \
        parse_distmat(open(dmatrix_file, 'U'))
    orig_distance_matrix = distance_matrix.copy()

    path_prefix = _make_path([dir_prefix, subdir_prefix])
    # if dir doesn't exist, make directory
    if not path.isdir(path_prefix):
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]

    if prefs is None:
        prefs = {}

    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields, default_iters)

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)
        outfile = open(path_prefix + 'group_distances_' + field + '.xls',
                       'w')
        outfile.write('\t'.join(['Category_1a', 'Category_1b', 'Avg',
                                 'Category_2a', 'Category_2b', 'Avg', 't',
                                 'p', 'p_greater', 'p_less', 'Iterations\n']))
        real_dists = distances_by_groups(distance_header, distance_matrix,
                                         groups)
        rand_distances = [distances_by_groups(
            distance_header, permute_for_monte_carlo(distance_matrix),
            groups) for i in range(num_iters)]

        # iterate over the groups
        for i, (first_g1, second_g1, distances_g1) in \
                enumerate(real_dists[:-1]):
            real_dist_1 = average(distances_g1)
            rand_dists_1 = [rand_distances[n][i][-1]
                            for n in range(num_iters)]
            # then for each other pair (not including same group)
            for j in range(i + 1, len(real_dists)):
                first_g2, second_g2, distances_g2 = real_dists[j]
                real_dist_2 = average(distances_g2)
                rand_dists_2 = [rand_distances[n][j][-1]
                                for n in range(num_iters)]
                ttests = [t_two_sample(rand_dists_1[n], rand_dists_2[n])[0]
                          for n in range(num_iters)]
                real_ttest = t_two_sample(distances_g1, distances_g2)
                curr_line = [first_g1, second_g1, real_dist_1,
                             first_g2, second_g2, real_dist_2]
                curr_line.extend([real_ttest[0], real_ttest[1],
                                  (array(ttests) > real_ttest[0]).sum() /
                                  float(num_iters),
                                  (array(ttests) < real_ttest[0]).sum() /
                                  float(num_iters),
                                  num_iters])
                outfile.write('\t'.join(map(str, curr_line)))
                outfile.write('\n')

def setUp(self):
    """Create some data to be used in the tests."""
    # Create the mapping file/distance matrix combo from the overview
    # tutorial.
    self.dist_matrix_string = [
        "\tPC.354\tPC.355\tPC.356\tPC.481\tPC.593"
        "\tPC.607\tPC.634\tPC.635\tPC.636",
        "PC.354\t0.0\t0.625\t0.623\t0.61\t0.577"
        "\t0.729\t0.8\t0.721\t0.765",
        "PC.355\t0.625\t0.0\t0.615\t0.642\t0.673"
        "\t0.776\t0.744\t0.749\t0.677",
        "PC.356\t0.623\t0.615\t0.0\t0.682\t0.737"
        "\t0.734\t0.777\t0.733\t0.724",
        "PC.481\t0.61\t0.642\t0.682\t0.0\t0.704"
        "\t0.696\t0.675\t0.654\t0.696",
        "PC.593\t0.577\t0.673\t0.737\t0.704\t0.0"
        "\t0.731\t0.758\t0.738\t0.737",
        "PC.607\t0.729\t0.776\t0.734\t0.696\t0.731"
        "\t0.0\t0.718\t0.666\t0.727",
        "PC.634\t0.8\t0.744\t0.777\t0.675\t0.758"
        "\t0.718\t0.0\t0.6\t0.578",
        "PC.635\t0.721\t0.749\t0.733\t0.654\t0.738"
        "\t0.666\t0.6\t0.0\t0.623",
        "PC.636\t0.765\t0.677\t0.724\t0.696\t0.737"
        "\t0.727\t0.578\t0.623\t0.0"]

    self.mapping_string = [
        "#SampleID\tBarcodeSequence\tTreatment\tDOB",
        "PC.354\tAGCACGAGCCTA\tControl\t20061218",
        "PC.355\tAACTCGTCGATG\tControl\t20061218",
        "PC.356\tACAGACCACTCA\tControl\t20061126",
        "PC.481\tACCAGCGACTAG\tControl\t20070314",
        "PC.593\tAGCAGCACTTGT\tControl\t20071210",
        "PC.607\tAACTGTGCGTAC\tFast\t20071112",
        "PC.634\tACAGAGTCGGCT\tFast\t20080116",
        "PC.635\tACCGCAGAGTCA\tFast\t20080116",
        "PC.636\tACGGTGAGTGTC\tFast\t20080116"]

    # Field to test on. Field values are either "Control" or "Fast".
    self.field = 'Treatment'

    # Create a tiny distance matrix/mapping file with a single sample for
    # additional testing.
    self.tiny_dist_matrix_string = ["\tSamp.1", "Samp.1\t0"]
    self.tiny_mapping_string = [
        "#SampleID\tBarcodeSequence\tSampleField",
        "Samp.1\tAGCACGAGCCTA\tSampleFieldState1"]
    self.tiny_field = 'SampleField'

    self.small_dist_matrix_string = ["\tSamp.1\tSamp.2", "Samp.1\t0\t0.5",
                                     "Samp.2\t0.5\t0"]
    self.small_mapping_string = [
        "#SampleID\tBarcodeSequence\tSampleField",
        "Samp.1\tAGCACGAGCCTA\tSampleFieldState1",
        "Samp.2\tAGCACGAGCCTG\tSampleFieldState2"]
    self.small_field = 'SampleField'

    # Parse mapping "files" (faked here).
    self.mapping, self.mapping_header, self.comments = \
        parse_mapping_file(self.mapping_string)
    mapping_data = [self.mapping_header]
    mapping_data.extend(self.mapping)
    self.groups = group_by_field(mapping_data, self.field)

    self.tiny_mapping, self.tiny_mapping_header, self.tiny_comments = \
        parse_mapping_file(self.tiny_mapping_string)
    tiny_mapping_data = [self.tiny_mapping_header]
    tiny_mapping_data.extend(self.tiny_mapping)
    self.tiny_groups = group_by_field(tiny_mapping_data, self.tiny_field)

    self.small_mapping, self.small_mapping_header, self.small_comments = \
        parse_mapping_file(self.small_mapping_string)
    small_mapping_data = [self.small_mapping_header]
    small_mapping_data.extend(self.small_mapping)
    self.small_groups = group_by_field(small_mapping_data, self.small_field)

    # Parse distance matrix "files" (faked here).
    self.dist_matrix_header, self.dist_matrix = \
        parse_distmat(self.dist_matrix_string)
    self.tiny_dist_matrix_header, self.tiny_dist_matrix = \
        parse_distmat(self.tiny_dist_matrix_string)
    self.small_dist_matrix_header, self.small_dist_matrix = \
        parse_distmat(self.small_dist_matrix_string)

    # extract_per_individual* input data
    self.individual_states_and_responses_map_f1 = \
        parse_mapping_file_to_dict(
            individual_states_and_responses_map_f1.split('\n'))[0]
    self.individual_states_and_responses_map_f2 = \
        parse_mapping_file_to_dict(
            individual_states_and_responses_map_f2.split('\n'))[0]
    self.paired_difference_biom1 = \
        parse_biom_table(paired_difference_biom_f1.split('\n'))

def make_all_charts(data, dir_path, filename, num_categories, colorby, args,
                    color_data, prefs, background_color, label_color,
                    chart_type, generate_image_type, plot_width, plot_height,
                    bar_width, dpi, resize_nth_label, label_type,
                    include_html_legend, include_html_counts):
    """Generate interactive charts in one HTML file"""
    # iterate over the preferences and assign colors according to taxonomy
    img_data = []
    for label, f_name in data:
        raw_fpath = os.path.join(dir_path, 'raw_data',
                                 os.path.split(f_name)[-1])
        # move raw file to output directory
        shutil.copyfile(f_name, raw_fpath)

        f = color_data['counts'][f_name]
        level = max([len(t.split(';')) - 1 for t in f[1]])

        for key in prefs.keys():
            if prefs[key]['column'] != str(level):
                continue
            col_name = 'Taxon'
            mapping = [['Taxon']]
            mapping.extend([[m] for m in f[1]])
            if 'colors' in prefs[key]:
                if isinstance(prefs[key]['colors'], dict):
                    # copy so we can mutate
                    pref_colors = prefs[key]['colors'].copy()
                else:
                    pref_colors = prefs[key]['colors'][:]
            else:
                pref_colors = {}
            labelname = prefs[key]['column']

            # Define groups and associate appropriate colors to each group
            groups = group_by_field(mapping, col_name)
            pref_colors, data_colors, data_color_order = \
                get_group_colors(groups, pref_colors)

        updated_pref_colors = {}
        if chart_type == 'area' and len(f[0]) == 1:
            raise ValueError, ('When generating area charts, the number of '
                               'samples (or category values) must be '
                               'greater than 1. However, you can still '
                               'produce a pie chart or bar chart with only '
                               '1 sample (or category value), but you must '
                               'remove the area chart value from the input '
                               'arguments.')
        for key in pref_colors:
            updated_pref_colors[key.replace('"', '')] = pref_colors[key]

        for i, val in enumerate(f[1]):
            f[1][i] = val.replace('"', '')

        # parse the counts and continue processing
        img_data.extend(get_counts(label.strip(), colorby, num_categories,
                                   dir_path, level, f, prefs,
                                   updated_pref_colors, background_color,
                                   label_color, chart_type,
                                   generate_image_type, plot_width,
                                   plot_height, bar_width, dpi, raw_fpath,
                                   resize_nth_label, label_type,
                                   include_html_legend,
                                   include_html_counts))

    # generate html filepath
    outpath = os.path.join(dir_path, '%s_charts.html' % chart_type)
    out_table = ''.join(img_data)
    # write out html file
    write_html_file(out_table, outpath)
