def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        mapping_file: path of the mapping file (read via parse_mapping_file).
        dmatrix_file: path of the distance matrix file (read via
            parse_distmat).
        fields: non-empty sequence of mapping column names to group by.
        dir_prefix: forwarded unchanged to write_distance_files.
        subdir_prefix: forwarded to write_distance_files with '_single'
            (per-field results) or '_pairs' (field-pair results) appended.

    Returns:
        Tuple (single_field, paired_field, distance_matrix).  single_field
        maps each field name (with '#' removed) to the distances_by_groups
        result for that field; paired_field maps '<fieldA>_to_<fieldB>' to
        the result of grouping by both fields at once.

    Raises:
        ValueError: if no fields are given.
    """
    # Close both input handles as soon as parsing is finished rather than
    # leaving them open until garbage collection.
    with open(mapping_file, 'U') as mapping_f:
        mapping_rows, header, _comments = parse_mapping_file(mapping_f)
        # The grouping helpers receive one table: header row first, then the
        # data rows.
        mapping = [header]
        mapping.extend(mapping_rows)
    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    # `not fields` also rejects None / (), which previously slipped past the
    # `== []` comparison.
    if not fields:
        raise ValueError(
            'Since no fields were defined and the values within your fields '
            'are either all the same or all unique, a field was not chosen '
            'for analysis. Please define a field to analyse.')

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        single_field[field.replace('#', '')] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    paired_field_for_writing = defaultdict(dict)
    for i, fieldi in enumerate(fields):
        # Each unordered pair once, including a field paired with itself.
        for fieldj in fields[i:]:
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            pair_name = fieldi + '_to_' + fieldj
            paired_field[pair_name] = data
            # Previously keyed with the stale `field` variable left over from
            # the single-field loop, so every key ended in the last field and
            # earlier pairs were silently overwritten before writing.
            paired_field_for_writing[pair_name] = data

    write_distance_files(group_distance_dict=paired_field_for_writing,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
def test_group_by_fields(self):
    """group_by_fields should key sample ids by the requested column values"""
    header_row = ['#sample', 'loc', 'age', 'mal']
    sample_rows = [
        ['a', 'US', '5', 'n'],
        ['b', 'US', '10', 'n'],
        ['c', 'Mal', '5', 'y'],
        ['d', 'Mal', '10', 'n'],
        ['e', 'Mal', '5', 'y'],
    ]
    table = [header_row] + sample_rows

    # Keys follow the order of the requested fields: (age, loc).
    expected = {
        ('5', 'US'): ['a'],
        ('10', 'US'): ['b'],
        ('5', 'Mal'): ['c', 'e'],
        ('10', 'Mal'): ['d'],
    }
    observed = group_by_fields(table, ['age', 'loc'])
    self.assertEqual(observed, expected)
def test_group_by_fields(self):
    """group_by_fields should key sample ids by the requested column values"""
    header_row = ['#sample', 'loc', 'age', 'mal']
    sample_rows = [
        ['a', 'US', '5', 'n'],
        ['b', 'US', '10', 'n'],
        ['c', 'Mal', '5', 'y'],
        ['d', 'Mal', '10', 'n'],
        ['e', 'Mal', '5', 'y'],
    ]
    table = [header_row] + sample_rows

    # Keys follow the order of the requested fields: (age, loc).
    expected = {
        ('5', 'US'): ['a'],
        ('10', 'US'): ['b'],
        ('5', 'Mal'): ['c', 'e'],
        ('10', 'Mal'): ['d'],
    }
    observed = group_by_fields(table, ['age', 'loc'])
    self.assertEqual(observed, expected)
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        mapping_file: path of the mapping file (read via parse_mapping_file).
        dmatrix_file: path of the distance matrix file (read via
            parse_distmat).
        fields: non-empty sequence of mapping column names to group by.
        dir_prefix: forwarded unchanged to write_distance_files.
        subdir_prefix: forwarded to write_distance_files with '_single'
            (per-field results) or '_pairs' (field-pair results) appended.

    Returns:
        Tuple (single_field, paired_field, distance_matrix).  single_field
        maps each field name (with '#' removed) to the distances_by_groups
        result for that field; paired_field maps '<fieldA>_to_<fieldB>' to
        the result of grouping by both fields at once.

    Raises:
        ValueError: if no fields are given.
    """
    # Close both input handles as soon as parsing is finished rather than
    # leaving them open until garbage collection.
    with open(mapping_file, 'U') as mapping_f:
        mapping_rows, header, _comments = parse_mapping_file(mapping_f)
        # The grouping helpers receive one table: header row first, then the
        # data rows.
        mapping = [header]
        mapping.extend(mapping_rows)
    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    # `not fields` also rejects None / (), which previously slipped past the
    # `== []` comparison.
    if not fields:
        raise ValueError(
            'Since no fields were defined and the values within your fields '
            'are either all the same or all unique, a field was not chosen '
            'for analysis. Please define a field to analyse.')

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        single_field[field.replace('#', '')] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    paired_field_for_writing = defaultdict(dict)
    for i, fieldi in enumerate(fields):
        # Each unordered pair once, including a field paired with itself.
        for fieldj in fields[i:]:
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            pair_name = fieldi + '_to_' + fieldj
            paired_field[pair_name] = data
            # Previously keyed with the stale `field` variable left over from
            # the single-field loop, so every key ended in the last field and
            # earlier pairs were silently overwritten before writing.
            paired_field_for_writing[pair_name] = data

    write_distance_files(group_distance_dict=paired_field_for_writing,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    Arguments:
        mapping_file: path of the mapping file (read via parse_mapping_file).
        dmatrix_file: path of the distance matrix file (read via
            parse_distmat).
        fields: non-empty sequence of mapping column names to group by.
        dir_prefix: forwarded unchanged to write_distance_files.
        subdir_prefix: forwarded to write_distance_files with '_single'
            (per-field results) or '_pairs' (field-pair results) appended.

    Returns:
        Tuple (single_field, paired_field, distance_matrix).  single_field
        maps each field name (with '#' removed) to the distances_by_groups
        result for that field; paired_field maps '<fieldA>_to_<fieldB>' to
        the result of grouping by both fields at once.

    Raises:
        ValueError: if no fields are given.
    """
    # Close both input handles as soon as parsing is finished rather than
    # leaving them open until garbage collection.
    with open(mapping_file, 'U') as mapping_f:
        mapping_rows, header, _comments = parse_mapping_file(mapping_f)
        # The grouping helpers receive one table: header row first, then the
        # data rows.
        mapping = [header]
        mapping.extend(mapping_rows)
    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    # `not fields` also rejects None / (), which previously slipped past the
    # `== []` comparison.
    if not fields:
        # Call form of raise: valid on Python 2 and 3, unlike the former
        # `raise ValueError, '...'` statement.
        raise ValueError(
            'Since no fields were defined and the values within your fields '
            'are either all the same or all unique, a field was not chosen '
            'for analysis. Please define a field to analyse.')

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        single_field[field.replace('#', '')] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    paired_field_for_writing = defaultdict(dict)
    for i, fieldi in enumerate(fields):
        # Each unordered pair once, including a field paired with itself.
        for fieldj in fields[i:]:
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            pair_name = fieldi + '_to_' + fieldj
            paired_field[pair_name] = data
            # Previously keyed with the stale `field` variable left over from
            # the single-field loop, so every key ended in the last field and
            # earlier pairs were silently overwritten before writing.
            paired_field_for_writing[pair_name] = data

    write_distance_files(group_distance_dict=paired_field_for_writing,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    Arguments:
        mapping_file: path of the mapping file (read via parse_mapping_file).
        dmatrix_file: path of the distance matrix file (read via
            parse_distmat).
        fields: sequence of mapping column names to group by; None selects
            the first column of the mapping header.
        dir_prefix: forwarded unchanged to write_distance_files.
        subdir_prefix: forwarded to write_distance_files with '_single'
            (per-field results) or '_pairs' (field-pair results) appended.

    Returns:
        Tuple (single_field, paired_field, distance_matrix).  single_field
        maps each field name (with '#' removed) to the distances_by_groups
        result for that field; paired_field maps '<fieldA>_to_<fieldB>' to
        the result of grouping by both fields at once.
    """
    # Close both input handles as soon as parsing is finished rather than
    # leaving them open until garbage collection.
    with open(mapping_file, 'U') as mapping_f:
        mapping_rows, header, _comments = parse_mapping_file(mapping_f)
        # The grouping helpers receive one table: header row first, then the
        # data rows.
        mapping = [header]
        mapping.extend(mapping_rows)
    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    if fields is None:
        # Fall back to the first header column.
        fields = [mapping[0][0]]

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        single_field[field.replace('#', '')] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    for i, fieldi in enumerate(fields):
        # Each unordered pair once, including a field paired with itself.
        for fieldj in fields[i:]:
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            paired_field[fieldi + '_to_' + fieldj] = data

    write_distance_files(group_distance_dict=paired_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
def monte_carlo_group_distances(mapping_file, dmatrix_file, prefs,
                                dir_prefix='',
                                subdir_prefix='monte_carlo_group_distances',
                                default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        mapping_file: path of the mapping file (read via parse_mapping_file).
        dmatrix_file: path of the distance matrix file (read via
            parse_distmat).
        prefs: dict (or None) whose 'MONTE_CARLO_GROUP_DISTANCES' entry maps
            a field name to its iteration count; a name containing '&&' is
            split on it and grouped by all the named fields.  If the key is
            missing, prefs is rebuilt with build_monte_carlo_prefs.
        dir_prefix, subdir_prefix: joined to form the output directory, which
            is created with mkdir when it does not exist.
        default_iters: iteration count handed to build_monte_carlo_prefs.
        fields: fields handed to build_monte_carlo_prefs; None selects the
            first column of the mapping header.

    Output:
        One tab-separated file per field,
        <output dir>/group_distances_<field>.txt, with one row per pair of
        group-distance sets.  Returns None.
    """
    # Close both input handles as soon as parsing is finished rather than
    # leaving them open until garbage collection.
    with open(mapping_file, 'U') as mapping_f:
        mapping_rows, header, _comments = parse_mapping_file(mapping_f)
        # The grouping helpers receive one table: header row first, then the
        # data rows.
        mapping = [header]
        mapping.extend(mapping_rows)
    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    path_prefix = path.join(dir_prefix, subdir_prefix)
    # if dir doesn't exist, make directory
    if not path.isdir(path_prefix):
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]
    if prefs is None:
        prefs = {}
    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields, default_iters)

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)

        out_path = path.join(path_prefix, 'group_distances_' + field + '.txt')
        # The context manager guarantees each report is flushed and closed,
        # even if a statistic below raises; previously the handle was never
        # closed.
        with open(out_path, 'w') as outfile:
            outfile.write('\t'.join(['Category_1a', 'Category_1b', 'Avg',
                                     'Category_2a', 'Category_2b', 'Avg',
                                     't', 'p', 'p_greater', 'p_less',
                                     'Iterations\n']))
            real_dists = distances_by_groups(distance_header,
                                             distance_matrix, groups)

            # iterate over the groups
            for i, (first_g1, second_g1, distances_g1) in \
                    enumerate(real_dists[:-1]):
                real_dist_1 = average(distances_g1)

                # then for each other pair (not including same group)
                for first_g2, second_g2, distances_g2 in real_dists[i + 1:]:
                    real_dist_2 = average(distances_g2)

                    # permute distances just within these groups!
                    rand_dists_1, rand_dists_2 = permute_between_groups(
                        distances_g1, distances_g2, num_iters)

                    # t statistic of every permutation, built once as an
                    # array so both tail fractions reuse it.
                    ttests = array([
                        t_two_sample(rand_dists_1[n].flatten(),
                                     rand_dists_2[n].flatten())[0]
                        for n in range(num_iters)])
                    real_ttest = t_two_sample(distances_g1.flatten(),
                                              distances_g2.flatten())
                    real_t, real_p = real_ttest[0], real_ttest[1]

                    curr_line = [
                        first_g1, second_g1, real_dist_1,
                        first_g2, second_g2, real_dist_2,
                        real_t, real_p,
                        # fraction of permuted t values above / below real t
                        (ttests > real_t).sum() / float(num_iters),
                        (ttests < real_t).sum() / float(num_iters),
                        num_iters,
                    ]
                    outfile.write('\t'.join(map(str, curr_line)))
                    outfile.write('\n')
def monte_carlo_group_distances(mapping_file, dmatrix_file, prefs,
                                dir_prefix='',
                                subdir_prefix='monte_carlo_group_distances',
                                default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        mapping_file: path of the mapping file (read via parse_mapping_file).
        dmatrix_file: path of the distance matrix file (read via
            parse_distmat).
        prefs: dict (or None) whose 'MONTE_CARLO_GROUP_DISTANCES' entry maps
            a field name to its iteration count; a name containing '&&' is
            split on it and grouped by all the named fields.  If the key is
            missing, prefs is rebuilt with build_monte_carlo_prefs.
        dir_prefix, subdir_prefix: joined to form the output directory, which
            is created with mkdir when it does not exist.
        default_iters: iteration count handed to build_monte_carlo_prefs.
        fields: fields handed to build_monte_carlo_prefs; None selects the
            first column of the mapping header.

    Output:
        One tab-separated file per field,
        <output dir>/group_distances_<field>.txt, with one row per pair of
        group-distance sets.  Returns None.
    """
    # Close both input handles as soon as parsing is finished rather than
    # leaving them open until garbage collection.
    with open(mapping_file, 'U') as mapping_f:
        mapping_rows, header, _comments = parse_mapping_file(mapping_f)
        # The grouping helpers receive one table: header row first, then the
        # data rows.
        mapping = [header]
        mapping.extend(mapping_rows)
    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    path_prefix = path.join(dir_prefix, subdir_prefix)
    # if dir doesn't exist, make directory
    if not path.isdir(path_prefix):
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]
    if prefs is None:
        prefs = {}
    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields, default_iters)

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)

        out_path = path.join(path_prefix, 'group_distances_' + field + '.txt')
        # The context manager guarantees each report is flushed and closed,
        # even if a statistic below raises; previously the handle was never
        # closed.
        with open(out_path, 'w') as outfile:
            outfile.write('\t'.join(['Category_1a', 'Category_1b', 'Avg',
                                     'Category_2a', 'Category_2b', 'Avg',
                                     't', 'p', 'p_greater', 'p_less',
                                     'Iterations\n']))
            real_dists = distances_by_groups(distance_header,
                                             distance_matrix, groups)

            # iterate over the groups
            for i, (first_g1, second_g1, distances_g1) in \
                    enumerate(real_dists[:-1]):
                real_dist_1 = average(distances_g1)

                # then for each other pair (not including same group)
                for first_g2, second_g2, distances_g2 in real_dists[i + 1:]:
                    real_dist_2 = average(distances_g2)

                    # permute distances just within these groups!
                    rand_dists_1, rand_dists_2 = permute_between_groups(
                        distances_g1, distances_g2, num_iters)

                    # t statistic of every permutation, built once as an
                    # array so both tail fractions reuse it.
                    ttests = array([
                        t_two_sample(rand_dists_1[n].flatten(),
                                     rand_dists_2[n].flatten())[0]
                        for n in range(num_iters)])
                    real_ttest = t_two_sample(distances_g1.flatten(),
                                              distances_g2.flatten())
                    real_t, real_p = real_ttest[0], real_ttest[1]

                    curr_line = [
                        first_g1, second_g1, real_dist_1,
                        first_g2, second_g2, real_dist_2,
                        real_t, real_p,
                        # fraction of permuted t values above / below real t
                        (ttests > real_t).sum() / float(num_iters),
                        (ttests < real_t).sum() / float(num_iters),
                        num_iters,
                    ]
                    outfile.write('\t'.join(map(str, curr_line)))
                    outfile.write('\n')
def monte_carlo_group_distances(mapping_file, dmatrix_file, prefs,
                                dir_prefix='',
                                subdir_prefix='monte_carlo_group_distances',
                                default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    Arguments:
        mapping_file: path of the mapping file (read via parse_mapping_file).
        dmatrix_file: path of the distance matrix file (read via
            parse_distmat).
        prefs: dict (or None) whose 'MONTE_CARLO_GROUP_DISTANCES' entry maps
            a field name to its iteration count; a name containing '&&' is
            split on it and grouped by all the named fields.  If the key is
            missing, prefs is rebuilt with build_monte_carlo_prefs.
        dir_prefix, subdir_prefix: combined by _make_path into the output
            location, which is created with mkdir when it is not a directory.
        default_iters: iteration count handed to build_monte_carlo_prefs.
        fields: fields handed to build_monte_carlo_prefs; None selects the
            first column of the mapping header.

    Output:
        One tab-separated file per field, named
        <path_prefix>group_distances_<field>.xls, with one row per pair of
        group-distance sets.  Returns None.
    """
    # Close both input handles as soon as parsing is finished rather than
    # leaving them open until garbage collection.
    with open(mapping_file, 'U') as mapping_f:
        mapping_rows, header, _comments = parse_mapping_file(mapping_f)
        # The grouping helpers receive one table: header row first, then the
        # data rows.
        mapping = [header]
        mapping.extend(mapping_rows)
    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    path_prefix = _make_path([dir_prefix, subdir_prefix])
    # if dir doesn't exist, make directory
    if not path.isdir(path_prefix):
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]
    if prefs is None:
        prefs = {}
    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields, default_iters)

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)

        # path_prefix is concatenated directly, as before; presumably
        # _make_path returns it with a trailing separator -- TODO confirm.
        out_path = path_prefix + 'group_distances_' + field + '.xls'
        # The context manager guarantees each report is flushed and closed,
        # even if a statistic below raises; previously the handle was never
        # closed.
        with open(out_path, 'w') as outfile:
            outfile.write('\t'.join(['Category_1a', 'Category_1b', 'Avg',
                                     'Category_2a', 'Category_2b', 'Avg',
                                     't', 'p', 'p_greater', 'p_less',
                                     'Iterations\n']))
            real_dists = distances_by_groups(distance_header,
                                             distance_matrix, groups)
            # Group distances recomputed on num_iters permuted matrices.
            rand_distances = [
                distances_by_groups(distance_header,
                                    permute_for_monte_carlo(distance_matrix),
                                    groups)
                for _ in range(num_iters)]

            # iterate over the groups
            for i, (first_g1, second_g1, distances_g1) in \
                    enumerate(real_dists[:-1]):
                real_dist_1 = average(distances_g1)
                # last element of each entry holds the distances
                rand_dists_1 = [rand_distances[n][i][-1]
                                for n in range(num_iters)]

                # then for each other pair (not including same group)
                for j in range(i + 1, len(real_dists)):
                    first_g2, second_g2, distances_g2 = real_dists[j]
                    real_dist_2 = average(distances_g2)
                    rand_dists_2 = [rand_distances[n][j][-1]
                                    for n in range(num_iters)]

                    # t statistic of every permutation, built once as an
                    # array so both tail fractions reuse it.
                    ttests = array([
                        t_two_sample(rand_dists_1[n], rand_dists_2[n])[0]
                        for n in range(num_iters)])
                    real_ttest = t_two_sample(distances_g1, distances_g2)
                    real_t, real_p = real_ttest[0], real_ttest[1]

                    curr_line = [
                        first_g1, second_g1, real_dist_1,
                        first_g2, second_g2, real_dist_2,
                        real_t, real_p,
                        # fraction of permuted t values above / below real t
                        (ttests > real_t).sum() / float(num_iters),
                        (ttests < real_t).sum() / float(num_iters),
                        num_iters,
                    ]
                    outfile.write('\t'.join(map(str, curr_line)))
                    outfile.write('\n')