def main():
    """Report pairwise body-site correlations of intra-individual variability.

    For the unweighted and then the weighted UniFrac distance matrices, the
    distances grouped by ``PersonalID`` are collected for every body site and
    ``correlated_variability`` is printed for each pair of body sites.
    The two metrics are separated in the output by a ``**`` line.
    """
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    # NOTE(review): ``mapping_fp`` is not assigned in this function; it is
    # presumably a module-level name -- confirm.
    with open(mapping_fp) as mapping_f:
        md, mh, _ = parse_mapping_file(mapping_f)

    body_sites = ['Gut', 'Tongue', 'Palm', 'Forehead']
    dm_dir = ("/Users/caporaso/analysis/student-microbiome-project/"
              "beta-diversity/")
    # (title printed before the results, distance matrix filename prefix)
    metrics = [("Unweighted UniFrac", "unweighted_unifrac_dm"),
               ("Weighted UniFrac", "weighted_unifrac_dm")]

    for metric_idx, (title, dm_prefix) in enumerate(metrics):
        if metric_idx:
            # Separator between consecutive metric sections.
            print("**")
        print(title)

        intraindividual_distances = []
        for b in body_sites:
            dm_fp = "%s%s.%s_ts_only.txt.gz" % (dm_dir, dm_prefix, b.lower())
            h, d = parse_distmat(qiime_open(dm_fp))
            intraindividual_distances.append(
                get_grouped_distances(h, d, mh, md, 'PersonalID'))

        # Every unordered pair of distinct body sites (j < i).
        for i in range(len(body_sites)):
            for j in range(i):
                r = correlated_variability(intraindividual_distances[i],
                                           intraindividual_distances[j])
                print("%s/%s (n=%d): rho:%1.3f, p=%f" %
                      (body_sites[i], body_sites[j], r[0], r[1][0], r[1][3]))
def main():
    """Run a Mantel test on every pair of input distance matrices.

    Writes a tab-separated table (preceded by the module-level ``comment``
    text) with one row per pair: both filepaths, the number of shared
    entries, and the formatted Mantel p-value. Pairs with fewer than two
    compatible samples get a "Too few samples" row instead.
    """
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    sample_id_map = None
    if opts.sample_id_map_fp:
        with open(opts.sample_id_map_fp, "U") as map_f:
            sample_id_map = dict(
                (k, v[0]) for k, v in fields_to_dict(map_f).items())

    input_dm_fps = opts.input_dms.split(',')
    num_iterations = opts.num_iterations

    # The context manager guarantees the results file is closed even if one
    # of the comparisons raises.
    with open(opts.output_fp, 'w') as output_f:
        output_f.write(comment)
        output_f.write('DM1\tDM2\tNumber of entries\tMantel p-value\n')
        for i, fp1 in enumerate(input_dm_fps):
            for fp2 in input_dm_fps[i + 1:]:
                (dm1_labels, dm1), (dm2_labels, dm2) = \
                    make_compatible_distance_matrices(
                        _parse_distmat_fp(fp1),
                        _parse_distmat_fp(fp2),
                        lookup=sample_id_map)
                if len(dm1_labels) < 2:
                    output_f.write('%s\t%s\t%d\tToo few samples\n' %
                                   (fp1, fp2, len(dm1_labels)))
                    continue
                p = mantel(dm1, dm2, n=num_iterations)
                p_str = format_p_value_for_num_iters(p, num_iterations)
                output_f.write('%s\t%s\t%d\t%s\n' %
                               (fp1, fp2, len(dm1_labels), p_str))


def _parse_distmat_fp(fp):
    """Parse the distance matrix at filepath ``fp`` and close the file."""
    with open(fp, 'U') as dm_f:
        return parse_distmat(dm_f)
def setUp(self):
    """Build the distance matrices and parameters shared by the tests."""
    # Raw tab-delimited matrices: dm1-dm3 share sample ids s1-s3, dm4 holds
    # the same values as dm3 but labeled z1-z3.
    self.dm1_str = ["\ts1\ts2\ts3",
                    "s1\t0\t0.5\t0.2",
                    "s2\t0.5\t0\t0.3",
                    "s3\t0.2\t0.3\t0"]
    self.dm2_str = ["\ts1\ts2\ts3",
                    "s1\t0\t0.8\t0.25",
                    "s2\t0.8\t0\t0.4",
                    "s3\t0.25\t0.4\t0"]
    self.dm3_str = ["\ts1\ts2\ts3",
                    "s1\t0\t0.1\t0.2",
                    "s2\t0.1\t0\t0.9",
                    "s3\t0.2\t0.9\t0"]
    self.dm4_str = ["\tz1\tz2\tz3",
                    "z1\t0\t0.1\t0.2",
                    "z2\t0.1\t0\t0.9",
                    "z3\t0.2\t0.9\t0"]

    # Parsed counterparts of the raw matrices above.
    self.dm1 = parse_distmat(self.dm1_str)
    self.dm2 = parse_distmat(self.dm2_str)
    self.dm3 = parse_distmat(self.dm3_str)
    self.dm4 = parse_distmat(self.dm4_str)
    self.distmats = [self.dm1, self.dm2, self.dm3]

    # Sample filepaths (these aren't created or modified, just used as
    # strings to be added to the results).
    self.fp1, self.fp2, self.fp3 = "foo.txt", "bar.txt", "baz.txt"
    self.fps = [self.fp1, self.fp2, self.fp3]

    # Some sample parameters to use for many of the tests.
    self.num_perms = 999
    self.comment = "# A sample comment.\n"
    self.alpha = 0.01
    self.tail_type = 'greater'
    self.sample_id_map = {
        'z1': 's1', 'z2': 's2', 'z3': 's3',
        's1': 's1', 's2': 's2', 's3': 's3',
    }
def main():
    """Write the compatible (intersected) versions of two distance matrices.

    Exactly two input and two output filepaths are required. Both inputs are
    parsed, passed through ``make_compatible_distance_matrices`` and the two
    resulting matrices are written to the output filepaths in order.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_dm_fps = opts.input_dms.split(',')
    output_dm_fps = opts.output_dms.split(',')
    if len(input_dm_fps) != 2 or len(output_dm_fps) != 2:
        option_parser.error("You must provide exactly two input and output "
                            "distance matrix filepaths.")

    # Parse each input once (the original parsed every file twice and threw
    # the first result away).
    parsed_dms = []
    for dm_fp in input_dm_fps:
        with open(dm_fp, 'U') as dm_f:
            parsed_dms.append(parse_distmat(dm_f))

    (dm1_labels, dm1), (dm2_labels, dm2) = make_compatible_distance_matrices(
        parsed_dms[0], parsed_dms[1])
    assert (dm1_labels == dm2_labels), "The order of sample IDs is not the " +\
        "same for the two matrices."

    results = [(dm1_labels, dm1), (dm2_labels, dm2)]
    for out_fp, (labels, dm) in zip(output_dm_fps, results):
        with open(out_fp, 'w') as out_f:
            out_f.write(format_distance_matrix(labels, dm))
def test_filter_samples_from_distance_matrix(self):
    """filter_samples_from_distance_matrix functions as expected """
    # (sample ids to discard, expected result)
    cases = [
        (["GHI blah", "XYZ"], expected_dm1a),
        (["GHI", "DEF"], expected_dm1b),
    ]
    for to_discard, expected in cases:
        observed = filter_samples_from_distance_matrix(
            parse_distmat(self.input_dm1), to_discard)
        self.assertEqual(observed, expected)
def main():
    """Run the selected Mantel-style analysis and write results to output_dir.

    ``opts.method`` selects one of ``mantel``, ``partial_mantel`` or
    ``mantel_corr``. Each writes a text results file into ``opts.output_dir``;
    ``mantel_corr`` additionally saves one correlogram figure per result.
    Any other method is reported through ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except Exception:
        # ``Exception`` rather than a bare except so that KeyboardInterrupt /
        # SystemExit are not reported as a directory problem.
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        with open(opts.sample_id_map_fp, "U") as map_f:
            sample_id_map = dict(
                (k, v[0]) for k, v in fields_to_dict(map_f).items())

    input_dm_fps = opts.input_dms
    distmats = []
    for dm_fp in input_dm_fps:
        with open(dm_fp, 'U') as dm_f:
            distmats.append(parse_distmat(dm_f))

    if opts.method == 'mantel':
        out_fp = path.join(opts.output_dir, 'mantel_results.txt')
        with open(out_fp, 'w') as output_f:
            output_f.write(
                run_mantel_test('mantel', input_dm_fps, distmats,
                                opts.num_permutations, opts.tail_type,
                                comment_mantel_pmantel,
                                sample_id_map=sample_id_map))
    elif opts.method == 'partial_mantel':
        out_fp = path.join(opts.output_dir, 'partial_mantel_results.txt')
        with open(out_fp, 'w') as output_f:
            with open(opts.control_dm, 'U') as control_f:
                control_dm = parse_distmat(control_f)
            output_f.write(
                run_mantel_test('partial_mantel', input_dm_fps, distmats,
                                opts.num_permutations, opts.tail_type,
                                comment_mantel_pmantel,
                                control_dm_fp=opts.control_dm,
                                control_dm=control_dm,
                                sample_id_map=sample_id_map))
    elif opts.method == 'mantel_corr':
        out_fp = path.join(opts.output_dir,
                           'mantel_correlogram_results.txt')
        with open(out_fp, 'w') as output_f:
            result_str, correlogram_fps, correlograms = \
                run_mantel_correlogram(
                    input_dm_fps, distmats, opts.num_permutations,
                    comment_corr, opts.alpha, sample_id_map=sample_id_map,
                    variable_size_distance_classes=
                    opts.variable_size_distance_classes)
            output_f.write(result_str)
        for corr_fp, corr in zip(correlogram_fps, correlograms):
            corr.savefig(path.join(opts.output_dir,
                                   corr_fp + opts.image_type),
                         format=opts.image_type)
    else:
        # Previously this fell through to ``output_f.close()`` and died with
        # a NameError because no output file had been opened.
        option_parser.error("Unrecognized method '%s'." % opts.method)
def test_filter_samples_from_distance_matrix_negate(self):
    """filter_samples_from_distance_matrix functions w negate """
    # (sample ids passed with negate=True, expected result)
    cases = [
        (["ABC blah", "DEF"], expected_dm1a),
        (["ABC", "XYZ"], expected_dm1b),
    ]
    for listed_ids, expected in cases:
        observed = filter_samples_from_distance_matrix(
            parse_distmat(self.input_dm1), listed_ids, negate=True)
        self.assertEqual(observed, expected)
def main():
    """Run the selected Mantel-style analysis and write results to output_dir.

    ``opts.method`` selects one of ``mantel``, ``partial_mantel`` or
    ``mantel_corr``. Each writes a text results file into ``opts.output_dir``;
    ``mantel_corr`` additionally saves one correlogram figure per result.
    Any other method is reported through ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        if not path.exists(opts.output_dir):
            create_dir(opts.output_dir)
    except Exception:
        # ``Exception`` rather than a bare except so that KeyboardInterrupt /
        # SystemExit are not reported as a directory problem.
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        with open(opts.sample_id_map_fp, "U") as map_f:
            sample_id_map = dict(
                (k, v[0]) for k, v in fields_to_dict(map_f).items())

    input_dm_fps = opts.input_dms
    distmats = []
    for dm_fp in input_dm_fps:
        with open(dm_fp, "U") as dm_f:
            distmats.append(parse_distmat(dm_f))

    if opts.method == "mantel":
        out_fp = path.join(opts.output_dir, "mantel_results.txt")
        with open(out_fp, "w") as output_f:
            output_f.write(
                run_mantel_test(
                    "mantel",
                    input_dm_fps,
                    distmats,
                    opts.num_permutations,
                    opts.tail_type,
                    comment_mantel_pmantel,
                    sample_id_map=sample_id_map,
                )
            )
    elif opts.method == "partial_mantel":
        out_fp = path.join(opts.output_dir, "partial_mantel_results.txt")
        with open(out_fp, "w") as output_f:
            with open(opts.control_dm, "U") as control_f:
                control_dm = parse_distmat(control_f)
            output_f.write(
                run_mantel_test(
                    "partial_mantel",
                    input_dm_fps,
                    distmats,
                    opts.num_permutations,
                    opts.tail_type,
                    comment_mantel_pmantel,
                    control_dm_fp=opts.control_dm,
                    control_dm=control_dm,
                    sample_id_map=sample_id_map,
                )
            )
    elif opts.method == "mantel_corr":
        out_fp = path.join(opts.output_dir, "mantel_correlogram_results.txt")
        with open(out_fp, "w") as output_f:
            result_str, correlogram_fps, correlograms = \
                run_mantel_correlogram(
                    input_dm_fps, distmats, opts.num_permutations,
                    comment_corr, opts.alpha, sample_id_map=sample_id_map
                )
            output_f.write(result_str)
        for corr_fp, corr in zip(correlogram_fps, correlograms):
            corr.savefig(path.join(opts.output_dir,
                                   corr_fp + opts.image_type),
                         format=opts.image_type)
    else:
        # Previously this fell through to ``output_f.close()`` and died with
        # a NameError because no output file had been opened.
        option_parser.error("Unrecognized method '%s'." % opts.method)
def main():
    """Fit a semivariogram to two distance matrices and save the plot.

    ``opts.binning`` (optional) has the form
    ``[increment1,top_limit1][increment2,top_limit2]...`` and is validated
    and converted to a list of ``[increment, top_limit]`` float pairs before
    being handed to ``fit_semivariogram``.

    Raises:
        ValueError: if the binning string is malformed or a non-final range
            is inconsistent (increment above its limit, negative value, or a
            limit below the previous range's limit).
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    ranges = _parse_binning(opts.binning)

    x_samples, x_distmtx = parse_distmat(open(opts.input_path_x, 'U'))
    y_samples, y_distmtx = parse_distmat(open(opts.input_path_y, 'U'))

    (x_val, y_val, x_fit, y_fit) = fit_semivariogram(
        x_distmtx, y_distmtx, opts.model, ranges)

    plot(x_val, y_val, 'o', color="white")
    plot(x_fit, y_fit, linewidth=2.0, color="blue")

    x_label = 'Distance (m)'
    y_label = 'Community Dissimilarity'
    fig_title = 'Semivariogram (%s)' % opts.model

    xlabel(x_label)
    ylabel(y_label)
    title(fig_title)

    savefig(opts.output_path)


def _parse_binning(binning):
    """Validate a binning string and return its [increment, limit] pairs."""
    if binning is None:
        return []

    # simple ranges format validation
    n_open = binning.count('[')
    if n_open != binning.count(']') or n_open != binning.count(','):
        raise ValueError(
            "The binning input has an error: '%s'; " % binning +
            "\nthe format should be "
            "[increment1,top_limit1][increment2,top_limit2]")

    # splitting in ranges, then removing the leading [ and the trailing ]
    rgn_txt = binning.split('][')
    rgn_txt[0] = rgn_txt[0][1:]
    rgn_txt[-1] = rgn_txt[-1][:-1]

    ranges = []
    prev_limit = 0
    last_idx = len(rgn_txt) - 1
    for i, r in enumerate(rgn_txt):
        values = [float(v) for v in r.split(',')]
        if len(values) != 2:
            raise ValueError("All ranges must have only 2 values: [%s]" % r)
        # As in the original, the consistency checks apply to every range
        # except the last one.
        if i != last_idx:
            if values[0] > values[1]:
                raise ValueError(
                    "The bin value can't be greater than the max value: "
                    "[%s]" % r)
            elif min(values) < 0:
                # The original compared the whole list with 0, which is
                # never true in Python 2, so negatives slipped through.
                raise ValueError("This value can not be negative: [%s]" % r)
            elif prev_limit > values[1]:
                raise ValueError(
                    "This value can not smaller than the previous one: "
                    "[%s]" % r)
            prev_limit = values[1]
        ranges.append(values)
    return ranges
def main():
    """Report the between/within cluster dissimilarity ratio.

    Distances are split by ``clust_qual_ratio`` into between-cluster and
    within-cluster lists for ``opts.category``. With ``opts.short`` only the
    ratio of their means is written; otherwise the ratio plus mean, standard
    deviation and count of each list. Output goes to ``opts.output_path`` or,
    when that is None, to stdout.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    with open(opts.input_path, 'U') as dm_f:
        dists = parse_distmat(dm_f)
    with open(opts.map, 'U') as map_f:
        map_data = parse_mapping_file_to_dict(map_f)

    diff_dists, same_dists = clust_qual_ratio(dists, map_data, opts.category)
    ratio = numpy.mean(diff_dists) / numpy.mean(same_dists)

    if opts.short:
        lines = [str(ratio)]
    else:
        lines = [
            "dissimilarity ratio between/within (large for clustered data):",
            str(ratio),
            "dissimilarities between clusters: mean, std, num:",
            '\t'.join(map(str, [numpy.mean(diff_dists),
                                numpy.std(diff_dists),
                                len(diff_dists)])),
            "dissimilarities within clusters: mean, std, num:",
            '\t'.join(map(str, [numpy.mean(same_dists),
                                numpy.std(same_dists),
                                len(same_dists)])),
        ]
    report = '\n'.join(lines) + '\n'

    # The output file is opened only once the report exists, so a failure
    # above no longer leaves a truncated file behind, and it is always closed.
    if opts.output_path is not None:
        with open(opts.output_path, 'w') as outf:
            outf.write(report)
    else:
        sys.stdout.write(report)
def test_get_adjacent_distances(self):
    """ extracting adjacent distances works as expected """
    dm_header, dm = parse_distmat(["\ts1\ts2\ts3",
                                   "s1\t0\t2\t4",
                                   "s2\t2\t0\t3.2",
                                   "s3\t4\t3.2\t0"])

    # error cases: fewer than 2 valid sample ids
    for too_few in ([], ["s1"], ["s0", "s1"], ["s1", "s4"]):
        self.assertRaises(ValueError, get_adjacent_distances,
                          dm_header, dm, too_few)

    # (requested sample ids, expected (distances, id pairs))
    expectations = [
        # one pair of valid distances
        (["s1", "s2"], ([2], [("s1", "s2")])),
        (["s1", "s1"], ([0], [("s1", "s1")])),
        (["s1", "s3"], ([4], [("s1", "s3")])),
        (["s2", "s3"], ([3.2], [("s2", "s3")])),
        # multiple valid distances
        (["s1", "s2", "s3"],
         ([2, 3.2], [("s1", "s2"), ("s2", "s3")])),
        (["s1", "s3", "s2", "s1"],
         ([4, 3.2, 2], [("s1", "s3"), ("s3", "s2"), ("s2", "s1")])),
        # mixed valid and invalid distances ignores invalid distances
        (["s1", "s3", "s4", "s5", "s6", "s2", "s1"],
         ([4, 3.2, 2], [("s1", "s3"), ("s3", "s2"), ("s2", "s1")])),
    ]
    for requested_ids, expected in expectations:
        self.assertEqual(
            get_adjacent_distances(dm_header, dm, requested_ids), expected)

    # strict=True results in missing sample ids raising an error
    self.assertRaises(ValueError, get_adjacent_distances, dm_header, dm,
                      ["s1", "s3", "s4", "s5", "s6", "s2", "s1"],
                      strict=True)
def main():
    """Filter a distance matrix down to the requested samples.

    The samples to keep come from, in priority order, an OTU table
    (``opts.otu_table_fp``), a sample id file (``opts.sample_id_fp``) or a
    mapping file plus valid states (``opts.mapping_fp`` and
    ``opts.valid_states``). With none of these, ``option_parser.error`` is
    called. The filtered matrix is written to
    ``opts.output_distance_matrix``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.otu_table_fp:
        otu_table = load_table(opts.otu_table_fp)
        samples_to_keep = otu_table.ids()
    elif opts.sample_id_fp:
        samples_to_keep = \
            get_seqs_to_keep_lookup_from_seq_id_file(
                open(opts.sample_id_fp, 'U'))
    elif opts.mapping_fp and opts.valid_states:
        try:
            samples_to_keep = sample_ids_from_metadata_description(
                open(opts.mapping_fp, 'U'), opts.valid_states)
        except ValueError as e:
            # str(e) instead of the deprecated ``e.message`` attribute.
            option_parser.error(str(e))
    else:
        option_parser.error('must pass either --sample_id_fp, -t, or -m and '
                            '-s')

    # note that negate gets a little weird here. The function we're calling
    # removes the specified samples from the distance matrix, but the other
    # QIIME filter scripts keep these samples specified. So, the interface of
    # this script is designed to keep the specified samples, and therefore
    # negate=True is passed to filter_samples_from_distance_matrix by default.
    with open(opts.input_distance_matrix, 'U') as input_f:
        d = filter_samples_from_distance_matrix(
            parse_distmat(input_f),
            samples_to_keep,
            negate=not opts.negate)

    # The output is opened last so that an argument or parsing error does not
    # leave an empty, truncated output file behind.
    with open(opts.output_distance_matrix, 'w') as output_f:
        output_f.write(d)
def nmds(file, dimensions=2):
    """Run NMDS on a distance matrix and return the formatted coordinates.

    file: distance matrix input accepted by parse_distmat
    dimensions: number of dimensions requested from the NMDS run
    """
    sample_ids, dist_matrix = parse_distmat(file)
    solver = nmds_module.NMDS(dist_matrix,
                              verbosity=0,
                              dimension=dimensions)
    points = solver.getPoints()
    return format_nmds_coords(sample_ids, points, solver.getStress())
def main():
    """Report the between/within cluster dissimilarity ratio.

    Distances are split by ``clust_qual_ratio`` into between-cluster and
    within-cluster lists for ``opts.category``. With ``opts.short`` only the
    ratio of their means is written; otherwise the ratio plus mean, standard
    deviation and count of each list. Output goes to ``opts.output_path`` or,
    when that is None, to stdout.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    with open(opts.input_path, 'U') as dm_f:
        dists = parse_distmat(dm_f)
    with open(opts.map, 'U') as map_f:
        map_data = parse_mapping_file_to_dict(map_f)

    diff_dists, same_dists = clust_qual_ratio(dists, map_data, opts.category)
    ratio = numpy.mean(diff_dists) / numpy.mean(same_dists)

    if opts.short:
        lines = [str(ratio)]
    else:
        lines = [
            "dissimilarity ratio between/within (large for clustered data):",
            str(ratio),
            "dissimilarities between clusters: mean, std, num:",
            '\t'.join(
                map(str, [
                    numpy.mean(diff_dists),
                    numpy.std(diff_dists),
                    len(diff_dists)
                ])),
            "dissimilarities within clusters: mean, std, num:",
            '\t'.join(
                map(str, [
                    numpy.mean(same_dists),
                    numpy.std(same_dists),
                    len(same_dists)
                ])),
        ]
    report = '\n'.join(lines) + '\n'

    # The output file is opened only once the report exists, so a failure
    # above no longer leaves a truncated file behind, and it is always closed.
    if opts.output_path is not None:
        with open(opts.output_path, 'w') as outf:
            outf.write(report)
    else:
        sys.stdout.write(report)
def nmds(file, dimensions=2):
    """Run NMDS on a distance matrix and return the formatted coordinates.

    file: distance matrix input accepted by parse_distmat
    dimensions: number of dimensions requested from the NMDS run
    """
    sample_ids, dist_matrix = parse_distmat(file)
    solver = nmds_module.NMDS(dist_matrix,
                              verbosity=0,
                              dimension=dimensions)
    points = solver.getPoints()
    return format_nmds_coords(sample_ids, points, solver.getStress())
def main():
    """Filter a distance matrix down to the requested samples.

    The samples to keep come from, in priority order, an OTU table
    (``opts.otu_table_fp``), a sample id file (``opts.sample_id_fp``) or a
    mapping file plus valid states (``opts.mapping_fp`` and
    ``opts.valid_states``). With none of these, ``option_parser.error`` is
    called. The filtered matrix is written to
    ``opts.output_distance_matrix``.
    """
    option_parser, opts, args = \
        parse_command_line_parameters(**script_info)

    if opts.otu_table_fp:
        otu_table = parse_biom_table(open(opts.otu_table_fp, 'U'))
        samples_to_keep = otu_table.SampleIds
    elif opts.sample_id_fp:
        samples_to_keep = \
            get_seqs_to_keep_lookup_from_seq_id_file(
                open(opts.sample_id_fp, 'U'))
    elif opts.mapping_fp and opts.valid_states:
        samples_to_keep = sample_ids_from_metadata_description(
            open(opts.mapping_fp, 'U'), opts.valid_states)
    else:
        option_parser.error(
            'must pass either --sample_id_fp, -t, or -m and -s')

    # note that negate gets a little weird here. The function we're calling
    # removes the specified samples from the distance matrix, but the other
    # QIIME filter scripts keep these samples specified. So, the interface of
    # this script is designed to keep the specified samples, and therefore
    # negate=True is passed to filter_samples_from_distance_matrix by default.
    with open(opts.input_distance_matrix, 'U') as input_f:
        d = filter_samples_from_distance_matrix(
            parse_distmat(input_f),
            samples_to_keep,
            negate=not opts.negate)

    # The output is opened last so that an argument or parsing error does not
    # leave an empty, truncated output file behind.
    with open(opts.output_distance_matrix, 'w') as output_f:
        output_f.write(d)
def test_shuffle_dm(self):
    """Test shuffling labels of distance matrix."""
    exp_labels, exp_dm = parse_distmat(self.dm_f1)

    # Shuffle repeatedly: the data must never change, and the
    # assertIsPermutation check has to pass in at least one of the 20 runs.
    saw_permutation = False
    for _ in range(20):
        shuffled_lines = shuffle_dm(self.dm_f1).split('\n')
        obs_labels, obs_dm = parse_distmat(shuffled_lines)
        self.assertFloatEqual(obs_dm, exp_dm)

        try:
            self.assertIsPermutation(obs_labels, exp_labels)
        except AssertionError:
            continue
        saw_permutation = True

    self.assertTrue(saw_permutation)
def test_subset_dm(self):
    """Test picking a subset of a distance matrix."""
    exp = parse_distmat(self.dm_f1)
    exp_labels = exp[0]

    # Don't actually subset.
    full_lines = subset_dm(self.dm_f1, 3).split('\n')
    self.assertFloatEqual(parse_distmat(full_lines), exp)

    # A subset of one sample.
    obs_labels, obs_dm = parse_distmat(subset_dm(self.dm_f1, 1).split('\n'))
    self.assertEqual(len(obs_labels), 1)
    self.assertTrue(obs_labels[0] in exp_labels)

    # A subset of two samples.
    obs_labels, obs_dm = parse_distmat(subset_dm(self.dm_f1, 2).split('\n'))
    self.assertEqual(len(obs_labels), 2)
    self.assertTrue(obs_labels[0] in exp_labels)
    self.assertTrue(obs_labels[1] in exp_labels)

    # Asking for four samples must fail.
    self.assertRaises(ValueError, subset_dm, self.dm_f1, 4)
def test_subset_groups(self):
    """Test picking subsets of sample groups in distance matrix."""
    exp = parse_distmat(self.dm_f1)

    # Don't filter anything out.
    for max_group_size in (2, 3):
        subset_lines = subset_groups(
            self.dm_f1, self.map_f1, 'Category', max_group_size).split('\n')
        self.assertFloatEqual(parse_distmat(subset_lines), exp)

    # Pick groups of size 1.
    subset_lines = subset_groups(
        self.dm_f1, self.map_f1, 'Category', 1).split('\n')
    obs_labels, obs_dm = parse_distmat(subset_lines)
    self.assertTrue('S2' in obs_labels)

    # XOR: either S1 or S3 should be in obs_labels, but not both.
    has_s1 = 'S1' in obs_labels
    has_s3 = 'S3' in obs_labels
    self.assertTrue(has_s1 != has_s3)
def main():
    """Shuffle the labels of a distance matrix and write the result.

    The matrix at ``opts.input_distance_matrix`` is parsed, its labels are
    shuffled in place, and the shuffled labels are written together with the
    original (unshuffled) data to ``opts.output_distance_matrix``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Context managers close both files even if parsing or formatting fails.
    with open(opts.input_distance_matrix, 'U') as input_f:
        labels, dm_data = parse_distmat(input_f)

    shuffle(labels)

    with open(opts.output_distance_matrix, 'w') as output_f:
        output_f.write(format_distance_matrix(labels, dm_data))
def test_parse_distmat(self):
    """parse_distmat should read distmat correctly"""
    # Tab-delimited matrix text; split into lines before parsing.
    distmat_text = """\ta\tb\tc
a\t0\t1\t2
b\t1\t0\t3.5
c\t1\t3.5\t0
"""
    expected_labels = ['a', 'b', 'c']
    expected_data = array([[0, 1, 2],
                           [1, 0, 3.5],
                           [1, 3.5, 0]])
    observed = parse_distmat(distmat_text.splitlines())
    self.assertEqual(observed, (expected_labels, expected_data))
def test_parse_distmat(self):
    """parse_distmat should read distmat correctly"""
    # Tab-delimited matrix text; split into lines before parsing.
    distmat_text = """\ta\tb\tc
a\t0\t1\t2
b\t1\t0\t3.5
c\t1\t3.5\t0
"""
    expected_labels = ['a', 'b', 'c']
    expected_data = array([[0, 1, 2],
                           [1, 0, 3.5],
                           [1, 3.5, 0]])
    observed = parse_distmat(distmat_text.splitlines())
    self.assertEqual(observed, (expected_labels, expected_data))
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    mapping_file: path to the mapping file
    dmatrix_file: path to the distance matrix file
    fields: list of mapping fields to group by; must not be empty
    dir_prefix, subdir_prefix: passed on to write_distance_files; the
        single-field and paired-field results are written with the
        '_single' and '_pairs' suffixes appended to subdir_prefix

    Returns (single_field, paired_field, distance_matrix). single_field is
    keyed by field name with '#' removed, paired_field by
    '<fieldi>_to_<fieldj>'.

    Raises ValueError if fields is an empty list.
    """
    with open(mapping_file, 'U') as mapping_f:
        mapping, header, comments = parse_mapping_file(mapping_f)
    # The grouping helpers receive the header as the first mapping row.
    mapping_rows = [header]
    mapping_rows.extend(mapping)
    mapping = mapping_rows

    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    if fields == []:
        raise ValueError(
            'Since no fields were defined and the values within your fields are either all the same or all unique, a field was not chosen for analysis. Please define a field to analyse.'
        )

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        single_field[field.replace('#', '')] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    for i, fieldi in enumerate(fields):
        for fieldj in fields[i:]:
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            paired_field[fieldi + '_to_' + fieldj] = data

    # The original wrote a second dict keyed with the stale ``field`` variable
    # left over from the single-field loop (always the last field), so pair
    # results were mislabeled and overwrote each other. The correctly keyed
    # results are written instead.
    write_distance_files(group_distance_dict=paired_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
def pcoa(file):
    """Run principal coordinates analysis and return the formatted result.

    file: distance matrix input accepted by parse_distmat

    Axes are reordered by descending percent of the summed absolute
    eigenvalues before formatting.
    """
    sample_ids, dist_matrix = parse_distmat(file)

    # coords, each row is an axis
    coords, eigvals = ms.principal_coordinates_analysis(dist_matrix)

    abs_eigvals = numpy.abs(eigvals)
    pcnts = (abs_eigvals / sum(abs_eigvals)) * 100

    # Largest percentage first.
    order = pcnts.argsort()[::-1]
    return format_coords(sample_ids, coords[order].T, eigvals[order],
                         pcnts[order])
def isMatrix(fstr):
    """Return True if ``fstr`` parses as a distance matrix, else False.

    fstr: the candidate distance matrix as a single string; it is split into
        lines and handed to parse_distmat.

    Any exception raised while splitting or parsing is treated as "not a
    matrix". A parse whose first returned element is None also yields False.
    """
    try:
        labels = parse_distmat(fstr.splitlines())[0]
    except Exception:
        # ``Exception`` rather than a bare except so KeyboardInterrupt and
        # SystemExit still propagate.
        return False
    return labels is not None
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    mapping_file: path to the mapping file
    dmatrix_file: path to the distance matrix file
    fields: list of mapping fields to group by; must not be empty
    dir_prefix, subdir_prefix: passed on to write_distance_files; the
        single-field and paired-field results are written with the
        '_single' and '_pairs' suffixes appended to subdir_prefix

    Returns (single_field, paired_field, distance_matrix). single_field is
    keyed by field name with '#' removed, paired_field by
    '<fieldi>_to_<fieldj>'.

    Raises ValueError if fields is an empty list.
    """
    with open(mapping_file, 'U') as mapping_f:
        mapping, header, comments = parse_mapping_file(mapping_f)
    # The grouping helpers receive the header as the first mapping row.
    mapping_rows = [header]
    mapping_rows.extend(mapping)
    mapping = mapping_rows

    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    if fields == []:
        raise ValueError(
            'Since no fields were defined and the values within your fields are either all the same or all unique, a field was not chosen for analysis. Please define a field to analyse.')

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        single_field[field.replace('#', '')] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    for i, fieldi in enumerate(fields):
        for fieldj in fields[i:]:
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(
                distance_header, distance_matrix, groups)
            paired_field[fieldi + '_to_' + fieldj] = data

    # The original wrote a second dict keyed with the stale ``field`` variable
    # left over from the single-field loop (always the last field), so pair
    # results were mislabeled and overwrote each other. The correctly keyed
    # results are written instead.
    write_distance_files(group_distance_dict=paired_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
def test_get_adjacent_distances(self):
    """ extracting adjacent distances works as expected """
    dm_header, dm = parse_distmat(["\ts1\ts2\ts3",
                                   "s1\t0\t2\t4",
                                   "s2\t2\t0\t3.2",
                                   "s3\t4\t3.2\t0"])

    # error cases: fewer than 2 valid sample ids
    for too_few in ([], ['s1'], ['s0', 's1'], ['s1', 's4']):
        self.assertRaises(ValueError, get_adjacent_distances,
                          dm_header, dm, too_few)

    # (requested sample ids, expected (distances, id pairs))
    expectations = [
        # one pair of valid distances
        (['s1', 's2'], ([2], [('s1', 's2')])),
        (['s1', 's1'], ([0], [('s1', 's1')])),
        (['s1', 's3'], ([4], [('s1', 's3')])),
        (['s2', 's3'], ([3.2], [('s2', 's3')])),
        # multiple valid distances
        (['s1', 's2', 's3'],
         ([2, 3.2], [('s1', 's2'), ('s2', 's3')])),
        (['s1', 's3', 's2', 's1'],
         ([4, 3.2, 2], [('s1', 's3'), ('s3', 's2'), ('s2', 's1')])),
        # mixed valid and invalid distances ignores invalid distances
        (['s1', 's3', 's4', 's5', 's6', 's2', 's1'],
         ([4, 3.2, 2], [('s1', 's3'), ('s3', 's2'), ('s2', 's1')])),
    ]
    for requested_ids, expected in expectations:
        self.assertEqual(
            get_adjacent_distances(dm_header, dm, requested_ids), expected)

    # strict=True results in missing sample ids raising an error
    self.assertRaises(ValueError, get_adjacent_distances, dm_header, dm,
                      ['s1', 's3', 's4', 's5', 's6', 's2', 's1'],
                      strict=True)
def main():
    """Print a per-subject percentage table computed by ratios_for_category.

    The distance matrix (``opts.matrix_fp``) and mapping file
    (``opts.mapping_fp``) are parsed and passed, with the subject header name
    (``opts.subject``), to ``ratios_for_category``. For every key of the
    returned dict, ``value[0] / value[1]`` is printed as a percentage.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    subject_header_name = opts.subject

    with open(opts.matrix_fp, 'U') as matrix_f:
        matrix_header, matrix_data = parse_distmat(matrix_f)
    with open(opts.mapping_fp, 'U') as mapping_f:
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)

    out_dict = ratios_for_category(matrix_header, matrix_data,
                                   mapping_headers, mapping_data,
                                   subject_header_name)

    print('Subject\tPercent')
    for key, value in out_dict.items():
        # float() guards against Python 2 integer division truncating the
        # ratio when both entries are ints; it is a no-op for floats.
        print('%s\t%f' % (key, (value[0] / float(value[1])) * 100))
def choose_gradient_subsets(dm_f, map_f, gradient, subset_sizes, num_subsets):
    """Randomly pick sample id subsets spread along a gradient.

    dm_f, map_f: distance matrix and mapping file inputs for the parsers
    gradient: mapping column whose values (converted to float) order the
        samples
    subset_sizes: sizes of the subsets to draw
    num_subsets: number of subsets drawn for each size

    Returns a list of sample id lists, num_subsets per entry of
    subset_sizes, each holding one randomly chosen sample per bin of the
    gradient-sorted samples.
    """
    metadata_by_sample, _ = parse_mapping_file_to_dict(map_f)
    dm_labels, dm_data = parse_distmat(dm_f)

    # Only keep the sample IDs that are in both the mapping file and distance
    # matrix.
    ordered = []
    for samp_id, metadata in metadata_by_sample.items():
        if samp_id in dm_labels:
            ordered.append((samp_id, float(metadata[gradient])))
    ordered.sort(key=lambda pair: pair[1])
    num_samples = len(ordered)

    subsets = []
    for subset_size in subset_sizes:
        # Adapted from http://stackoverflow.com/a/9873935
        # We add 1 to the number of samples we want because we want
        # subset_size intervals to choose from.
        num_edges = subset_size + 1
        bin_idxs = [int(ceil(i * num_samples / num_edges))
                    for i in range(num_edges)]
        last_bin = len(bin_idxs) - 2

        for _ in range(num_subsets):
            chosen = []
            for bin_num, (start, next_start) in enumerate(
                    zip(bin_idxs[:-1], bin_idxs[1:])):
                if bin_num == last_bin and next_start < num_samples:
                    # Last bin: its upper edge is itself a valid index, so
                    # include it.
                    end = next_start
                else:
                    # randint is inclusive on both sides; stop one short so
                    # neighbouring bins cannot yield the same sample.
                    end = next_start - 1
                chosen.append(ordered[randint(start, end)][0])

            assert len(chosen) == subset_size, \
                "%d != %d" % (len(chosen), subset_size)
            subsets.append(chosen)

    return subsets
def compare_treatment_dists(chosen_samples, category, mf, bt, m, tr):
    """Calculate avg between, within, and to-all distances for chosen_samples.

    Notes:
     chosen_samples holds sample ids that collectively have some number of
     different values under category in the mapping file (described in the
     original notes both as a list of ids and as a list of lists of ids --
     TODO confirm). These samples are grouped by that value and the
     groupings are compared. The between distance is all the pairwise
     distances between the groupings. The within distance is the distance
     between the samples in a single group. The to-all distance is the
     distance from the group to all other samples in the distmat.
    Inputs:
     chosen_samples - list of ids. e.g. [sam1,sam7,sam3,sam6,..]
     category - str, field in mf.
     mf - parsed mapping file, dict of sample_id:metadata.
     bt - biom table containing at least all samples contained in the mf.
     m - str, metric to use for beta diversity calculation.
     tr - tree object, containing at least all nodes in bt.
    Output:
     A list of marginals that are the treatments of the groups, i.e.
      ['HF','LF']
     bt_wi_m - a 2d upper triangular array that has the average distances
      between treatment groups (or in the case of the main diagonal, the
      average within treatment/group distance).
     bt_wi_se - the standard errors for bt_wi_m.
     ta_m_se - 2d array with number of treatments/groups rows, and 2 cols
      where the first col is the average distance between that treatment and
      all others and the second col is the se.
    """
    # make the sample-sample distance matrix, then parse it (list of strs)
    dm = single_object_beta(bt, m, tr)
    samples, data = parse_distmat(dm)

    tc = treatment_covering(chosen_samples, category, mf)
    output_marginals = tc.keys()
    num_treatments = len(output_marginals)

    # make 3 arrays for output, between-within means, between-within ses,
    # to-all means and ses,
    bt_wi_m = zeros((num_treatments, num_treatments))
    bt_wi_se = zeros((num_treatments, num_treatments))
    ta_m_se = zeros((num_treatments, 2))

    # calculate within and to-all
    for idx, treatment in enumerate(output_marginals):
        members = tc[treatment]
        ta_m_se[idx, 0], ta_m_se[idx, 1] = \
            treatment_dist(members, samples, data)
        bt_wi_m[idx, idx], bt_wi_se[idx, idx] = \
            within_treatment_dist(members, samples, data)

    # calculate between dists for every unordered pair of treatments
    for (t1_ind, t1), (t2_ind, t2) in combinations(
            enumerate(output_marginals), 2):
        bt_wi_m[t1_ind, t2_ind], bt_wi_se[t1_ind, t2_ind] = \
            between_treatments_dist(tc[t1], tc[t2], samples, data)

    return output_marginals, bt_wi_m, bt_wi_se, ta_m_se
def filter_samples_from_distance_matrix(dm, samples_to_discard, negate=False):
    """ Remove specified samples from distance matrix

        dm: (sample_ids, dm_data) tuple, as returned from
         qiime.parse.parse_distmat; or a file handle that can be passed
         to qiime.parse.parse_distmat
        samples_to_discard: iterable of strings; only the first
         whitespace-separated token of each entry is used as the sample id
        negate: if True, keep the listed samples and drop all others

        Returns a (new_sample_ids, new_dm_data) tuple where new_dm_data is a
         numpy array restricted to the kept rows and columns.
    """
    from numpy import array

    if hasattr(dm, 'read'):
        # File-like input: parse it directly. Attempting to tuple-unpack a
        # file handle would consume lines before the parser sees them.
        sample_ids, dm_data = parse_distmat(dm)
    else:
        try:
            sample_ids, dm_data = dm
        except ValueError:
            # input was provided as something parse_distmat can read
            # (e.g. a list of lines)
            sample_ids, dm_data = parse_distmat(dm)

    listed_ids = set(e.split()[0] for e in samples_to_discard)
    # A sample is kept when its membership in the listed ids matches negate.
    keep_flags = [(s in listed_ids) == bool(negate) for s in sample_ids]

    new_sample_ids = [s for s, keep in zip(sample_ids, keep_flags) if keep]

    # Filter rows, transpose, filter rows again (i.e. the columns), and
    # transpose back.
    kept_rows = [row for row, keep in zip(dm_data, keep_flags) if keep]
    kept_rows_t = array(kept_rows).transpose()
    kept_cols = [col for col, keep in zip(kept_rows_t, keep_flags) if keep]
    new_dm_data = array(kept_cols).transpose()

    return (new_sample_ids, new_dm_data)
def single_file_nj(input_file, output_file):
    """Build a neighbor-joining tree from a distance matrix file.

    input_file: path to the distance matrix
    output_file: path the Newick tree (with distances) is written to
    """
    # read in dist matrix; the context manager closes the file even if
    # parsing fails
    with open(input_file, 'U') as f:
        headers, data = parse_distmat(f)

    # do nj: every ordered (row, col) label pair is included, so both
    # (a, b) and (b, a) are present
    distdict = {}
    for i, row_label in enumerate(headers):
        for j, col_label in enumerate(headers):
            distdict[(row_label, col_label)] = data[i, j]
    tree = nj(distdict)

    # write output
    with open(output_file, 'w') as f:
        f.write(tree.getNewick(with_distances=True))
def single_file_upgma(input_file, output_file):
    """Build a UPGMA tree from a distance matrix file.

    input_file: path to the distance matrix
    output_file: path the Newick tree (with distances) is written to
    """
    # read in dist matrix; the context manager closes the file even if
    # parsing fails
    with open(input_file, 'U') as f:
        headers, data = parse_distmat(f)

    # do upgma
    nodes = [PhyloNode(header) for header in headers]
    BIG = 1e305
    # Work on a copy with the diagonal set to BIG, the same value that is
    # passed to UPGMA_cluster.
    U = data.copy()
    for i in range(len(U)):
        U[i, i] = BIG
    c = UPGMA_cluster(U, nodes, BIG)

    # write output
    with open(output_file, 'w') as f:
        f.write(c.getNewick(with_distances=True))
def single_file_nj(input_file, output_file):
    """Build a neighbor-joining tree from a distance matrix file.

    input_file: path to the distance matrix
    output_file: path the Newick tree (with distances) is written to
    """
    # read in dist matrix; the context manager closes the file even if
    # parsing fails
    with open(input_file, 'U') as f:
        headers, data = parse_distmat(f)

    # do nj: every ordered (row, col) label pair is included, so both
    # (a, b) and (b, a) are present
    distdict = {}
    for i, row_label in enumerate(headers):
        for j, col_label in enumerate(headers):
            distdict[(row_label, col_label)] = data[i, j]
    tree = nj(distdict)

    # write output
    with open(output_file, 'w') as f:
        f.write(tree.getNewick(with_distances=True))
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    mapping_file: path to the mapping file
    dmatrix_file: path to the distance matrix file
    fields: list of mapping fields to group by; must not be empty
    dir_prefix, subdir_prefix: passed on to write_distance_files; the
        single-field and paired-field results are written with the
        '_single' and '_pairs' suffixes appended to subdir_prefix

    Returns (single_field, paired_field, distance_matrix). single_field is
    keyed by field name with '#' removed, paired_field by
    '<fieldi>_to_<fieldj>'.

    Raises ValueError if fields is an empty list.
    """
    with open(mapping_file, 'U') as mapping_f:
        mapping, header, comments = parse_mapping_file(mapping_f)
    # The grouping helpers receive the header as the first mapping row.
    mapping_rows = [header]
    mapping_rows.extend(mapping)
    mapping = mapping_rows

    with open(dmatrix_file, 'U') as dmatrix_f:
        distance_header, distance_matrix = parse_distmat(dmatrix_f)

    if fields == []:
        raise ValueError('Since no fields were defined and the values within your fields are either all the same or all unique, a field was not chosen for analysis. Please define a field to analyse.')

    single_field = defaultdict(dict)
    for field in fields:
        groups = group_by_field(mapping, field)
        data = distances_by_groups(distance_header, distance_matrix, groups)
        # Need to remove pound signs from field name.
        single_field[field.replace('#', '')] = data

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    paired_field = defaultdict(dict)
    for i, fieldi in enumerate(fields):
        for fieldj in fields[i:]:
            groups = group_by_fields(mapping, [fieldi, fieldj])
            data = distances_by_groups(distance_header, distance_matrix,
                                       groups)
            paired_field[fieldi + '_to_' + fieldj] = data

    # The original wrote a second dict keyed with the stale ``field`` variable
    # left over from the single-field loop (always the last field), so pair
    # results were mislabeled and overwrote each other. The correctly keyed
    # results are written instead.
    write_distance_files(group_distance_dict=paired_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
def subset_groups(dm_f, map_f, category, max_group_size):
    """Randomly cap the number of samples kept per category value.

    dm_f: distance matrix input handed to parse_distmat
    map_f: mapping file input handed to MetadataMap.parseMetadataMap
    category: mapping category whose values define the groups
    max_group_size: maximum number of samples kept for each category value

    Returns the result of filter_samples_from_distance_matrix restricted
    to the randomly chosen samples. The selection uses sample(), so it
    differs between calls.
    """
    dm_labels, dm_data = parse_distmat(dm_f)
    metadata_map = MetadataMap.parseMetadataMap(map_f)

    # built once so the membership test below does not rescan the label
    # list for every sample in the mapping file
    labels_in_dm = set(dm_labels)

    category_map = defaultdict(list)
    for samp_id in metadata_map.SampleIds:
        # Mapping files can have more samples than distance matrices, which
        # can happen in this case since we are dealing with rarefied OTU
        # tables (samples get dropped).
        if samp_id in labels_in_dm:
            category_val = metadata_map.getCategoryValue(samp_id, category)
            category_map[category_val].append(samp_id)

    samp_ids_to_keep = []
    for samp_ids in category_map.values():
        # groups smaller than the cap are kept whole
        samp_ids_to_keep.extend(
            sample(samp_ids, min(max_group_size, len(samp_ids))))

    # negate=True turns the "discard" list into a "keep" list
    return filter_samples_from_distance_matrix((dm_labels, dm_data),
                                               samp_ids_to_keep, negate=True)
def group_distances(mapping_file, dmatrix_file, fields, dir_prefix='',
                    subdir_prefix='group_distances'):
    """Calculate all lists of distance groups.

    Distances are grouped once per field and once per pair of fields
    (each field is also paired with itself). Both result sets are written
    with write_distance_files, under subdir_prefix + '_single' and
    subdir_prefix + '_pairs', and returned together with the parsed
    distance matrix. When fields is None, the first entry of the mapping
    header is used as the only field.
    """
    rows, header_row, comments = parse_mapping_file(open(mapping_file, 'U'))
    # the grouping helpers receive the header as the first row of the table
    mapping = [header_row]
    mapping.extend(rows)

    distance_header, distance_matrix = parse_distmat(open(dmatrix_file, 'U'))

    if fields is None:
        fields = [mapping[0][0]]

    # one entry per field, keyed by the field name without pound signs
    single_field = defaultdict(dict)
    for name in fields:
        by_value = group_by_field(mapping, name)
        single_field[name.replace('#', '')] = distances_by_groups(
            distance_header, distance_matrix, by_value)

    write_distance_files(group_distance_dict=single_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_single')

    # one entry per unordered pair of fields
    paired_field = defaultdict(dict)
    for start, first in enumerate(fields):
        for second in fields[start:]:
            by_values = group_by_fields(mapping, [first, second])
            pair_name = first + '_to_' + second
            paired_field[pair_name] = distances_by_groups(
                distance_header, distance_matrix, by_values)

    write_distance_files(group_distance_dict=paired_field,
                         dir_prefix=dir_prefix,
                         subdir_prefix=subdir_prefix + '_pairs')

    return single_field, paired_field, distance_matrix
def filter_samples_from_distance_matrix(dm, samples_to_discard, negate=False):
    """Remove specified samples from a distance matrix.

    dm: (sample_ids, dm_data) tuple, as returned from
        qiime.parse.parse_distmat; or a file handle (or other line source)
        that can be passed to qiime.parse.parse_distmat
    samples_to_discard: iterable of strings; only the first
        whitespace-delimited token of each entry is used as the sample id
    negate: when True, keep only the listed samples instead of discarding
        them

    Returns the filtered matrix as produced by format_distance_matrix.
    """
    # Only a real two-element tuple/list is unpacked directly. Trying to
    # unpack anything else first (as was done before) iterates it, which
    # silently eats the leading lines of a file handle before
    # parse_distmat gets to read it.
    if isinstance(dm, (tuple, list)) and len(dm) == 2:
        sample_ids, dm_data = dm
    else:
        # input was provided as a file handle
        sample_ids, dm_data = parse_distmat(dm)

    sample_lookup = set(e.split()[0] for e in samples_to_discard)

    if negate:
        def keep_sample(s):
            return s in sample_lookup
    else:
        def keep_sample(s):
            return s not in sample_lookup

    # first pass: drop the rows of unwanted samples
    temp_dm_data = []
    new_sample_ids = []
    for row, sample_id in zip(dm_data, sample_ids):
        if keep_sample(sample_id):
            temp_dm_data.append(row)
            new_sample_ids.append(sample_id)
    temp_dm_data = array(temp_dm_data).transpose()

    # second pass: after the transpose each "row" is an original column,
    # still in the order of the full sample_ids list
    new_dm_data = []
    for col, sample_id in zip(temp_dm_data, sample_ids):
        if keep_sample(sample_id):
            new_dm_data.append(col)
    new_dm_data = array(new_dm_data).transpose()

    return format_distance_matrix(new_sample_ids, new_dm_data)
def cogent_dist_to_qiime_dist(dist_tuple_dict):
    """Convert a {(k1, k2): distance} dict to a QIIME-style distance matrix.

    The input is a dict with tuple keys and distance values, such as is
    output by the getDistances() method of a PhyloNode object. The result
    is whatever parse_distmat returns for the tab-delimited rendering of
    the data: an ordered tuple with a list of samples in [0] and a numpy
    array of the distance matrix in [1].

    EDITED AND UPDATED 2013-07-09 Aaron Behr
    """
    names = []
    nested = {}
    for pair, distance in dist_tuple_dict.iteritems():
        first, second = pair[0], pair[1]
        # names and nested always gain a key together, so the dict lookup
        # answers "seen before?" exactly like a scan of names would
        if first not in nested:
            names.append(first)
            nested[first] = {first: 0.0}  # null self-distance
        nested[first][second] = distance
    names.sort()

    # Dict2D built straight from the dict of dicts (not via fromDicts());
    # the sorted names fix both the row and the column order.
    table = Dict2D(nested, names, names)
    # reflect so that the table is no longer sparse
    table.reflect(largest)

    # tab-delimited printable string of the table including headers; its
    # first character is dropped before parsing
    delimited = table.toDelimited()

    # generate and return Qiime distance matrix
    return parse_distmat(StringIO(delimited[1:]))
def single_file_upgma(input_file, output_file):
    """Cluster one distance matrix file with UPGMA and save the tree.

    input_file: path of a distance matrix file readable by parse_distmat
    output_file: path the Newick string (with distances) is written to

    Raises RuntimeError when UPGMA_cluster returns None instead of a tree.
    The output file is only created (and an existing one only
    overwritten) once a tree has been rendered.
    """
    # read in dist matrix; the handle is closed even if parsing fails
    with open(input_file, 'U') as dm_f:
        headers, data = parse_distmat(dm_f)

    # do upgma
    nodes = [PhyloNode(header) for header in headers]
    # BIG overwrites the diagonal of a copy of the matrix (the parsed data
    # is left untouched) and is also passed to UPGMA_cluster as its third
    # argument.
    BIG = 1e305
    U = data.copy()
    for i in range(len(U)):
        U[i, i] = BIG
    c = UPGMA_cluster(U, nodes, BIG)

    # Checked up front instead of catching the AttributeError that
    # None.getNewick would raise; any other AttributeError still
    # propagates unchanged.
    if c is None:
        raise RuntimeError("input file %s did not make a UPGMA tree. "
                           "Ensure it has more than one sample present"
                           % (str(input_file), ))
    newick = c.getNewick(with_distances=True)

    # write output
    with open(output_file, 'w') as out_f:
        out_f.write(newick)
def generate_data_make_html(dm_lines):
    """Build the dictionary with the plot info for a distance matrix.

    Inputs:
        dm_lines: distance matrix open file object

    Returns a dict holding everything needed to generate the html file:
        {
            LD_NAME: plot name,
            LD_HEADERS: {LD_HEADERS_VER: [...], LD_HEADERS_HOR: [...]},
            LD_MATRIX: the upper triangle of the matrix, as returned by
                get_upper_triangle,
            LD_TRANSFORM_VALUES: the value of generate_trans_values_dict
                for that upper triangle; per the original notes it has
                the form {(val1, val2): (plot_value, label)}, must have a
                key of form (None, None), and maps the continuous matrix
                values onto discrete values to plot,
            LD_TABLE_TITLE: table title
        }
    """
    labels, full_matrix = parse_distmat(dm_lines)

    # Distance matrices are symmetric, so only the upper triangle is kept
    upper = get_upper_triangle(full_matrix)

    # In this case the headers are symmetric: same labels on both axes
    axis_labels = {LD_HEADERS_HOR: labels, LD_HEADERS_VER: labels}

    return {
        LD_NAME: "Distance matrix",
        LD_HEADERS: axis_labels,
        LD_MATRIX: upper,
        LD_TRANSFORM_VALUES: generate_trans_values_dict(upper),
        LD_TABLE_TITLE: "Distance matrix",
    }
def main():
    """Write element-wise mean, median and stdev of a folder of matrices.

    Every non-hidden file in the input directory is parsed as a distance
    matrix; the statistics returned by matrix_stats are written to
    means.txt, medians.txt and stdevs.txt in the output directory, which
    is created when missing.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    indir = opts.input_dir
    outdir = opts.output_dir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # input: every entry of indir whose name does not start with a dot
    visible_names = [name for name in os.listdir(indir)
                     if not name.startswith('.')]
    distmats = []
    headers_list = []
    for name in visible_names:
        dm_f = open(os.path.join(indir, name), 'U')
        headers, data = parse_distmat(dm_f)
        dm_f.close()
        distmats.append(data)
        headers_list.append(headers)

    # calcs
    headers, means, medians, stdevs = matrix_stats(headers_list, distmats)

    # output: one formatted matrix per statistic, written in this order
    outputs = (('means.txt', means),
               ('medians.txt', medians),
               ('stdevs.txt', stdevs))
    for out_name, stat in outputs:
        out_f = open(os.path.join(outdir, out_name), 'w')
        out_f.write(format_distance_matrix(headers, stat))
        out_f.close()
def main():
    """Write element-wise mean, median and stdev of a folder of matrices.

    Every non-hidden file in the input directory is parsed as a distance
    matrix; the statistics returned by matrix_stats are written to
    means.txt, medians.txt and stdevs.txt in the output directory, which
    is created when missing.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    indir = opts.input_dir
    outdir = opts.output_dir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # input
    file_names = [fname for fname in os.listdir(indir)
                  if not fname.startswith('.')]
    distmats = []
    headers_list = []
    for fname in file_names:
        # "with" closes the handle even when a file fails to parse
        with open(os.path.join(indir, fname), 'U') as f:
            headers, data = parse_distmat(f)
        distmats.append(data)
        headers_list.append(headers)

    # calcs
    headers, means, medians, stdevs = matrix_stats(headers_list, distmats)

    # output: the text is formatted before the file is opened, so a
    # formatting failure does not leave an open or empty file behind
    outputs = (('means.txt', means),
               ('medians.txt', medians),
               ('stdevs.txt', stdevs))
    for out_name, stat in outputs:
        formatted = format_distance_matrix(headers, stat)
        with open(os.path.join(outdir, out_name), 'w') as f:
            f.write(formatted)
def single_file_upgma(input_file, output_file):
    """Cluster one distance matrix file with UPGMA and save the tree.

    input_file: path of a distance matrix file readable by parse_distmat
    output_file: path the Newick string (with distances) is written to

    Raises RuntimeError when UPGMA_cluster returns None instead of a tree.
    The output file is only created (and an existing one only
    overwritten) once a tree has been rendered.
    """
    # read in dist matrix; the handle is closed even if parsing fails
    with open(input_file, 'U') as dm_f:
        headers, data = parse_distmat(dm_f)

    # do upgma
    nodes = [PhyloNode(header) for header in headers]
    # BIG overwrites the diagonal of a copy of the matrix (the parsed data
    # is left untouched) and is also passed to UPGMA_cluster as its third
    # argument.
    BIG = 1e305
    U = data.copy()
    for i in range(len(U)):
        U[i, i] = BIG
    c = UPGMA_cluster(U, nodes, BIG)

    # Checked up front instead of catching the AttributeError that
    # None.getNewick would raise; any other AttributeError still
    # propagates unchanged.
    if c is None:
        raise RuntimeError("input file %s did not make a UPGMA tree. "
                           "Ensure it has more than one sample present"
                           % (str(input_file),))
    newick = c.getNewick(with_distances=True)

    # write output
    with open(output_file, 'w') as out_f:
        out_f.write(newick)
def _parse_binning(binning):
    """Parse a '[inc1,top1][inc2,top2]...' string into [inc, top] pairs.

    Returns an empty list when binning is None. Raises ValueError when the
    brackets/commas do not balance, when a range is not two numbers, or -
    for every range except the last one - when the increment exceeds the
    top limit, the increment is negative, or the top limit is smaller than
    the previous one.
    """
    if binning is None:
        return []

    # simple ranges format validation
    if binning.count('[') != binning.count(']') or \
            binning.count('[') != binning.count(','):
        raise ValueError(
            ("The binning input has an error: '%s'; " % binning) +
            "\nthe format should be "
            "[increment1,top_limit1][increment2,top_limit2]")

    # splitting in ranges
    rgn_txt = binning.split('][')
    # removing left [ and right ]
    rgn_txt[0] = rgn_txt[0][1:]
    rgn_txt[-1] = rgn_txt[-1][:-1]

    # converting into numbers
    ranges = []
    prev_top = 0
    for i, r in enumerate(rgn_txt):
        try:
            values = [float(v) for v in r.split(',')]
        except ValueError:
            raise ValueError("Not a valid format for binning %s" % binning)
        if len(values) != 2:
            raise ValueError("All ranges must have only 2 values: [%s]" % r)
        elif i + 1 != len(rgn_txt):
            # the last range is exempt from the ordering checks
            if values[0] > values[1]:
                raise ValueError(
                    "The bin value can't be greater than the max value: [%s]"
                    % r)
            elif values[0] < 0:
                # values[0] <= values[1] here, so this covers both numbers
                raise ValueError("This value can not be negative: [%s]" % r)
            elif prev_top > values[1]:
                raise ValueError(
                    "This value can not smaller than the previous one: [%s]"
                    % r)
            else:
                prev_top = values[1]
        ranges.append(values)
    return ranges


def main():
    """Fit and plot a semivariogram from two distance matrices.

    Reads the x and y distance matrices named in the options, optionally
    restricts both to their shared samples, fits the requested model
    (once overall, or once per value of a mapping category) and saves the
    figure to opts.output_path. When colouring by category a legend is
    exported as well.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    category = opts.category
    mapping_fp = opts.mapping_fp

    colors_used = []

    if (category and mapping_fp is None) or (category is None and mapping_fp):
        option_parser.error('If coloring by a metadata category, both the '
                            'category and the mapping file must be supplied.')
    elif mapping_fp and category:
        with open(mapping_fp, 'U') as map_f:
            mapping_data, mapping_headers, _ = parse_mapping_file(map_f)
        if category not in mapping_headers:
            option_parser.error("The category supplied must exist in the "
                                "metadata mapping file, '%s' does not exist."
                                % category)
        index = mapping_headers.index(category)
        categories = list(set([line[index] for line in mapping_data]))

    ranges = _parse_binning(opts.binning)

    with open(opts.input_path_x, 'U') as x_f:
        x_samples, x_distmtx = parse_distmat(x_f)
    with open(opts.input_path_y, 'U') as y_f:
        y_samples, y_distmtx = parse_distmat(y_f)

    if opts.ignore_missing_samples:
        ignoring_from_x = list(set(x_samples) - set(y_samples))
        ignoring_from_y = list(set(y_samples) - set(x_samples))

        if opts.verbose:
            print('\nFrom %s we are ignoring: %s\n' % (opts.input_path_x,
                                                       ignoring_from_x))
            print('\nFrom %s we are ignoring: %s\n' % (opts.input_path_y,
                                                       ignoring_from_y))
            print('\nOnly using: %s\n' % (
                list(set(x_samples) & set(y_samples))))

        # the filter returns formatted text, which is parsed back into
        # (labels, matrix)
        x_file = StringIO(
            filter_samples_from_distance_matrix((x_samples, x_distmtx),
                                                ignoring_from_x))
        x_samples, x_distmtx = parse_distmat(x_file)

        y_file = StringIO(
            filter_samples_from_distance_matrix((y_samples, y_distmtx),
                                                ignoring_from_y))
        y_samples, y_distmtx = parse_distmat(y_file)
    else:
        if x_distmtx.shape != y_distmtx.shape:
            raise ValueError(
                'The distance matrices have different sizes. ' +
                'You can cancel this error by passing --ignore_missing_samples')

    figure()

    if category is None:
        x_val, y_val, x_fit, y_fit, func_text = fit_semivariogram(
            (x_samples, x_distmtx), (y_samples, y_distmtx), opts.model,
            ranges)

        plot(x_val, y_val, color=opts.dot_color, marker=opts.dot_marker,
             linestyle="None", alpha=opts.dot_alpha)
        plot(x_fit, y_fit, linewidth=2.0, color=opts.line_color,
             alpha=opts.line_alpha)
    else:
        for index, single_category in enumerate(categories):
            with open(mapping_fp) as map_f:
                good_sample_ids = sample_ids_from_metadata_description(
                    map_f, '%s:%s' % (category, single_category))

            # negate=True keeps only the samples of this category value
            _y_samples, _y_distmtx = parse_distmat(
                StringIO(
                    filter_samples_from_distance_matrix(
                        (y_samples, y_distmtx), good_sample_ids,
                        negate=True)))
            _x_samples, _x_distmtx = parse_distmat(
                StringIO(
                    filter_samples_from_distance_matrix(
                        (x_samples, x_distmtx), good_sample_ids,
                        negate=True)))

            x_val, y_val, x_fit, y_fit, func_text = fit_semivariogram(
                (_x_samples, _x_distmtx), (_y_samples, _y_distmtx),
                opts.model, ranges)

            # retrieve one of the "QIIME" colors and add it to the list of
            # used colors for the creation of the legends in the plot
            color_only = get_qiime_hex_string_color(index)
            colors_used.append(color_only)

            plot(x_val, y_val, color=color_only, marker=opts.dot_marker,
                 linestyle="None", alpha=opts.dot_alpha)
            plot(x_fit, y_fit, linewidth=2.0, color=color_only,
                 alpha=opts.line_alpha, label=single_category)

    # an axis limit is only applied when both of its ends were given
    if opts.x_min is not None and opts.x_max is not None:
        xlim([opts.x_min, opts.x_max])
    if opts.y_min is not None and opts.y_max is not None:
        ylim([opts.y_min, opts.y_max])

    x_label = opts.x_label
    y_label = opts.y_label
    fig_title = '%s (%s)' % (opts.fig_title, opts.model)

    xlabel(x_label)
    ylabel(y_label)
    if opts.print_model:
        # with categories, func_text is the one of the last fitted category
        title(fig_title + ' ' + func_text)
    else:
        title(fig_title)

    savefig(opts.output_path)

    # print the legends after the figure is exported to avoid conflicts
    if category:
        # if there's a desired format, use that, else default it to png
        _, extension = splitext(opts.output_path)

        # remove the dot, else, make_legend will add it to the filename
        extension = extension.replace('.', '')

        if extension == '':
            extension = 'png'
        make_legend(categories, colors_used, 0, 0, 'black', 'white',
                    opts.output_path, extension, 80)
def main():
    """Plot the minimum pairwise distance against the number of samples.

    For every input distance matrix and every subsample size from 2 up to
    the number of samples, random subsamples are drawn opts.iterations
    times and the smallest distance between two of the drawn samples is
    recorded. The per-size mean and standard deviation are plotted as
    error bars (one series per input file) and saved to opts.output_path.
    With the verbose option the numbers are also appended to
    output_path + '.txt'.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_path = opts.input_path
    output_path = opts.output_path
    iterations = opts.iterations
    verbose = opts.verbose
    y_max = opts.y_max
    labels = opts.labels.split(',')

    results = {}
    for input_file in input_path:
        if verbose:
            print(input_file)

        # Reading the distance matrix
        with open(input_file, 'U') as dm_f:
            samples, distmat = parse_distmat(dm_f)
        possible_samples = range(len(distmat[0]))
        result_iteration = []
        for iteration in range(iterations):
            iter_vals = []
            for n in possible_samples:
                # a single sample has no pairwise distance
                if n < 1:
                    continue
                curr_samples = sample(possible_samples, n + 1)
                # every unordered pair of the drawn samples, once
                curr_vals = []
                for curr_i, i in enumerate(curr_samples):
                    for j in curr_samples[curr_i + 1:]:
                        curr_vals.append(distmat[i][j])
                iter_vals.append(min(curr_vals))
            result_iteration.append(iter_vals)
        results[input_file] = [mean(result_iteration, axis=0),
                               std(result_iteration, axis=0)]

        if verbose:
            with open(output_path + '.txt', 'a') as f:
                f.write('\t'.join(map(str, results[input_file][0])))
                f.write('\n')
                f.write('\t'.join(map(str, results[input_file][1])))
                f.write('\n')

    # generating plot, some parts taken from
    # http://stackoverflow.com/questions/4700614
    figure()
    ax = subplot(111)

    max_x, max_y = -inf, -inf
    for i, (label, input_file) in enumerate(zip(labels, input_path)):
        len_x = len(results[input_file][0])
        len_y = max(results[input_file][0])
        if max_x < len_x:
            max_x = len_x
        if max_y < len_y:
            max_y = len_y
        # alternate between two colors
        if i % 2 == 0:
            coloring = (215/255.0, 48/255.0, 39/255.0)
        else:
            coloring = (69/255.0, 177/255.0, 180/255.0)

        # NOTE(review): the k-th value (0-based) is the minimum for k + 2
        # samples but is drawn at x = k + 1 - confirm the offset is wanted.
        ax.errorbar(range(1, len_x + 1), results[input_file][0],
                    yerr=results[input_file][1], fmt='o', color=coloring,
                    label=label)

    # a user-supplied y_max wins; otherwise use the largest plotted mean
    # (these two branches used to be swapped)
    if y_max:
        axis([0, max_x, 0, y_max])
    else:
        axis([0, max_x, 0, max_y])

    # Shrink current axis by 20%
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

    # Put a legend to the right of the current axis
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    title(opts.title)
    xlabel('Samples')
    ylabel(opts.ylabel)
    grid(True)
    savefig(output_path)
def main():
    """Plot the minimum pairwise distance against the number of samples.

    For every input distance matrix and every subsample size from 2 up to
    the number of samples, random subsamples are drawn opts.iterations
    times and the smallest distance between two of the drawn samples is
    recorded (the slot for a single sample stays 0). The per-size mean
    and standard deviation are plotted as error bars (one series per
    input file) and saved to opts.output_path. With the verbose option
    the numbers are also appended to output_path + '.txt'.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    input_path = opts.input_path
    output_path = opts.output_path
    iterations = opts.iterations
    verbose = opts.verbose
    y_max = opts.y_max
    labels = opts.labels.split(',')

    results = {}
    for input_file in input_path:
        if verbose:
            print(input_file)

        # Reading the distance matrix
        with open(input_file, 'U') as dm_f:
            samples, distmat = parse_distmat(dm_f)
        possible_samples = range(len(distmat[0]))
        n_possible_samples = len(possible_samples)
        # one scratch mask, reset for every draw
        mask = np.ones(distmat.shape)
        result_iteration = np.zeros((iterations, n_possible_samples))

        for iter_idx in range(iterations):
            iter_vals = np.zeros(n_possible_samples)
            for slot, n in enumerate(possible_samples):
                # a single sample has no pairwise distance; its slot stays 0
                if n < 1:
                    continue
                curr_samples = sample(possible_samples, n + 1)

                # In a masked array 1 hides an entry and 0 keeps it. Only
                # entries whose row AND column were both drawn are kept.
                # Unmasking whole rows and then whole columns (as before)
                # also let distances to undrawn samples into the minimum.
                mask.fill(1)
                mask[np.ix_(curr_samples, curr_samples)] = 0
                # self-distances never count
                np.fill_diagonal(mask, 1)
                masked_array = np.ma.array(distmat, mask=mask)
                iter_vals[slot] = masked_array.min()
            result_iteration[iter_idx] = iter_vals

        results[input_file] = [mean(result_iteration, axis=0),
                               std(result_iteration, axis=0)]

        if verbose:
            with open(output_path + '.txt', 'a') as f:
                f.write('\t'.join(map(str, results[input_file][0])))
                f.write('\n')
                f.write('\t'.join(map(str, results[input_file][1])))
                f.write('\n')

    # generating plot, some parts taken from
    # http://stackoverflow.com/questions/4700614
    figure()
    ax = subplot(111)

    max_x, max_y = -inf, -inf
    for i, (label, input_file) in enumerate(zip(labels, input_path)):
        len_x = len(results[input_file][0])
        len_y = max(results[input_file][0])
        if max_x < len_x:
            max_x = len_x
        if max_y < len_y:
            max_y = len_y
        # alternate between two colors
        if i % 2 == 0:
            coloring = (215/255.0, 48/255.0, 39/255.0)
        else:
            coloring = (69/255.0, 177/255.0, 180/255.0)

        ax.errorbar(range(1, len_x + 1), results[input_file][0],
                    yerr=results[input_file][1], fmt='o', color=coloring,
                    label=label)

    # a user-supplied y_max wins; otherwise use the largest plotted mean
    if y_max:
        axis([0, max_x, 0, y_max])
    else:
        axis([0, max_x, 0, max_y])

    # Shrink current axis by 20%
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

    # Put a legend to the right of the current axis
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    title(opts.title)
    xlabel('Samples')
    ylabel(opts.ylabel)
    grid(True)
    savefig(output_path)
def monte_carlo_group_distances(mapping_file, dmatrix_file, prefs,
                                dir_prefix='',
                                subdir_prefix='monte_carlo_group_distances',
                                default_iters=10, fields=None):
    """Calculate Monte Carlo stats for specified group distances.

    Specifically:
    - find the groups for each specified col (or combination of cols)
    - do t test between each pair of groups
    - randomize matrix n times and find empirical value of t for each pair
    - compare the actual value of t to the randomized values

    One tab-delimited file, group_distances_<field>.txt, is written per
    field into dir_prefix/subdir_prefix. The fields and their iteration
    counts come from prefs['MONTE_CARLO_GROUP_DISTANCES']; when that key
    is missing (or prefs is None) they are built from fields and
    default_iters, and fields itself defaults to the first entry of the
    mapping header.

    WARNING: Only symmetric, hollow distance matrices may be used as
    input. Asymmetric distance matrices, such as those obtained by the
    UniFrac Gain metric (i.e. beta_diversity.py -m unifrac_g), should not
    be used as input.
    """
    with open(mapping_file, 'U') as map_f:
        mapping, header, comments = parse_mapping_file(map_f)
    # the grouping helpers receive the header as the first row of the table
    header = [header]
    header.extend(mapping)
    mapping = header

    with open(dmatrix_file, 'U') as dm_f:
        distance_header, distance_matrix = parse_distmat(dm_f)

    path_prefix = path.join(dir_prefix, subdir_prefix)
    # if dir doesn't exist
    if not path.isdir(path_prefix):
        # make directory
        mkdir(path_prefix)

    if fields is None:
        fields = [mapping[0][0]]

    if prefs is None:
        prefs = {}

    if 'MONTE_CARLO_GROUP_DISTANCES' not in prefs:
        prefs = build_monte_carlo_prefs(fields, default_iters)

    column_titles = '\t'.join(['Category_1a', 'Category_1b', 'Avg',
                               'Category_2a', 'Category_2b', 'Avg', 't', 'p',
                               'p_greater', 'p_less', 'Iterations\n'])

    for field, num_iters in prefs['MONTE_CARLO_GROUP_DISTANCES'].items():
        # 'a&&b' asks for the combination of columns a and b
        if '&&' in field:
            groups = group_by_fields(mapping, field.split('&&'))
        else:
            groups = group_by_field(mapping, field)

        out_fp = path.join(path_prefix, 'group_distances_' + field + '.txt')
        # "with" makes sure each result file is flushed and closed; it
        # used to be left open
        with open(out_fp, 'w') as outfile:
            outfile.write(column_titles)
            real_dists = distances_by_groups(distance_header,
                                             distance_matrix, groups)

            # iterate over the groups
            for i, (first_g1, second_g1, distances_g1) in \
                    enumerate(real_dists[:-1]):
                real_dist_1 = average(distances_g1)
                # then for each other pair (not including same group)
                for j in range(i + 1, len(real_dists)):
                    first_g2, second_g2, distances_g2 = real_dists[j]
                    real_dist_2 = average(distances_g2)

                    # permute distances just within these groups!
                    rand_dists_1, rand_dists_2 = \
                        permute_between_groups(distances_g1, distances_g2,
                                               num_iters)

                    ttests = array(
                        [t_two_sample(rand_dists_1[n].flatten(),
                                      rand_dists_2[n].flatten())[0]
                         for n in range(num_iters)])
                    real_ttest = t_two_sample(distances_g1.flatten(),
                                              distances_g2.flatten())

                    # fractions of the randomized t values above / below
                    # the observed one
                    p_greater = (ttests > real_ttest[0]).sum() / \
                        float(num_iters)
                    p_less = (ttests < real_ttest[0]).sum() / \
                        float(num_iters)

                    curr_line = [first_g1, second_g1, real_dist_1,
                                 first_g2, second_g2, real_dist_2,
                                 real_ttest[0], real_ttest[1],
                                 p_greater, p_less, num_iters]
                    outfile.write('\t'.join(map(str, curr_line)))
                    outfile.write('\n')
def main():
    """Draw distance histograms and run the Monte Carlo group statistics.

    Validates the distance matrix, mapping file, background color, prefs
    file and requested fields, groups the distances per field and field
    pair, then (unless HTML output is suppressed) draws the histograms,
    writes the HTML page and copies the javascript it needs. When
    opts.monte_carlo_iters is positive the Monte Carlo statistics are
    computed as well.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Some code for error checking of input args:

    # Check if distance_matrix_file is valid:
    try:
        with open(opts.distance_matrix_file, 'U') as dm_f:
            d_header, d_mat = parse_distmat(dm_f)
    except Exception:
        # "except Exception" rather than a bare except, so Ctrl-C and
        # SystemExit are not reported as a bad distance matrix
        option_parser.error(
            "This does not look like a valid distance matrix file. "
            "Please supply a valid distance matrix file using the -d option.")

    if not is_symmetric_and_hollow(d_mat):
        option_parser.error("The distance matrix must be symmetric and "
                            "hollow.")

    # Check if map_fname is valid:
    try:
        with open(opts.map_fname, 'U') as map_f:
            mapping, m_header, m_comments = parse_mapping_file(map_f)
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping file. "
            "Please supply a valid mapping file using the -m option.")

    # make sure background_color is valid
    if opts.background_color not in ['black', 'white']:
        option_parser.error(
            "'%s' is not a valid background color. Please pass in either "
            "'black' or 'white' using the -k option."
            % (opts.background_color))

    # make sure prefs file is valid if it exists
    prefs = None
    if opts.prefs_path is not None:
        try:
            with open(opts.prefs_path, 'U') as prefs_f:
                prefs_file = prefs_f.read()
        except IOError:
            option_parser.error(
                "Provided prefs file, '%s', does not exist. Please pass in "
                "a valid prefs file with the -p option."
                % (opts.prefs_path))
        prefs = parse_prefs_file(prefs_file)

    color_prefs, color_data, background_color, label_color, ball_scale,\
        arrow_colors = sample_color_prefs_and_map_data_from_options(opts)

    # list of labelname, groups, colors, data_colors, data_color_order
    groups_and_colors = list(iter_color_groups(mapping=color_data['map'],
                                               prefs=color_prefs))

    # dict mapping labelname to list of: [groups, colors, data_colors,
    # data_color_order]
    field_to_colors = {}
    for color_info in groups_and_colors:
        field_to_colors[color_info[0]] = color_info[1:]

    qiime_dir = get_qiime_project_dir() + '/qiime/support_files/'

    # fields come from the command line first, then from the prefs file,
    # and are otherwise picked from the mapping file
    fields = opts.fields
    if fields is not None:
        fields = map(strip, fields.split(','))
        fields = [i.strip('"').strip("'") for i in fields]
    elif prefs is not None:
        fields = prefs.get('FIELDS', None)
    else:
        fields = get_interesting_mapping_fields(mapping, m_header)

    # Check that all provided fields are valid:
    if fields is not None:
        for f in fields:
            if f not in m_header:
                option_parser.error(
                    "The field, %s, is not in the provided mapping file. "
                    "Please supply correct fields (using the -f option or "
                    "providing a 'FIELDS' list in the prefs file) "
                    "corresponding to fields in mapping file." % (f))

    within_distances, between_distances, dmat = \
        group_distances(mapping_file=opts.map_fname,
                        dmatrix_file=opts.distance_matrix_file,
                        fields=fields,
                        dir_prefix=get_random_directory_name(
                            output_dir=opts.dir_path,
                            prefix='distances'))

    if not opts.suppress_html_output:
        # histograms output path
        histograms_path = path.join(opts.dir_path, 'histograms')
        try:
            mkdir(histograms_path)
        except OSError:  # raised if dir exists
            pass

        # draw all histograms
        distances_dict, label_to_histogram_filename = \
            draw_all_histograms(single_field=within_distances,
                                paired_field=between_distances,
                                dmat=dmat,
                                histogram_dir=histograms_path,
                                field_to_color_prefs=field_to_colors,
                                background_color=background_color)

        # Get relative path to histogram files.
        label_to_histogram_filename_relative = \
            _make_relative_paths(label_to_histogram_filename, opts.dir_path)

        dm_fname = path.split(opts.distance_matrix_file)[-1]
        basename = path.splitext(dm_fname)[0]
        outfile_name = basename + '_distance_histograms.html'
        make_main_html(
            distances_dict=distances_dict,
            label_to_histogram_filename=label_to_histogram_filename_relative,
            root_outdir=opts.dir_path,
            outfile_name=outfile_name,
            title='Distance Histograms')

        # Handle saving web resources locally.
        # javascript file
        javascript_path = path.join(opts.dir_path, 'js')
        try:
            mkdir(javascript_path)
        except OSError:  # raised if dir exists
            pass
        # read the bundled script first so a missing source does not leave
        # an empty copy behind; both handles are closed by "with"
        with open(qiime_dir + 'js/histograms.js') as js_in:
            js_source = js_in.read()
        with open(javascript_path + '/histograms.js', 'w') as js_out:
            js_out.write(js_source)

    monte_carlo_iters = opts.monte_carlo_iters
    if monte_carlo_iters > 0:
        # Do Monte Carlo for all fields
        monte_carlo_group_distances(mapping_file=opts.map_fname,
                                    dmatrix_file=opts.distance_matrix_file,
                                    prefs=prefs,
                                    dir_prefix=opts.dir_path,
                                    fields=fields,
                                    default_iters=monte_carlo_iters)

        # Do Monte Carlo for within and between fields
        monte_carlo_group_distances_within_between(
            single_field=within_distances,
            paired_field=between_distances, dmat=dmat,
            dir_prefix=opts.dir_path,
            num_iters=monte_carlo_iters)
def main():
    """Plot distances between field states and the comparison groups.

    Validates the inputs, collects the distances between every field
    state and each comparison field state, draws the requested
    comparative plot into opts.output_dir and, depending on the options,
    also writes the pairwise significance tests and the raw plot data.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except Exception:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file.
    try:
        with open(opts.distance_matrix_fp, 'U') as dm_f:
            dist_matrix_header, dist_matrix = parse_distmat(dm_f)
    except Exception:
        option_parser.error(
            "This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option.")

    try:
        with open(opts.mapping_fp, 'U') as map_f:
            mapping, mapping_header, mapping_comments = \
                parse_mapping_file(map_f)
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option.")

    # Make sure the y_min and y_max options make sense, as they can be
    # either 'auto' or a number.
    y_min = opts.y_min
    y_max = opts.y_max
    try:
        y_min = float(y_min)
    except ValueError:
        if y_min == 'auto':
            y_min = None
        else:
            option_parser.error("The --y_min option must be either a number "
                                "or 'auto'.")
    try:
        y_max = float(y_max)
    except ValueError:
        if y_max == 'auto':
            y_max = None
        else:
            option_parser.error("The --y_max option must be either a number "
                                "or 'auto'.")

    # Parse the field states that will be compared to every other field
    # state. The missing-option check has to come before the split: it
    # used to come after, where it could never trigger.
    comparison_field_states = opts.comparison_groups
    if comparison_field_states is None:
        option_parser.error("You must provide at least one field state to "
                            "compare (using the -c option).")
    comparison_field_states = map(strip, comparison_field_states.split(','))
    comparison_field_states = [
        field_state.strip('"').strip("'")
        for field_state in comparison_field_states
    ]

    # Get distance comparisons between each field state and each of the
    # comparison field states.
    field = opts.field
    comparison_groupings = get_field_state_comparisons(
        dist_matrix_header, dist_matrix, mapping_header, mapping, field,
        comparison_field_states)

    # Grab a list of all field states that had the comparison field states
    # compared against them. These will be plotted along the x-axis.
    field_states = comparison_groupings.keys()

    def custom_comparator(x, y):
        try:
            num_x = float(x)
            num_y = float(y)
        except (ValueError, TypeError):
            # at least one side is not a number: compare lexically
            if x < y:
                return -1
            elif x > y:
                return 1
            else:
                return 0
        # sign of the difference; int(num_x - num_y) truncated differences
        # smaller than 1 to "equal"
        return (num_x > num_y) - (num_x < num_y)

    # Sort the field states as numbers if the elements are numbers, else
    # sort them lexically.
    field_states.sort(custom_comparator)

    # If the label type is numeric, get a list of all field states in
    # sorted numeric order. These will be used to determine the spacing of
    # the field state 'points' along the x-axis.
    x_spacing = None
    if opts.label_type == "numeric":
        try:
            x_spacing = map(float, field_states)
            x_spacing.sort()
        except (ValueError, TypeError):
            option_parser.error("The 'numeric' label type is invalid because "
                                "not all field states could be converted into "
                                "numbers. Please specify a different label "
                                "type.")

    # Accumulate the data for each field state 'point' along the x-axis.
    plot_data = []
    plot_x_axis_labels = []
    for field_state in field_states:
        field_state_data = []
        for comp_field_state in comparison_field_states:
            field_state_data.append(
                comparison_groupings[field_state][comp_field_state])
        plot_data.append(field_state_data)
        plot_x_axis_labels.append(field_state)

    # Plot the data and labels.
    plot_title = "Distance Comparisons"
    plot_x_label = field
    plot_y_label = "Distance"

    # If we are creating a bar chart or box plot, grab a list of good data
    # colors to use.
    plot_type = opts.plot_type
    plot_colors = None
    if plot_type == "bar" or plot_type == "box":
        plot_colors = [matplotlib_rgb_color(data_colors[color].toRGB())
                       for color in data_color_order]

    assert plot_data, "Error: there is no data to plot!"

    width = opts.width
    height = opts.height
    if width <= 0 or height <= 0:
        option_parser.error("The specified width and height of the image must "
                            "be greater than zero.")

    plot_figure = generate_comparative_plots(
        opts.plot_type, plot_data, x_values=x_spacing,
        data_point_labels=plot_x_axis_labels,
        distribution_labels=comparison_field_states,
        distribution_markers=plot_colors, x_label=plot_x_label,
        y_label=plot_y_label, title=plot_title,
        x_tick_labels_orientation=opts.x_tick_labels_orientation,
        y_min=y_min, y_max=y_max, whisker_length=opts.whisker_length,
        error_bar_type=opts.error_bar_type,
        distribution_width=opts.distribution_width,
        figure_width=width, figure_height=height)

    # Save the plot in the specified format.
    output_plot_fp = join(opts.output_dir, "%s_Distance_Comparisons.%s" %
                          (field, opts.imagetype))
    plot_figure.savefig(output_plot_fp, format=opts.imagetype,
                        transparent=opts.transparent)

    if not opts.suppress_significance_tests:
        # Rearrange the plot data into a format suitable for
        # all_pairs_t_test.
        sig_tests_labels = []
        sig_tests_data = []
        for data_point, data_point_label in zip(plot_data,
                                                plot_x_axis_labels):
            for dist, comp_field in zip(data_point, comparison_field_states):
                sig_tests_labels.append('%s vs %s' % (data_point_label,
                                                      comp_field))
                sig_tests_data.append(dist)

        sig_tests_results = all_pairs_t_test(
            sig_tests_labels, sig_tests_data, tail_type=opts.tail_type,
            num_permutations=opts.num_permutations)
        with open(join(opts.output_dir, "%s_Stats.txt" % field), 'w') as \
                sig_tests_f:
            sig_tests_f.write(sig_tests_results)

    if opts.save_raw_data:
        # Write the raw plot data into a tab-delimited file, where each
        # line has the distances between a comparison group and another
        # field state 'point' along the x-axis.
        assert (len(plot_x_axis_labels) == len(plot_data)), "The number of " +\
            "labels do not match the number of points along the x-axis."
        raw_data_fp = join(opts.output_dir,
                           "%s_Distance_Comparisons.txt" % field)
        with open(raw_data_fp, 'w') as raw_data_f:
            raw_data_f.write("#ComparisonGroup\tFieldState\tDistances\n")
            for label, data in zip(plot_x_axis_labels, plot_data):
                assert (len(comparison_field_states) == len(data)), "The " +\
                    "number of specified comparison groups does not match " +\
                    "the number of groups found at the current point along " +\
                    "the x-axis."
                for comp_field_state, comp_grp_data in zip(
                        comparison_field_states, data):
                    raw_data_f.write(comp_field_state + "\t" + label + "\t" +
                                     "\t".join(map(str, comp_grp_data)) +
                                     "\n")
def main():
    """Draw per-field distance boxplots and write optional companion files.

    For every field given with the -f option, up to four kinds of
    distributions are collected ("all within", "all between", individual
    within and individual between, each of which can be suppressed),
    optionally sorted by increasing median, and plotted. The figure is saved
    as ``<output_dir>/<field>_Distances.<imagetype>``. Unless suppressed,
    all-pairs t-test results are written to ``<field>_Stats.xls``; with
    ``save_raw_data`` the raw distances go to ``<field>_Distances.xls``.

    Every user-facing problem is reported through ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    def parse_y_bound(raw_value, option_name):
        # A y-axis bound is either a number or the literal 'auto', which is
        # passed on to the plotting code as None.
        try:
            return float(raw_value)
        except ValueError:
            if raw_value == 'auto':
                return None
            option_parser.error("The --%s option must be either a number "
                                "or 'auto'." % option_name)

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except Exception:
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    # Parse the distance matrix and mapping file.
    try:
        with open(opts.distance_matrix_fp, 'U') as dm_f:
            dist_matrix_header, dist_matrix = parse_distmat(dm_f)
    except Exception:
        option_parser.error(
            "This does not look like a valid distance matrix "
            "file. Please supply a valid distance matrix file using the -d "
            "option.")

    try:
        with open(opts.mapping_fp, 'U') as map_f:
            mapping, mapping_header, mapping_comments = \
                parse_mapping_file(map_f)
    except QiimeParseError:
        option_parser.error(
            "This does not look like a valid metadata mapping "
            "file. Please supply a valid mapping file using the -m option.")

    # The missing-option check has to happen before the value is split;
    # otherwise a missing -f surfaces as an AttributeError.
    if opts.fields is None:
        option_parser.error("You must provide at least one field using the -f "
                            "option.")
    fields = [raw_field.strip().strip('"').strip("'")
              for raw_field in opts.fields.split(',')]

    # Make sure each field is in the mapping file.
    for field in fields:
        if field not in mapping_header:
            option_parser.error(
                "The field '%s' is not in the provided "
                "mapping file. Please supply correct fields (using the -f "
                "option) corresponding to fields in the mapping file." % field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = parse_y_bound(opts.y_min, 'y_min')
    y_max = parse_y_bound(opts.y_max, 'y_max')

    # Generate the various boxplots, depending on what the user wanted
    # suppressed. Add them all to one encompassing plot.
    for field in fields:
        plot_data = []
        plot_labels = []

        if not opts.suppress_all_within:
            plot_data.append(
                get_all_grouped_distances(dist_matrix_header, dist_matrix,
                                          mapping_header, mapping, field,
                                          within=True))
            plot_labels.append("All within %s" % field)

        if not opts.suppress_all_between:
            plot_data.append(
                get_all_grouped_distances(dist_matrix_header, dist_matrix,
                                          mapping_header, mapping, field,
                                          within=False))
            plot_labels.append("All between %s" % field)

        if not opts.suppress_individual_within:
            within_dists = get_grouped_distances(dist_matrix_header,
                                                 dist_matrix, mapping_header,
                                                 mapping, field, within=True)
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))

        if not opts.suppress_individual_between:
            between_dists = get_grouped_distances(dist_matrix_header,
                                                  dist_matrix, mapping_header,
                                                  mapping, field, within=False)
            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))

        # We now have our data and labels ready, so plot them!
        assert (len(plot_data) == len(plot_labels)), "The number " +\
            "of boxplot labels does not match the number of " +\
            "boxplots."

        if not plot_data:
            option_parser.error("You have chosen to suppress all plots. At "
                                "least one type of plot must be unsuppressed.")
        else:
            if opts.sort:
                # Sort our plot data in order of increasing median. sorted()
                # is stable, so ties keep their original relative order.
                ranked = sorted(zip(plot_labels, plot_data),
                                key=lambda pair: median(pair[1]))
                plot_labels = [label for label, _ in ranked]
                plot_data = [distribution for _, distribution in ranked]

            width = opts.width
            height = opts.height
            if width is None:
                # Auto-size the figure from the number of boxes in this plot.
                width = len(plot_data) * opts.box_width + 2
            if width <= 0 or height <= 0:
                option_parser.error("The specified width and height of the "
                                    "image must be greater than zero.")

            plot_figure = generate_box_plots(
                plot_data, x_tick_labels=plot_labels,
                title="%s Distances" % field, x_label="Grouping",
                y_label="Distance", x_tick_labels_orientation='vertical',
                y_min=y_min, y_max=y_max,
                whisker_length=opts.whisker_length,
                box_width=opts.box_width, box_color=opts.box_color,
                figure_width=width, figure_height=height)

            output_plot_fp = join(opts.output_dir,
                                  "%s_Distances.%s" % (field, opts.imagetype))
            plot_figure.savefig(output_plot_fp, format=opts.imagetype,
                                transparent=opts.transparent)

        if not opts.suppress_significance_tests:
            sig_tests_fp = join(opts.output_dir, "%s_Stats.xls" % field)
            with open(sig_tests_fp, 'w') as sig_tests_f:
                sig_tests_f.write(all_pairs_t_test(
                    plot_labels, plot_data, tail_type=opts.tail_type,
                    num_permutations=opts.num_permutations))

        if opts.save_raw_data:
            # Write the raw plot data into a tab-delimited file.
            assert (len(plot_labels) == len(plot_data))
            raw_data_fp = join(opts.output_dir, "%s_Distances.xls" % field)
            with open(raw_data_fp, 'w') as raw_data_f:
                for label, data in zip(plot_labels, plot_data):
                    raw_data_f.write(label.replace(" ", "_") + "\t")
                    raw_data_f.write("\t".join(map(str, data)))
                    raw_data_f.write("\n")
def single_object_beta(self, otu_table, metric, tree_string,
                       missing_sams=None):
    """Check that row-wise single_object_beta output matches the full matrix.

    For every known non-phylogenetic and phylogenetic metric, the full
    distance matrix is computed once and then recomputed one sample (row) at
    a time; each row result must agree with the corresponding entries of the
    full matrix. Unless the metric name contains 'full_tree', the same
    row-wise check is repeated with ``full_tree=True``, and the complete
    full-tree matrix must equal the regular one.

    otu_table, tree_string: passed through to ``single_object_beta``.
    metric: not read; every known metric is exercised regardless. Kept so
        existing callers keep working.
    missing_sams: sample IDs to skip in the row-wise checks (default: none).
    """
    if missing_sams is None:
        missing_sams = []

    metrics = list_known_nonphylogenetic_metrics()
    metrics.extend(list_known_phylogenetic_metrics())

    # new metrics that don't trivially parallelize must be dealt with
    # carefully. The patterns are written as adjacent literals so that source
    # indentation can never end up inside a pattern.
    ignored_warning_patterns = (
        'dissimilarity binary_dist_chisq is '
        'not parallelized, calculating the whole matrix...',
        'dissimilarity dist_chisq is not '
        'parallelized, calculating the whole matrix...',
        'dissimilarity dist_gower is not '
        'parallelized, calculating the whole matrix...',
        'dissimilarity dist_hellinger is '
        'not parallelized, calculating the whole matrix...',
        'unifrac had no information for '
        'sample M*',
    )
    for pattern in ignored_warning_patterns:
        warnings.filterwarnings('ignore', pattern)

    def check_rows(row_output, rows, sams, dmtx):
        # Compare a rows-only result against the full matrix entry by entry.
        col_sams, row_sams, row_dmtx = parse_matrix(row_output)
        num_rows = len(rows.split(','))
        self.assertEqual(row_dmtx.shape, (num_rows, len(sams)))
        for j in range(num_rows):
            for k in range(len(sams)):
                full_value = dmtx[sams.index(row_sams[j]),
                                  sams.index(col_sams[k])]
                npt.assert_almost_equal(row_dmtx[j, k], full_value)

    for current_metric in metrics:
        # do it
        beta_out = single_object_beta(otu_table, current_metric,
                                      tree_string, rowids=None,
                                      full_tree=False)
        sams, dmtx = parse_distmat(beta_out)

        # do it by rows
        for rows in sams:
            if rows in missing_sams:
                continue
            r_out = single_object_beta(otu_table, current_metric,
                                       tree_string, rowids=rows,
                                       full_tree=False)
            check_rows(r_out, rows, sams, dmtx)

        # full tree run:
        if 'full_tree' in str(current_metric).lower():
            continue

        # do it by rows with full tree. The row ID has to be passed here;
        # with rowids=None the rows-only code path is not exercised and the
        # (1, n) shape check below cannot describe the result.
        for rows in sams:
            if rows in missing_sams:
                continue
            r_out = single_object_beta(otu_table, current_metric,
                                       tree_string, rowids=rows,
                                       full_tree=True)
            check_rows(r_out, rows, sams, dmtx)

        # do it with full tree
        r_out = single_object_beta(otu_table, current_metric,
                                   tree_string, rowids=None,
                                   full_tree=True)
        sams_ft, dmtx_ft = parse_distmat(r_out)
        self.assertEqual(sams_ft, sams)
        npt.assert_almost_equal(dmtx_ft, dmtx)
def single_file_beta(self, otu_table_string, tree_string, missing_sams=None,
                     use_metric_list=False):
    """Check that single_file_beta gives the same result when run by rows.

    The OTU table and tree are written to temporary files. For every known
    metric the full distance matrix is produced, then recomputed one sample
    (row) at a time and compared entry by entry with the full matrix. Unless
    the metric name contains 'full_tree', the row-wise check is repeated with
    ``full_tree=True`` (output under ``<output_dir>/ft/``), and the complete
    full-tree matrix must equal the regular one.

    otu_table_string, tree_string: file contents to write out.
    missing_sams: sample IDs to skip in the row-wise checks (default: none).
    use_metric_list: when true, the metric is passed as a one-element list
        instead of a bare name.

    Temporary files and the output directory are appended to
    ``self.files_to_remove`` / ``self.folders_to_remove``.
    """
    if missing_sams is None:
        missing_sams = []

    def write_temp_file(suffix, contents):
        # Register the path before writing, so it is queued for cleanup even
        # if a later step of the test fails.
        fd, path = mkstemp(suffix=suffix)
        os.close(fd)
        self.files_to_remove.append(path)
        with open(path, 'w') as handle:
            handle.write(contents)
        return path

    def check_rows(row_fp, rows, sams, dmtx):
        # Compare a rows-only output file against the full matrix.
        with open(row_fp) as row_f:
            col_sams, row_sams, row_dmtx = parse_matrix(row_f)
        num_rows = len(rows.split(','))
        self.assertEqual(row_dmtx.shape, (num_rows, len(sams)))
        for j in range(num_rows):
            for k in range(len(sams)):
                full_value = dmtx[sams.index(row_sams[j]),
                                  sams.index(col_sams[k])]
                npt.assert_almost_equal(row_dmtx[j, k], full_value)

    # setup
    input_path = write_temp_file('.txt', otu_table_string)
    in_fname = os.path.split(input_path)[1]
    tree_path = write_temp_file('.tre', tree_string)

    metrics = list_known_nonphylogenetic_metrics()
    metrics.extend(list_known_phylogenetic_metrics())

    output_dir = mkdtemp()
    self.folders_to_remove.append(output_dir)
    full_tree_dir = output_dir + '/ft/'
    os.mkdir(full_tree_dir)

    # new metrics that don't trivially parallelize must be dealt with
    # carefully. The patterns are written as adjacent literals so that source
    # indentation can never end up inside a pattern.
    ignored_warning_patterns = (
        'dissimilarity binary_dist_chisq is '
        'not parallelized, calculating the whole matrix...',
        'dissimilarity dist_chisq is not '
        'parallelized, calculating the whole matrix...',
        'dissimilarity dist_gower is not '
        'parallelized, calculating the whole matrix...',
        'dissimilarity dist_hellinger is '
        'not parallelized, calculating the whole matrix...',
        'unifrac had no information for '
        'sample M*',
    )
    for pattern in ignored_warning_patterns:
        warnings.filterwarnings('ignore', pattern)

    for metric in metrics:
        metric_arg = [metric] if use_metric_list else metric
        # Row-wise runs write to the same file name as the full run, so each
        # result is read back before the next call overwrites it.
        out_fp = output_dir + '/' + metric + '_' + in_fname
        full_tree_out_fp = full_tree_dir + metric + '_' + in_fname

        # do it
        single_file_beta(input_path, metric_arg, tree_path, output_dir,
                         rowids=None)
        with open(out_fp) as full_f:
            sams, dmtx = parse_distmat(full_f)

        # do it by rows
        for rows in sams:
            if rows in missing_sams:
                continue
            single_file_beta(input_path, metric_arg, tree_path, output_dir,
                             rowids=rows)
            check_rows(out_fp, rows, sams, dmtx)

        # full tree run:
        if 'full_tree' in str(metric).lower():
            continue

        # do it by rows with full tree
        for rows in sams:
            if rows in missing_sams:
                continue
            single_file_beta(input_path, metric_arg, tree_path,
                             full_tree_dir, rowids=rows, full_tree=True)
            check_rows(full_tree_out_fp, rows, sams, dmtx)

        # do it with full tree
        single_file_beta(input_path, metric_arg, tree_path, full_tree_dir,
                         rowids=None, full_tree=True)
        with open(full_tree_out_fp) as full_tree_f:
            sams_ft, dmtx_ft = parse_distmat(full_tree_f)
        self.assertEqual(sams_ft, sams)
        npt.assert_almost_equal(dmtx_ft, dmtx)
def setUp(self):
    """Create some data to be used in the tests.

    Three fixture sets are built, each consisting of distance-matrix lines,
    mapping-file lines, a field name, the parsed mapping (data, header,
    comments), the groups returned by ``group_by_field`` and the parsed
    distance matrix (header, data):

    * the 9-sample data set from the overview tutorial (no prefix),
    * a single-sample data set (``tiny_`` prefix),
    * a two-sample data set (``small_`` prefix).
    """
    # Create the mapping file/distance matrix combo from the overview
    # tutorial. Long rows are split with adjacent string literals rather
    # than a backslash inside the literal, so the row text cannot pick up
    # whitespace from the source indentation.
    self.dist_matrix_string = [
        "\tPC.354\tPC.355\tPC.356\tPC.481\tPC.593"
        "\tPC.607\tPC.634\tPC.635\tPC.636",
        "PC.354\t0.0\t0.625\t0.623\t0.61\t0.577"
        "\t0.729\t0.8\t0.721\t0.765",
        "PC.355\t0.625\t0.0\t0.615\t0.642\t0.673"
        "\t0.776\t0.744\t0.749\t0.677",
        "PC.356\t0.623\t0.615\t0.0\t0.682\t0.737"
        "\t0.734\t0.777\t0.733\t0.724",
        "PC.481\t0.61\t0.642\t0.682\t0.0\t0.704"
        "\t0.696\t0.675\t0.654\t0.696",
        "PC.593\t0.577\t0.673\t0.737\t0.704\t0.0"
        "\t0.731\t0.758\t0.738\t0.737",
        "PC.607\t0.729\t0.776\t0.734\t0.696\t0.731"
        "\t0.0\t0.718\t0.666\t0.727",
        "PC.634\t0.8\t0.744\t0.777\t0.675\t0.758"
        "\t0.718\t0.0\t0.6\t0.578",
        "PC.635\t0.721\t0.749\t0.733\t0.654\t0.738"
        "\t0.666\t0.6\t0.0\t0.623",
        "PC.636\t0.765\t0.677\t0.724\t0.696\t0.737"
        "\t0.727\t0.578\t0.623\t0.0",
    ]

    self.mapping_string = [
        "#SampleID\tBarcodeSequence\tTreatment\tDOB",
        "PC.354\tAGCACGAGCCTA\tControl\t20061218",
        "PC.355\tAACTCGTCGATG\tControl\t20061218",
        "PC.356\tACAGACCACTCA\tControl\t20061126",
        "PC.481\tACCAGCGACTAG\tControl\t20070314",
        "PC.593\tAGCAGCACTTGT\tControl\t20071210",
        "PC.607\tAACTGTGCGTAC\tFast\t20071112",
        "PC.634\tACAGAGTCGGCT\tFast\t20080116",
        "PC.635\tACCGCAGAGTCA\tFast\t20080116",
        "PC.636\tACGGTGAGTGTC\tFast\t20080116",
    ]

    # Field to test on. Field values are either "Control" or "Fast".
    self.field = 'Treatment'

    # Create a tiny distance matrix/mapping file with a single sample for
    # additional testing.
    self.tiny_dist_matrix_string = ["\tSamp.1", "Samp.1\t0"]
    self.tiny_mapping_string = [
        "#SampleID\tBarcodeSequence\tSampleField",
        "Samp.1\tAGCACGAGCCTA\tSampleFieldState1",
    ]
    self.tiny_field = 'SampleField'

    # The same again with two samples, each in its own field state.
    self.small_dist_matrix_string = [
        "\tSamp.1\tSamp.2",
        "Samp.1\t0\t0.5",
        "Samp.2\t0.5\t0",
    ]
    self.small_mapping_string = [
        "#SampleID\tBarcodeSequence\tSampleField",
        "Samp.1\tAGCACGAGCCTA\tSampleFieldState1",
        "Samp.2\tAGCACGAGCCTG\tSampleFieldState2",
    ]
    self.small_field = 'SampleField'

    def parse_and_group(mapping_lines, field):
        # Parse mapping lines and group the samples by the given field;
        # group_by_field is handed the header row followed by the data rows.
        mapping, header, comments = parse_mapping_file(mapping_lines)
        rows_with_header = [header]
        rows_with_header.extend(mapping)
        return mapping, header, comments, group_by_field(rows_with_header,
                                                         field)

    # Parse mapping "files" (faked here).
    (self.mapping, self.mapping_header, self.comments,
     self.groups) = parse_and_group(self.mapping_string, self.field)
    (self.tiny_mapping, self.tiny_mapping_header, self.tiny_comments,
     self.tiny_groups) = parse_and_group(self.tiny_mapping_string,
                                         self.tiny_field)
    (self.small_mapping, self.small_mapping_header, self.small_comments,
     self.small_groups) = parse_and_group(self.small_mapping_string,
                                          self.small_field)

    # Parse distance matrix "files" (faked here).
    self.dist_matrix_header, self.dist_matrix = parse_distmat(
        self.dist_matrix_string)
    self.tiny_dist_matrix_header, self.tiny_dist_matrix = parse_distmat(
        self.tiny_dist_matrix_string)
    self.small_dist_matrix_header, self.small_dist_matrix = parse_distmat(
        self.small_dist_matrix_string)
if opts.otu_table_fp: otu_table = parse_biom_table(open(opts.otu_table_fp, 'U')) samples_to_keep = otu_table.SampleIds #samples_to_keep = \ # sample_ids_from_otu_table(open(opts.otu_table_fp,'U')) elif opts.sample_id_fp: samples_to_keep = \ get_seqs_to_keep_lookup_from_seq_id_file(open(opts.sample_id_fp,'U')) elif opts.mapping_fp and opts.valid_states: try: samples_to_keep = sample_ids_from_metadata_description( open(opts.mapping_fp, 'U'), opts.valid_states) except ValueError, e: option_parser.error(e.message) else: option_parser.error( 'must pass either --sample_id_fp, -t, or -m and -s') # note that negate gets a little weird here. The function we're calling removes the specified # samples from the distance matrix, but the other QIIME filter scripts keep these samples specified. # So, the interface of this script is designed to keep the specified samples, and therefore # negate=True is passed to filter_samples_from_distance_matrix by default. d = filter_samples_from_distance_matrix(parse_distmat( open(opts.input_distance_matrix, 'U')), samples_to_keep, negate=not opts.negate) output_f.write(d) output_f.close() if __name__ == "__main__": main()
def setUp(self):
    """Prepare files and expected values for DistanceHistogramsTests.

    Creates the working and histogram directories (an existing directory is
    tolerated), writes the distance matrix, mapping, prefs and distances
    fixtures into the working directory, parses the first two back in, and
    builds the expected single-field and paired-field 'Treatment' results.
    """
    def dump(path, text):
        # Write a fixture file in one go.
        with open(path, 'w') as out:
            out.write(text)

    self.working_dir = '/tmp/distance_histogram_tests/'
    self.histogram_dir = self.working_dir + 'histograms/'
    for directory in (self.working_dir, self.histogram_dir):
        try:
            mkdir(directory)
        except OSError:
            # The directory already exists; reuse it.
            pass

    # Distance matrix file, written and then parsed back.
    self.dmat_file = self.working_dir + 'dmat.txt'
    dump(self.dmat_file, DISTANCE_MATRIX_STRING)
    self.distance_header, self.dmat = \
        parse_distmat(open(self.dmat_file, 'U'))

    # Mapping file; self.mapping holds the header row (with '#' restored on
    # its first entry) followed by the data rows.
    self.map_file = self.working_dir + 'map.txt'
    dump(self.map_file, MAPPING_STRING)
    map_rows, map_header, map_comments = parse_mapping_file(
        open(self.map_file, 'U'))
    map_header[0] = '#' + map_header[0]
    self.mapping = [map_header]
    self.mapping.extend(map_rows)

    # Prefs file.
    self.prefs_file = self.working_dir + 'prefs.txt'
    dump(self.prefs_file, str(PREFS))

    # Single field dict for the 'Treatment' field.
    self.single_field_treatment = defaultdict(dict)
    self.treatment_groups = group_by_field(self.mapping, 'Treatment')
    self.single_field_treatment['Treatment'] = distances_by_groups(
        self.distance_header, self.dmat, self.treatment_groups)

    # Expected paired-field result for Treatment vs. Treatment.
    control_vs_fast = array([[0.729, 0.8, 0.721, 0.765],
                             [0.776, 0.744, 0.749, 0.677],
                             [0.734, 0.777, 0.733, 0.724],
                             [0.696, 0.675, 0.654, 0.696],
                             [0.731, 0.758, 0.738, 0.737]])
    control_vs_control = array([0.625, 0.623, 0.61, 0.577, 0.615,
                                0.642, 0.673, 0.682, 0.737, 0.704])
    fast_vs_fast = array([0.718, 0.666, 0.727, 0.6, 0.578, 0.623])
    self.paired_field_treatment = {
        'Treatment_to_Treatment': [
            [('Control', 'Control'), ('Fast', 'Fast'), control_vs_fast],
            [('Control', 'Control'), ('Control', 'Control'),
             control_vs_control],
            [('Fast', 'Fast'), ('Fast', 'Fast'), fast_vs_fast],
        ]
    }

    # Distances output fixture.
    self.distances_file = self.working_dir + 'distances_out.txt'
    dump(self.distances_file, DISTANCES_OUT)
def make_distance_boxplots(dm_f, map_f, fields, width=None, height=6.0,
                           suppress_all_within=False,
                           suppress_all_between=False,
                           suppress_individual_within=False,
                           suppress_individual_between=False, y_min=0.0,
                           y_max=1.0, whisker_length=1.5, box_width=0.5,
                           box_color=None,
                           color_individual_within_by_field=None, sort=None):
    """Generates various types of boxplots for distance comparisons.

    Returns a list of tuples, one for each field. Each tuple contains the
    following:
        1) the name of the field (string)
        2) a matplotlib.figure.Figure object containing the boxplots
        3) a list of lists containing the raw plot data that was passed to mpl
        4) a list of labels for each of the boxplots (string)
        5) a list of mpl-compatible colors (one for each boxplot)

    The Figure can be saved, and the raw data and labels can be useful (for
    example) performing statistical tests or writing the raw data to disk.

    The input arguments are exactly derived from the make_distance_boxplots.py
    script (see the script options for details). To avoid duplicated effort,
    their descriptions are not reproduced here.

    When width is None, the figure width is derived separately for every
    field from the number of boxplots in that field's figure.

    Raises ValueError if no field is given, if a field is missing from the
    mapping file, if the figure width or height is not positive, or if every
    type of plot was suppressed.
    """
    # Parse data files and do some preliminary error checking.
    dm_header, dm_data = parse_distmat(dm_f)
    map_data, map_header, map_comments = parse_mapping_file(map_f)

    if fields is None or len(fields) < 1:
        raise ValueError("You must provide at least one field to analyze.")

    for field in fields:
        if field not in map_header:
            raise ValueError("The field '%s' is not in the provided mapping "
                             "file. Please supply correct fields "
                             "corresponding to fields in the mapping file." %
                             field)

    # Make sure the y_min and y_max options make sense, as they can be either
    # 'auto' or a number.
    y_min = _cast_y_axis_extrema(y_min)
    y_max = _cast_y_axis_extrema(y_max)

    # When the individual-within boxplots are colored by a field, every other
    # boxplot gets no explicit color (None); otherwise all use box_color.
    color_by_field = color_individual_within_by_field is not None
    default_color = None if color_by_field else box_color

    # Collate the distributions of distances that will comprise each boxplot.
    # Suppress the generation of the indicated types of boxplots.
    results = []
    for field in fields:
        plot_data = []
        plot_labels = []
        plot_colors = []
        legend = None

        if not suppress_all_within:
            plot_data.append(
                get_all_grouped_distances(dm_header, dm_data, map_header,
                                          map_data, field, within=True))
            plot_labels.append("All within %s" % field)
            plot_colors.append(default_color)

        if not suppress_all_between:
            plot_data.append(
                get_all_grouped_distances(dm_header, dm_data, map_header,
                                          map_data, field, within=False))
            plot_labels.append("All between %s" % field)
            plot_colors.append(default_color)

        if not suppress_individual_within:
            within_dists = get_grouped_distances(dm_header, dm_data,
                                                 map_header, map_data, field,
                                                 within=True)
            field_states = []
            for grouping in within_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
                field_states.append(grouping[0])

            # If we need to color these boxplots by a field, build up a
            # list of colors and a legend.
            if color_by_field:
                colors, color_mapping = _color_field_states(
                    format_mapping_file(map_header, map_data).split('\n'),
                    dm_header, field, field_states,
                    color_individual_within_by_field)
                plot_colors.extend(colors)
                legend = (color_mapping.values(), color_mapping.keys())
            else:
                plot_colors.extend([box_color] * len(field_states))

        if not suppress_individual_between:
            between_dists = get_grouped_distances(dm_header, dm_data,
                                                  map_header, map_data, field,
                                                  within=False)
            for grouping in between_dists:
                plot_data.append(grouping[2])
                plot_labels.append("%s vs. %s" % (grouping[0], grouping[1]))
                plot_colors.append(default_color)

        assert (len(plot_data) == len(plot_labels) and
                len(plot_labels) == len(plot_colors)), "The number " +\
            "of boxplot labels and colors do not match the number of " +\
            "boxplots."

        # We now have our data and labels ready, so plot them!
        if not plot_data:
            raise ValueError("The generation of all plots was suppressed. At "
                             "least one type of plot must be unsuppressed.")

        if sort is not None:
            plot_data, plot_labels, plot_colors = _sort_distributions(
                plot_data, plot_labels, plot_colors, sort)

        # Size the figure in a per-field local. Assigning the automatic
        # width back to the `width` argument would make every later field
        # reuse the width computed for the first one.
        if width is None:
            figure_width = len(plot_data) * box_width + 2
        else:
            figure_width = width
        if figure_width <= 0 or height <= 0:
            raise ValueError("The specified width and height of the plot "
                             "must be greater than zero.")

        plot_figure = boxplots(plot_data, x_tick_labels=plot_labels,
                               title="%s Distances" % field,
                               x_label="Grouping", y_label="Distance",
                               x_tick_labels_orientation='vertical',
                               y_min=y_min, y_max=y_max,
                               whisker_length=whisker_length,
                               box_width=box_width, box_colors=plot_colors,
                               figure_width=figure_width,
                               figure_height=height, legend=legend)

        results.append(
            (field, plot_figure, plot_data, plot_labels, plot_colors))

    return results