def write_output_file(metric_file_data, output_dir, metric, all_samples):
    """Write the collated rows for one metric to <output_dir>/<metric>.txt.

    inputs:
     metric_file_data -- list of rows (lists); the first element of each row
        is used as the row label, the remaining elements are the values
        written under the header
        ['sequences per sample', 'iteration'] + all_samples
     output_dir -- directory the output file is created in
     metric -- metric name, used as the output file's base name
     all_samples -- list of sample names appended to the two fixed headers

    Rows are written sorted by their second and third elements, i.e. the two
    values that follow the row label.

    The rows are deliberately kept as a plain list of lists: numpy showed
    weird behaviour (truncating some values) when metric_file_data was
    converted to an array, and format_matrix() takes 2d lists as well as
    arrays.

    The caller's rows are left untouched (labels are sliced off rather than
    popped), so the same data can safely be passed in again.
    """
    ordered_rows = sorted(metric_file_data, key=operator.itemgetter(1, 2))
    row_names = [row[0] for row in ordered_rows]
    values = [row[1:] for row in ordered_rows]
    col_names = ['sequences per sample', 'iteration'] + all_samples

    out_str = format_matrix(values, row_names, col_names)

    # the context manager closes the file even if the write fails
    with open(os.path.join(output_dir, metric + '.txt'), 'w') as out_f:
        out_f.write(out_str)
def test_format_matrix(self):
    """format_matrix should return tab-delimited mat"""
    a = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    row_labels = ["a", "b", "c"]
    col_labels = [11, 22, 33]
    expected = "\t11\t22\t33\na\t1\t2\t3\nb\t4\t5\t6\nc\t7\t8\t9"

    # test as list
    res = format_matrix(a, row_labels, col_labels)
    self.assertEqual(res, expected)
    # too few row labels for the data
    self.assertRaises(ValueError, format_matrix, a, row_labels[:2],
                      col_labels)
    # data of an unsupported type
    self.assertRaises(ValueError, format_matrix, None, row_labels,
                      col_labels)

    # test as array
    a = array(a)
    # recompute the result so the array input is what actually gets checked
    res = format_matrix(a, row_labels, col_labels)
    self.assertEqual(res, expected)
    self.assertRaises(ValueError, format_matrix, a, row_labels[:2],
                      col_labels)
    self.assertRaises(ValueError, format_matrix, None, row_labels,
                      col_labels)
def test_format_matrix(self):
    """format_matrix should return tab-delimited mat"""
    a = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    row_labels = ['a', 'b', 'c']
    col_labels = [11, 22, 33]
    expected = '\t11\t22\t33\na\t1\t2\t3\nb\t4\t5\t6\nc\t7\t8\t9'

    # test as list
    res = format_matrix(a, row_labels, col_labels)
    self.assertEqual(res, expected)
    # too few row labels for the data
    self.assertRaises(ValueError, format_matrix, a, row_labels[:2],
                      col_labels)
    # data of an unsupported type
    self.assertRaises(ValueError, format_matrix, None, row_labels,
                      col_labels)

    # test as array
    a = array(a)
    # recompute the result so the array input is what actually gets checked
    res = format_matrix(a, row_labels, col_labels)
    self.assertEqual(res, expected)
    self.assertRaises(ValueError, format_matrix, a, row_labels[:2],
                      col_labels)
    self.assertRaises(ValueError, format_matrix, None, row_labels,
                      col_labels)
def single_object_beta(otu_table, metrics, tr, rowids=None, full_tree=False):
    """mod of single_file_beta to receive an otu table obj and a tree obj

    Returns the formatted result instead of writing it to a file.

    assumes input tree is already trimmed to contain only otus present in
    otu_table, doesn't call getSubTree()

    inputs:
     otu_table -- an otu table in the biom format
     metrics -- metrics (str, comma delimited if more than 1 metric)
     tr -- a phylonode cogent tree object if needed by the chosen beta
        diversity metric
     rowids -- comma separated string of sample ids; when given, only the
        dissimilarities d(rowid, *) are computed for each listed id
     full_tree -- passed on to the metric functions, inverted, as
        make_subtree

    returns:
     if rowids is None: the output of format_distance_matrix() split into a
        list of lines
     otherwise: the string returned by format_matrix(), one row per rowid

    NOTE: the function returns from inside the loop over metrics, so only
    the first metric in `metrics` is ever computed.

    Writes a message to stderr and calls exit(1) if a metric is unknown, or
    if a phylogenetic metric is requested without a tree.
    """
    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    tree = tr if tr else None
    sample_ids = otu_table.SampleIds

    # dissimilarities that are not a function of only the pair of samples;
    # for these the whole matrix has to be calculated
    whole_matrix_metrics = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                            'binary_dist_chisq')

    for metric in metrics.split(','):
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                if tree is None:
                    stderr.write(
                        "metric %s requires a tree, but none found\n" %
                        (metric, ))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n" %
                    (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx, otu_table.ObservationIds, tree,
                                   sample_ids,
                                   make_subtree=(not full_tree))
            else:
                dissims = metric_f(otumtx)
            return format_distance_matrix(sample_ids, dissims).split('\n')

        # only calc d(rowid1, *) for each rowid
        rowids_list = rowids.split(',')
        row_dissims = []  # same order as rowids_list
        full_dissims = None  # whole matrix, computed at most once
        for rowid in rowids_list:
            rowidx = sample_ids.index(rowid)

            if metric_f.__name__ in whole_matrix_metrics:
                warnings.warn(
                    'dissimilarity ' + metric_f.__name__ +
                    ' is not parallelized, calculating the whole matrix...')
                # the matrix does not depend on rowid, so reuse it rather
                # than recomputing it for every requested row
                if full_dissims is None:
                    full_dissims = metric_f(otumtx)
                row_dissims.append(full_dissims[rowidx])
                continue

            try:
                row_metric = get_phylogenetic_row_metric(metric)
            except AttributeError:
                # no row-wise implementation: do element by element
                dissims = []
                for i, other_id in enumerate(sample_ids):
                    pair_mtx = otumtx[[rowidx, i], :]
                    if is_phylogenetic:
                        pair_dissims = metric_f(
                            pair_mtx, otu_table.ObservationIds, tree,
                            [sample_ids[rowidx], other_id],
                            make_subtree=(not full_tree))
                    else:
                        pair_dissims = metric_f(pair_mtx)
                    dissims.append(pair_dissims[0, 1])
                row_dissims.append(dissims)
            else:
                # do whole row at once
                row_dissims.append(
                    row_metric(otumtx, otu_table.ObservationIds, tree,
                               sample_ids, rowid,
                               make_subtree=(not full_tree)))

        return format_matrix(row_dissims, rowids_list, sample_ids)
def single_file_beta(input_path, metrics, tree_path, output_dir,
                     rowids=None, full_tree=False):
    """ does beta diversity calc on a single otu table

    uses name in metrics to name output beta diversity files: one file per
    metric, written to <output_dir>/<metric>_<input basename>.txt

    assumes input tree is already trimmed to contain only otus present in
    otu table, doesn't call getSubTree()

    inputs:
     input_path (str) -- path of the biom otu table
     metrics (str, comma delimited if more than 1 metric; or list)
     tree_path (str) -- path of a newick tree; may be empty/None when no
        phylogenetic metric is requested
     output_dir (str)
     rowids (comma separated str) -- when given, only the dissimilarities
        d(rowid, *) are computed and written, one row per listed sample id
     full_tree -- passed on to the metric functions, inverted, as
        make_subtree

    Writes a message to stderr and calls exit(1) if a metric is unknown, or
    if a phylogenetic metric is requested without a tree.
    """
    # metrics may already be a list; only strings have .split
    metrics_list = metrics
    try:
        metrics_list = metrics_list.split(',')
    except AttributeError:
        pass

    # context managers make sure the input handles are closed again
    with open(input_path, 'U') as otu_f:
        otu_table = parse_biom_table(otu_f)

    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    if tree_path:
        with open(tree_path, 'U') as tree_f:
            tree = parse_newick(tree_f, PhyloNode)
    else:
        tree = None

    input_basename = os.path.splitext(os.path.split(input_path)[1])[0]
    sample_ids = otu_table.SampleIds

    # dissimilarities that are not a function of only the pair of samples;
    # for these the whole matrix has to be calculated
    whole_matrix_metrics = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                            'binary_dist_chisq')

    for metric in metrics_list:
        outfilepath = os.path.join(output_dir,
                                   metric + '_' + input_basename + '.txt')
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                if tree is None:
                    stderr.write(
                        "metric %s requires a tree, but none found\n" %
                        (metric, ))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n" %
                    (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx, otu_table.ObservationIds, tree,
                                   sample_ids,
                                   make_subtree=(not full_tree))
            else:
                dissims = metric_f(otumtx)
            with open(outfilepath, 'w') as out_f:
                out_f.write(format_distance_matrix(sample_ids, dissims))
            continue

        # only calc d(rowid1, *) for each rowid
        rowids_list = rowids.split(',')
        row_dissims = []  # same order as rowids_list
        full_dissims = None  # whole matrix, computed at most once per metric
        for rowid in rowids_list:
            rowidx = sample_ids.index(rowid)

            if metric_f.__name__ in whole_matrix_metrics:
                warnings.warn(
                    'dissimilarity ' + metric_f.__name__ +
                    ' is not parallelized, calculating the whole matrix...')
                # the matrix does not depend on rowid, so reuse it rather
                # than recomputing it for every requested row
                if full_dissims is None:
                    full_dissims = metric_f(otumtx)
                row_dissims.append(full_dissims[rowidx])
                continue

            try:
                row_metric = get_phylogenetic_row_metric(metric)
            except AttributeError:
                # no row-wise implementation: do element by element
                dissims = []
                for i, other_id in enumerate(sample_ids):
                    pair_mtx = otumtx[[rowidx, i], :]
                    if is_phylogenetic:
                        pair_dissims = metric_f(
                            pair_mtx, otu_table.ObservationIds, tree,
                            [sample_ids[rowidx], other_id],
                            make_subtree=(not full_tree))
                    else:
                        pair_dissims = metric_f(pair_mtx)
                    dissims.append(pair_dissims[0, 1])
                row_dissims.append(dissims)
            else:
                # do whole row at once
                row_dissims.append(
                    row_metric(otumtx, otu_table.ObservationIds, tree,
                               sample_ids, rowid,
                               make_subtree=(not full_tree)))

        with open(outfilepath, 'w') as out_f:
            out_f.write(format_matrix(row_dissims, rowids_list, sample_ids))
def single_object_beta(otu_table, metrics, tr, rowids=None, full_tree=False):
    """mod of single_file_beta to receive an otu table obj and a tree obj

    Returns the formatted result instead of writing it to a file.

    assumes input tree is already trimmed to contain only otus present in
    otu_table, doesn't call getSubTree()

    inputs:
     otu_table -- an otu table in the biom format
     metrics -- metrics (str, comma delimited if more than 1 metric)
     tr -- a phylonode cogent tree object if needed by the chosen beta
        diversity metric
     rowids -- comma separated string of sample ids; when given, only the
        dissimilarities d(rowid, *) are computed for each listed id
     full_tree -- passed on to the metric functions, inverted, as
        make_subtree

    returns:
     if rowids is None: the output of format_distance_matrix() split into a
        list of lines
     otherwise: the string returned by format_matrix(), one row per rowid

    NOTE: the function returns from inside the loop over metrics, so only
    the first metric in `metrics` is ever computed.

    Writes a message to stderr and calls exit(1) if a metric is unknown, or
    if a phylogenetic metric is requested without a tree.
    """
    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    tree = tr if tr else None
    sample_ids = otu_table.SampleIds

    # dissimilarities that are not a function of only the pair of samples;
    # for these the whole matrix has to be calculated
    whole_matrix_metrics = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                            'binary_dist_chisq')

    for metric in metrics.split(','):
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                # identity test: a tree object may define its own equality
                if tree is None:
                    stderr.write(
                        "metric %s requires a tree, but none found\n"
                        % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n"
                    % (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx, otu_table.ObservationIds, tree,
                                   sample_ids,
                                   make_subtree=(not full_tree))
            else:
                dissims = metric_f(otumtx)
            return format_distance_matrix(sample_ids, dissims).split('\n')

        # only calc d(rowid1, *) for each rowid
        rowids_list = rowids.split(',')
        row_dissims = []  # same order as rowids_list
        full_dissims = None  # whole matrix, computed at most once
        for rowid in rowids_list:
            rowidx = sample_ids.index(rowid)

            if metric_f.__name__ in whole_matrix_metrics:
                warnings.warn(
                    'dissimilarity ' + metric_f.__name__ +
                    ' is not parallelized, calculating the whole matrix...')
                # the matrix does not depend on rowid, so reuse it rather
                # than recomputing it for every requested row
                if full_dissims is None:
                    full_dissims = metric_f(otumtx)
                row_dissims.append(full_dissims[rowidx])
                continue

            try:
                row_metric = get_phylogenetic_row_metric(metric)
            except AttributeError:
                # no row-wise implementation: do element by element
                dissims = []
                for i, other_id in enumerate(sample_ids):
                    pair_mtx = otumtx[[rowidx, i], :]
                    if is_phylogenetic:
                        pair_dissims = metric_f(
                            pair_mtx, otu_table.ObservationIds, tree,
                            [sample_ids[rowidx], other_id],
                            make_subtree=(not full_tree))
                    else:
                        pair_dissims = metric_f(pair_mtx)
                    dissims.append(pair_dissims[0, 1])
                row_dissims.append(dissims)
            else:
                # do whole row at once
                row_dissims.append(
                    row_metric(otumtx, otu_table.ObservationIds, tree,
                               sample_ids, rowid,
                               make_subtree=(not full_tree)))

        return format_matrix(row_dissims, rowids_list, sample_ids)
def single_file_beta(input_path, metrics, tree_path, output_dir,
                     rowids=None, full_tree=False):
    """ does beta diversity calc on a single otu table

    uses name in metrics to name output beta diversity files: one file per
    metric, written to <output_dir>/<metric>_<input basename>.txt

    assumes input tree is already trimmed to contain only otus present in
    otu table, doesn't call getSubTree()

    inputs:
     input_path (str) -- path of the biom otu table
     metrics (str, comma delimited if more than 1 metric; or list)
     tree_path (str) -- path of a newick tree; may be empty/None when no
        phylogenetic metric is requested
     output_dir (str)
     rowids (comma separated str) -- when given, only the dissimilarities
        d(rowid, *) are computed and written, one row per listed sample id
     full_tree -- passed on to the metric functions, inverted, as
        make_subtree

    Writes a message to stderr and calls exit(1) if a metric is unknown, or
    if a phylogenetic metric is requested without a tree.
    """
    # metrics may already be a list; only strings have .split
    metrics_list = metrics
    try:
        metrics_list = metrics_list.split(',')
    except AttributeError:
        pass

    # context managers make sure the input handles are closed again
    with open(input_path, 'U') as otu_f:
        otu_table = parse_biom_table(otu_f)

    if isinstance(otu_table, DenseTable):
        otumtx = otu_table._data.T
    else:
        otumtx = asarray([v for v in otu_table.iterSampleData()])

    if tree_path:
        with open(tree_path, 'U') as tree_f:
            tree = parse_newick(tree_f, PhyloNode)
    else:
        tree = None

    input_basename = os.path.splitext(os.path.split(input_path)[1])[0]
    sample_ids = otu_table.SampleIds

    # dissimilarities that are not a function of only the pair of samples;
    # for these the whole matrix has to be calculated
    whole_matrix_metrics = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                            'binary_dist_chisq')

    for metric in metrics_list:
        outfilepath = os.path.join(output_dir,
                                   metric + '_' + input_basename + '.txt')
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                # identity test: a tree object may define its own equality
                if tree is None:
                    stderr.write(
                        "metric %s requires a tree, but none found\n"
                        % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n"
                    % (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(otumtx, otu_table.ObservationIds, tree,
                                   sample_ids,
                                   make_subtree=(not full_tree))
            else:
                dissims = metric_f(otumtx)
            with open(outfilepath, 'w') as out_f:
                out_f.write(format_distance_matrix(sample_ids, dissims))
            continue

        # only calc d(rowid1, *) for each rowid
        rowids_list = rowids.split(',')
        row_dissims = []  # same order as rowids_list
        full_dissims = None  # whole matrix, computed at most once per metric
        for rowid in rowids_list:
            rowidx = sample_ids.index(rowid)

            if metric_f.__name__ in whole_matrix_metrics:
                warnings.warn(
                    'dissimilarity ' + metric_f.__name__ +
                    ' is not parallelized, calculating the whole matrix...')
                # the matrix does not depend on rowid, so reuse it rather
                # than recomputing it for every requested row
                if full_dissims is None:
                    full_dissims = metric_f(otumtx)
                row_dissims.append(full_dissims[rowidx])
                continue

            try:
                row_metric = get_phylogenetic_row_metric(metric)
            except AttributeError:
                # no row-wise implementation: do element by element
                dissims = []
                for i, other_id in enumerate(sample_ids):
                    pair_mtx = otumtx[[rowidx, i], :]
                    if is_phylogenetic:
                        pair_dissims = metric_f(
                            pair_mtx, otu_table.ObservationIds, tree,
                            [sample_ids[rowidx], other_id],
                            make_subtree=(not full_tree))
                    else:
                        pair_dissims = metric_f(pair_mtx)
                    dissims.append(pair_dissims[0, 1])
                row_dissims.append(dissims)
            else:
                # do whole row at once
                row_dissims.append(
                    row_metric(otumtx, otu_table.ObservationIds, tree,
                               sample_ids, rowid,
                               make_subtree=(not full_tree)))

        with open(outfilepath, 'w') as out_f:
            out_f.write(format_matrix(row_dissims, rowids_list, sample_ids))
def formatResult(self, result):
    """Format a result as a labelled matrix string via format_matrix.

    result is a 3-tuple (data, sample_names, calc_names): sample_names are
    passed to format_matrix as the row labels and calc_names as the column
    labels.
    """
    matrix, row_labels, col_labels = result
    return format_matrix(matrix, row_labels, col_labels)
def single_file_beta(input_path, metrics, tree_path, output_dir,
                     rowids=None, full_tree=False):
    """ does beta diversity calc on a single otu table

    uses name in metrics to name output beta diversity files: one file per
    metric, written to <output_dir>/<metric>_<input file name>

    When a tree is given and full_tree is False, the tree is first trimmed
    with getSubTree() to the otus of the table (missing otus are ignored).

    inputs:
     input_path (str) -- path of the otu table
     metrics (str, comma delimited if more than 1 metric)
     tree_path (str) -- path of a newick tree; may be empty/None when no
        phylogenetic metric is requested
     output_dir (str)
     rowids (comma separated str) -- when given, only the dissimilarities
        d(rowid, *) are computed and written, one row per listed sample id
     full_tree -- if True, use the tree as read, without trimming it

    Writes a message to stderr and calls exit(1) if a metric is unknown, or
    if a phylogenetic metric is requested without a tree.
    """
    # context managers close the handles even if parsing fails
    with open(input_path, 'U') as otu_f:
        samids, otuids, otumtx, _lineages = parse_otu_table(otu_f)
    # otu mtx is otus by samples; the metrics are handed its transpose
    sample_mtx = otumtx.T

    tree = None
    if tree_path:
        with open(tree_path, 'U') as tree_f:
            tree = parse_newick(tree_f, PhyloNode)
        # trimming only makes sense (and only works) when a tree was read
        if not full_tree:
            tree = tree.getSubTree(otuids, ignore_missing=True)

    input_filename = os.path.split(input_path)[1]

    # dissimilarities that are not a function of only the pair of samples;
    # for these the whole matrix has to be calculated
    whole_matrix_metrics = ('dist_chisq', 'dist_gower', 'dist_hellinger',
                            'binary_dist_chisq')

    for metric in metrics.split(','):
        outfilepath = os.path.join(output_dir, metric + '_' + input_filename)
        try:
            metric_f = get_nonphylogenetic_metric(metric)
            is_phylogenetic = False
        except AttributeError:
            try:
                metric_f = get_phylogenetic_metric(metric)
                is_phylogenetic = True
                # identity test: a tree object may define its own equality
                if tree is None:
                    stderr.write(
                        "metric %s requires a tree, but none found\n"
                        % (metric,))
                    exit(1)
            except AttributeError:
                stderr.write(
                    "Could not find metric %s.\n\nKnown metrics are: %s\n"
                    % (metric, ', '.join(list_known_metrics())))
                exit(1)

        if rowids is None:
            # standard, full way
            if is_phylogenetic:
                dissims = metric_f(sample_mtx, otuids, tree, samids)
            else:
                dissims = metric_f(sample_mtx)
            with open(outfilepath, 'w') as out_f:
                out_f.write(format_distance_matrix(samids, dissims))
            continue

        # only calc d(rowid1, *) for each rowid
        rowids_list = rowids.split(',')
        row_dissims = []  # same order as rowids_list
        full_dissims = None  # whole matrix, computed at most once per metric
        for rowid in rowids_list:
            rowidx = samids.index(rowid)

            if metric_f.__name__ in whole_matrix_metrics:
                # the matrix does not depend on rowid, so reuse it rather
                # than recomputing it for every requested row
                if full_dissims is None:
                    full_dissims = metric_f(sample_mtx)
                row_dissims.append(full_dissims[rowidx])
                continue

            try:
                row_metric = get_phylogenetic_row_metric(metric)
            except AttributeError:
                # no row-wise implementation: do element by element
                dissims = []
                for i, other_id in enumerate(samids):
                    pair_mtx = sample_mtx[[rowidx, i], :]
                    if is_phylogenetic:
                        pair_dissims = metric_f(
                            pair_mtx, otuids, tree,
                            [samids[rowidx], other_id])
                    else:
                        pair_dissims = metric_f(pair_mtx)
                    dissims.append(pair_dissims[0, 1])
                row_dissims.append(dissims)
            else:
                # do whole row at once
                row_dissims.append(
                    row_metric(sample_mtx, otuids, tree, samids, rowid))

        with open(outfilepath, 'w') as out_f:
            out_f.write(format_matrix(row_dissims, rowids_list, samids))