def main():
    """Command-line entry point: validate options and run beta diversity.

    Parses the command line with parse_command_line_parameters, handles
    --show_metrics, checks the "almost required" options, makes sure the
    output directory exists, and then dispatches to multiple_file_beta
    (input path is a directory) or single_file_beta (input path is a file).
    Exits with status 1 when the output or input path is unusable.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.show_metrics:
        print("Known metrics are: %s\n" % (', '.join(list_known_metrics()),))
        exit(0)

    # Checked only after --show_metrics has been handled, so that flag can
    # be used without supplying any of these.
    almost_required_options = ['input_path', 'output_dir', 'metrics']
    for option in almost_required_options:
        if getattr(opts, option) is None:
            option_parser.error('Required option --%s omitted.' % option)

    if opts.output_dir.endswith('.txt'):
        stderr.write('output must be a directory, files will be named' +
                     ' automatically. And we refuse to make .txt directories\n')
        exit(1)

    # The literal string "None" on the command line means "no tree".
    if opts.tree_path == "None":
        opts.tree_path = None

    try:
        os.makedirs(opts.output_dir)
    except OSError as e:
        # An already-existing directory is fine.  Any other failure
        # (permissions, a regular file in the way, ...) used to be silently
        # ignored and would only resurface later as a confusing write error.
        if not os.path.isdir(opts.output_dir):
            stderr.write("io error, could not create output directory %s: %s\n"
                         % (opts.output_dir, e))
            exit(1)

    if os.path.isdir(opts.input_path):
        multiple_file_beta(opts.input_path, opts.output_dir, opts.metrics,
                           opts.tree_path, opts.rows,
                           full_tree=opts.full_tree)
    elif os.path.isfile(opts.input_path):
        single_file_beta(opts.input_path, opts.metrics, opts.tree_path,
                         opts.output_dir, opts.rows,
                         full_tree=opts.full_tree)
    else:
        stderr.write("io error, input path not valid. Does it exist?")
        exit(1)
def single_file_beta(self, otu_table_string, tree_string, missing_sams=None,
                     use_metric_list=False):
    """ running single_file_beta should give same result using --rows

    Writes the OTU table and tree strings to temporary files, runs
    single_file_beta on the whole table for every metric returned by
    list_known_nonphylogenetic_metrics and list_known_phylogenetic_metrics,
    and asserts that per-row runs, per-row full_tree runs and a whole-table
    full_tree run reproduce the same distances.

    otu_table_string: text written to the temporary input file
    tree_string: text written to the temporary tree file
    missing_sams: sample ids skipped in the per-row runs (default: none)
    use_metric_list: if true, each metric is passed as a one-element list
     instead of a bare metric name
    """
    if missing_sams is None:
        missing_sams = []

    # setup
    input_path = get_tmp_filename()
    in_fname = os.path.split(input_path)[1]
    with open(input_path, 'w') as f:
        f.write(otu_table_string)
    tree_path = get_tmp_filename()
    with open(tree_path, 'w') as f:
        f.write(tree_string)
    metrics = list_known_nonphylogenetic_metrics()
    metrics.extend(list_known_phylogenetic_metrics())
    output_dir = get_tmp_filename(suffix='')
    os.mkdir(output_dir)

    # new metrics that don't trivially parallelize must be dealt with
    # carefully
    warnings.filterwarnings(
        'ignore',
        'dissimilarity binary_dist_chisq is'
        ' not parallelized, calculating the whole matrix...')
    warnings.filterwarnings(
        'ignore',
        'dissimilarity dist_chisq is not'
        ' parallelized, calculating the whole matrix...')
    warnings.filterwarnings(
        'ignore',
        'dissimilarity dist_gower is not'
        ' parallelized, calculating the whole matrix...')
    warnings.filterwarnings(
        'ignore',
        'dissimilarity dist_hellinger is'
        ' not parallelized, calculating the whole matrix...')
    warnings.filterwarnings(
        'ignore',
        'unifrac had no information for'
        ' sample M*')

    self.files_to_remove.extend([input_path, tree_path])
    self.folders_to_remove.append(output_dir)
    ft_dir = output_dir + '/ft/'
    os.mkdir(ft_dir)

    def _check_rows(metric_arg, out_dir, row_outname, sams, dmtx, extra):
        # Re-run one sample at a time via rowids and compare every value of
        # the resulting row matrix with the whole-table matrix `dmtx`.
        for sam in sams:
            if sam in missing_sams:
                continue
            rows = sam
            single_file_beta(input_path, metric_arg, tree_path, out_dir,
                             rowids=rows, **extra)
            # unpack inside the with-block so the parser has fully consumed
            # the file before it is closed
            with open(row_outname) as row_f:
                col_sams, row_sams, row_dmtx = parse_matrix(row_f)

            n_rows = len(rows.split(','))
            self.assertEqual(row_dmtx.shape, (n_rows, len(sams)))

            # make sure rows same as full
            for j in range(n_rows):
                full_row = sams.index(row_sams[j])
                for k in range(len(sams)):
                    row_v1 = row_dmtx[j, k]
                    full_v1 = dmtx[full_row, sams.index(col_sams[k])]
                    self.assertFloatEqual(row_v1, full_v1)

    for metric in metrics:
        metric_arg = [metric] if use_metric_list else metric

        # do it
        single_file_beta(input_path, metric_arg, tree_path, output_dir,
                         rowids=None)
        full_outname = output_dir + '/' + metric + '_' + in_fname
        with open(full_outname) as dist_f:
            sams, dmtx = parse_distmat(dist_f)

        # do it by rows (each row run overwrites the same output file)
        _check_rows(metric_arg, output_dir, full_outname, sams, dmtx, {})

        # full tree run:
        if 'full_tree' in str(metric).lower():
            continue

        # do it by rows with full tree
        ft_outname = ft_dir + metric + '_' + in_fname
        _check_rows(metric_arg, ft_dir, ft_outname, sams, dmtx,
                    {'full_tree': True})

        # do it with full tree
        single_file_beta(input_path, metric_arg, tree_path, ft_dir,
                         rowids=None, full_tree=True)
        with open(ft_outname) as dist_f:
            sams_ft, dmtx_ft = parse_distmat(dist_f)
        self.assertEqual(sams_ft, sams)
        self.assertFloatEqual(dmtx_ft, dmtx)
def single_file_beta(self, otu_table_string, tree_string, missing_sams=None,
                     use_metric_list=False):
    """ running single_file_beta should give same result using --rows

    Writes the OTU table and tree strings to temporary files, runs
    single_file_beta on the whole table for every metric returned by
    list_known_nonphylogenetic_metrics and list_known_phylogenetic_metrics,
    and asserts that per-row runs, per-row full_tree runs and a whole-table
    full_tree run reproduce the same distances.

    otu_table_string: text written to the temporary input file
    tree_string: text written to the temporary tree file
    missing_sams: sample ids skipped in the per-row runs (default: none)
    use_metric_list: if true, each metric is passed as a one-element list
     instead of a bare metric name
    """
    if missing_sams is None:
        missing_sams = []

    # setup
    fd, input_path = mkstemp(suffix='.txt')
    close(fd)
    in_fname = os.path.split(input_path)[1]
    with open(input_path, 'w') as f:
        f.write(otu_table_string)
    fd, tree_path = mkstemp(suffix='.tre')
    close(fd)
    with open(tree_path, 'w') as f:
        f.write(tree_string)
    metrics = list_known_nonphylogenetic_metrics()
    metrics.extend(list_known_phylogenetic_metrics())
    output_dir = mkdtemp()

    # new metrics that don't trivially parallelize must be dealt with
    # carefully
    warnings.filterwarnings(
        'ignore',
        'dissimilarity binary_dist_chisq is'
        ' not parallelized, calculating the whole matrix...')
    warnings.filterwarnings(
        'ignore',
        'dissimilarity dist_chisq is not'
        ' parallelized, calculating the whole matrix...')
    warnings.filterwarnings(
        'ignore',
        'dissimilarity dist_gower is not'
        ' parallelized, calculating the whole matrix...')
    warnings.filterwarnings(
        'ignore',
        'dissimilarity dist_hellinger is'
        ' not parallelized, calculating the whole matrix...')
    warnings.filterwarnings(
        'ignore',
        'unifrac had no information for'
        ' sample M*')

    self.files_to_remove.extend([input_path, tree_path])
    self.folders_to_remove.append(output_dir)
    ft_dir = output_dir + '/ft/'
    os.mkdir(ft_dir)

    def _check_rows(metric_arg, out_dir, row_outname, sams, dmtx, extra):
        # Re-run one sample at a time via rowids and compare every value of
        # the resulting row matrix with the whole-table matrix `dmtx`.
        for sam in sams:
            if sam in missing_sams:
                continue
            rows = sam
            single_file_beta(input_path, metric_arg, tree_path, out_dir,
                             rowids=rows, **extra)
            # unpack inside the with-block so the parser has fully consumed
            # the file before it is closed
            with open(row_outname) as row_f:
                col_sams, row_sams, row_dmtx = parse_matrix(row_f)

            n_rows = len(rows.split(','))
            self.assertEqual(row_dmtx.shape, (n_rows, len(sams)))

            # make sure rows same as full
            for j in range(n_rows):
                full_row = sams.index(row_sams[j])
                for k in range(len(sams)):
                    row_v1 = row_dmtx[j, k]
                    full_v1 = dmtx[full_row, sams.index(col_sams[k])]
                    assert_almost_equal(row_v1, full_v1)

    for metric in metrics:
        metric_arg = [metric] if use_metric_list else metric

        # do it
        single_file_beta(input_path, metric_arg, tree_path, output_dir,
                         rowids=None)
        full_outname = output_dir + '/' + metric + '_' + in_fname
        with open(full_outname) as dist_f:
            sams, dmtx = parse_distmat(dist_f)

        # do it by rows (each row run overwrites the same output file)
        _check_rows(metric_arg, output_dir, full_outname, sams, dmtx, {})

        # full tree run:
        if 'full_tree' in str(metric).lower():
            continue

        # do it by rows with full tree
        ft_outname = ft_dir + metric + '_' + in_fname
        _check_rows(metric_arg, ft_dir, ft_outname, sams, dmtx,
                    {'full_tree': True})

        # do it with full tree
        single_file_beta(input_path, metric_arg, tree_path, ft_dir,
                         rowids=None, full_tree=True)
        with open(ft_outname) as dist_f:
            sams_ft, dmtx_ft = parse_distmat(dist_f)
        self.assertEqual(sams_ft, sams)
        assert_almost_equal(dmtx_ft, dmtx)
def single_file_beta(self, otu_table_string, tree_string, missing_sams=None, use_metric_list=False):
    """running single_file_beta should give same result using --rows

    Writes the OTU table and tree strings to temporary files, runs
    single_file_beta on the whole table for every metric returned by
    list_known_nonphylogenetic_metrics and list_known_phylogenetic_metrics,
    and asserts that per-row runs, per-row full_tree runs and a whole-table
    full_tree run reproduce the same distances.

    otu_table_string: text written to the temporary input file
    tree_string: text written to the temporary tree file
    missing_sams: sample ids skipped in the per-row runs (default: none)
    use_metric_list: if true, each metric is passed as a one-element list
     instead of a bare metric name
    """
    if missing_sams is None:
        missing_sams = []

    # setup
    fd, input_path = mkstemp(suffix=".txt")
    os.close(fd)
    in_fname = os.path.split(input_path)[1]
    with open(input_path, "w") as f:
        f.write(otu_table_string)
    fd, tree_path = mkstemp(suffix=".tre")
    os.close(fd)
    with open(tree_path, "w") as f:
        f.write(tree_string)
    metrics = list_known_nonphylogenetic_metrics()
    metrics.extend(list_known_phylogenetic_metrics())
    output_dir = mkdtemp()

    # new metrics that don't trivially parallelize must be dealt with
    # carefully
    warnings.filterwarnings(
        "ignore",
        "dissimilarity binary_dist_chisq is"
        " not parallelized, calculating the whole matrix...",
    )
    warnings.filterwarnings(
        "ignore",
        "dissimilarity dist_chisq is not"
        " parallelized, calculating the whole matrix...",
    )
    warnings.filterwarnings(
        "ignore",
        "dissimilarity dist_gower is not"
        " parallelized, calculating the whole matrix...",
    )
    warnings.filterwarnings(
        "ignore",
        "dissimilarity dist_hellinger is"
        " not parallelized, calculating the whole matrix...",
    )
    warnings.filterwarnings(
        "ignore",
        "unifrac had no information for"
        " sample M*",
    )

    self.files_to_remove.extend([input_path, tree_path])
    self.folders_to_remove.append(output_dir)
    ft_dir = output_dir + "/ft/"
    os.mkdir(ft_dir)

    def _check_rows(metric_arg, out_dir, row_outname, sams, dmtx, extra):
        # Re-run one sample at a time via rowids and compare every value of
        # the resulting row matrix with the whole-table matrix `dmtx`.
        for sam in sams:
            if sam in missing_sams:
                continue
            rows = sam
            single_file_beta(input_path, metric_arg, tree_path, out_dir, rowids=rows, **extra)
            # unpack inside the with-block so the parser has fully consumed
            # the file before it is closed
            with open(row_outname) as row_f:
                col_sams, row_sams, row_dmtx = parse_matrix(row_f)

            n_rows = len(rows.split(","))
            self.assertEqual(row_dmtx.shape, (n_rows, len(sams)))

            # make sure rows same as full
            for j in range(n_rows):
                full_row = sams.index(row_sams[j])
                for k in range(len(sams)):
                    row_v1 = row_dmtx[j, k]
                    full_v1 = dmtx[full_row, sams.index(col_sams[k])]
                    npt.assert_almost_equal(row_v1, full_v1)

    for metric in metrics:
        metric_arg = [metric] if use_metric_list else metric

        # do it
        single_file_beta(input_path, metric_arg, tree_path, output_dir, rowids=None)
        full_outname = output_dir + "/" + metric + "_" + in_fname
        with open(full_outname) as dist_f:
            sams, dmtx = parse_distmat(dist_f)

        # do it by rows (each row run overwrites the same output file)
        _check_rows(metric_arg, output_dir, full_outname, sams, dmtx, {})

        # full tree run:
        if "full_tree" in str(metric).lower():
            continue

        # do it by rows with full tree
        ft_outname = ft_dir + metric + "_" + in_fname
        _check_rows(metric_arg, ft_dir, ft_outname, sams, dmtx, {"full_tree": True})

        # do it with full tree
        single_file_beta(input_path, metric_arg, tree_path, ft_dir, rowids=None, full_tree=True)
        with open(ft_outname) as dist_f:
            sams_ft, dmtx_ft = parse_distmat(dist_f)
        self.assertEqual(sams_ft, sams)
        npt.assert_almost_equal(dmtx_ft, dmtx)