def _build_real_data_commands(analysis_type, out_dir, even_otu_table_fp,
                              map_fp, tree_fp, workflow):
    """Build commands to process the real data, one command set per metric.

    For every beta-diversity metric in the workflow, checks whether the
    'original' results directory and every shuffled-trial directory already
    contain the expected output files; a per-metric command is emitted only
    when something is missing.
    """
    commands = []
    real_dir = join(out_dir, 'real')
    create_dir(real_dir)

    for metric in workflow['metrics']:
        metric_dir = join(real_dir, metric[0])
        create_dir(metric_dir)
        original_dir = join(metric_dir, 'original')
        create_dir(original_dir)

        # Files every results directory must contain; gradient analyses also
        # need one gradient distance matrix per category.
        expected_files = ['dm.txt', 'map.txt', 'pc.txt']
        if analysis_type == 'gradient':
            for category in workflow['categories']:
                expected_files.append('%s_dm.txt' % category[0])

        original_done = has_results(original_dir,
                                    required_files=expected_files)
        # all() short-circuits on the first incomplete shuffled trial, just
        # like the original early-break loop.
        shuffled_done = all(
            has_results(join(metric_dir, '%d' % trial), expected_files)
            for trial in range(workflow['num_shuffled_trials']))

        if not (original_done and shuffled_done):
            commands.append(_build_per_metric_real_data_commands(
                analysis_type, metric_dir, even_otu_table_fp, map_fp,
                tree_fp, metric, workflow['categories'],
                workflow['num_shuffled_trials']))

    return commands
def _build_real_data_methods_commands(out_dir, workflow):
    """Build commands to run each statistical method on the real data.

    Walks every metric's 'original' and shuffled-trial directories, and for
    each category and method emits the appropriate script invocation
    (compare_categories.py, compare_distance_matrices.py, or
    ordination_correlation.py) for every permutation count in
    workflow['num_real_data_perms'], skipping any directory that already
    has results. Best and PartialMantel are excluded from the per-category
    loop; Best is handled once per trial directory below.
    """
    cmds = []
    data_type_dir = join(out_dir, 'real')
    num_shuffled_trials = workflow['num_shuffled_trials']
    num_perms = workflow['num_real_data_perms']

    for metric in workflow['metrics']:
        metric_dir = join(data_type_dir, metric[0])

        # BUG FIX: the original used "['original'] + map(str, range(...))",
        # which raises TypeError under Python 3 (map returns an iterator,
        # not a list). Build the list explicitly instead.
        dirs_to_process = \
                ['original'] + [str(i) for i in range(num_shuffled_trials)]

        for dir_to_process in dirs_to_process:
            dir_to_process = join(metric_dir, dir_to_process)
            dm_fp = join(dir_to_process, 'dm.txt')
            pc_fp = join(dir_to_process, 'pc.txt')
            map_fp = join(dir_to_process, 'map.txt')

            for category in workflow['categories']:
                category_dir = join(dir_to_process, category[0])
                create_dir(category_dir)
                grad_dm_fp = join(dir_to_process, '%s_dm.txt' % category[0])

                for method in workflow['methods']:
                    # Best runs once per trial dir (below); PartialMantel is
                    # not handled by this function at all. Exact type checks
                    # (not isinstance) are kept to preserve dispatch behavior
                    # for any method subclasses.
                    if type(method) is Best or type(method) is PartialMantel:
                        continue
                    method_dir = join(category_dir, method.DirectoryName)
                    create_dir(method_dir)

                    if type(method) is MoransI:
                        # Moran's I takes no permutation count, so there is a
                        # single results directory per category.
                        if not has_results(method_dir):
                            cmds.append('compare_categories.py --method %s '
                                        '-i %s -m %s -c %s -o %s'
                                        % (method.DirectoryName, dm_fp,
                                           map_fp, category[0], method_dir))
                    else:
                        for perms in num_perms:
                            perms_dir = join(method_dir, '%d' % perms)
                            create_dir(perms_dir)
                            if not has_results(perms_dir):
                                if type(method) is Mantel or \
                                   type(method) is MantelCorrelogram:
                                    in_dm_fps = ','.join((dm_fp, grad_dm_fp))
                                    cmds.append(
                                        'compare_distance_matrices.py '
                                        '--method %s -n %d -i %s -o %s'
                                        % (method.DirectoryName, perms,
                                           in_dm_fps, perms_dir))
                                elif type(method) is \
                                        PearsonOrdinationCorrelation:
                                    cmds.append(
                                        'ordination_correlation.py -n %d '
                                        '-i %s -m %s -c %s -o %s -t pearson'
                                        % (perms, pc_fp, map_fp, category[0],
                                           perms_dir))
                                elif type(method) is \
                                        SpearmanOrdinationCorrelation:
                                    cmds.append(
                                        'ordination_correlation.py -n %d '
                                        '-i %s -m %s -c %s -o %s -t spearman'
                                        % (perms, pc_fp, map_fp, category[0],
                                           perms_dir))
                                else:
                                    cmds.append(
                                        'compare_categories.py --method %s '
                                        '-i %s -m %s -c %s -o %s -n %d'
                                        % (method.DirectoryName, dm_fp,
                                           map_fp, category[0], perms_dir,
                                           perms))

            # Best (BioEnv) is category-independent: it runs once per trial
            # directory over the full comma-joined list of environmental
            # variables, with no permutation loop.
            if Best() in workflow['methods']:
                best_dir = join(dir_to_process, Best().DirectoryName)
                if not has_results(best_dir):
                    env_vars = ','.join(workflow['best_method_env_vars'])
                    cmds.append('compare_categories.py --method %s -i %s '
                                '-m %s -c %s -o %s'
                                % (Best().DirectoryName, dm_fp, map_fp,
                                   env_vars, best_dir))

    return cmds
def _build_simulated_data_methods_commands(out_dir, workflow):
    """Build commands to run each statistical method on the simulated data.

    Walks the simulated-data directory tree
    (category / trial / sample size / dissimilarity / metric) that
    _build_simulated_data_commands is expected to have populated, and for
    each method emits the matching script invocation using a single
    permutation count, workflow['num_sim_data_perms']. Directories that
    already contain results are skipped. Best and PartialMantel are not
    handled here.

    NOTE(review): the sanity checks below use bare `assert`, which is
    stripped under `python -O` — the sample-size validation silently
    disappears in optimized runs.
    """
    cmds = []
    data_type_dir = join(out_dir, 'simulated')
    num_sim_data_trials = workflow['num_sim_data_trials']
    num_sim_data_perms = workflow['num_sim_data_perms']
    for category in workflow['categories']:
        category_dir = join(data_type_dir, category[0])
        for trial_num in range(num_sim_data_trials):
            trial_num_dir = join(category_dir, '%d' % trial_num)
            for samp_size in workflow['sample_sizes']:
                samp_size_dir = join(trial_num_dir, '%d' % samp_size)
                for d in workflow['dissim']:
                    dissim_dir = join(samp_size_dir, repr(d))
                    for metric in workflow['metrics']:
                        metric_dir = join(dissim_dir, metric[0])
                        dm_fp = join(metric_dir, 'dm.txt')
                        pc_fp = join(metric_dir, 'pc.txt')
                        map_fp = join(metric_dir, 'map.txt')
                        # Gradient distance matrix for the current category,
                        # used only by the Mantel-style methods below.
                        grad_dm_fp = join(metric_dir,
                                          '%s_dm.txt' % category[0])
                        # Sanity checks: the input files must match the
                        # sample size encoded in the directory name.
                        assert get_num_samples_in_distance_matrix(
                                dm_fp) == samp_size
                        assert get_num_samples_in_map(map_fp) == samp_size
                        for method in workflow['methods']:
                            # Best and PartialMantel are skipped entirely
                            # (exact type checks, not isinstance).
                            if type(method) is Best or \
                               type(method) is PartialMantel:
                                continue
                            method_dir = join(metric_dir,
                                              method.DirectoryName)
                            create_dir(method_dir)
                            if not has_results(method_dir):
                                if type(method) is Mantel or \
                                   type(method) is MantelCorrelogram:
                                    assert \
                                        get_num_samples_in_distance_matrix(
                                            grad_dm_fp) == samp_size
                                    in_dm_fps = ','.join((dm_fp, grad_dm_fp))
                                    cmds.append(
                                        'compare_distance_matrices.py '
                                        '--method %s -n %d -i %s -o %s'
                                        % (method.DirectoryName,
                                           num_sim_data_perms, in_dm_fps,
                                           method_dir))
                                elif type(method) is \
                                        PearsonOrdinationCorrelation:
                                    cmds.append(
                                        'ordination_correlation.py -n %d '
                                        '-i %s -m %s -c %s -o %s -t pearson'
                                        % (num_sim_data_perms, pc_fp, map_fp,
                                           category[0], method_dir))
                                elif type(method) is \
                                        SpearmanOrdinationCorrelation:
                                    cmds.append(
                                        'ordination_correlation.py -n %d '
                                        '-i %s -m %s -c %s -o %s -t spearman'
                                        % (num_sim_data_perms, pc_fp, map_fp,
                                           category[0], method_dir))
                                else:
                                    # All remaining methods go through
                                    # compare_categories.py.
                                    cmds.append(
                                        'compare_categories.py --method %s '
                                        '-i %s -m %s -c %s -o %s -n %d'
                                        % (method.DirectoryName, dm_fp,
                                           map_fp, category[0], method_dir,
                                           num_sim_data_perms))
    return cmds
def test_has_results(self):
    """Test checking a directory for results."""
    # Dir that doesn't exist.
    obs = has_results('/foobarbazbazbarfoo1234567890')
    self.assertFalse(obs)

    # Dir that exists but is empty.
    obs = has_results(self.input_dir)
    self.assertFalse(obs)

    # Dir that exists, with no required files, but is empty.
    obs = has_results(self.input_dir, required_files=[])
    self.assertFalse(obs)

    # Dir that exists and is not empty. Use a context manager so the file
    # handle is closed even if the write raises (the original used bare
    # open/write/close and would have leaked the handle on error).
    tmp_fp = join(self.input_dir, 'foo.txt')
    with open(tmp_fp, 'w') as tmp_f:
        tmp_f.write('foo')
    self.files_to_remove.append(tmp_fp)
    obs = has_results(self.input_dir)
    self.assertTrue(obs)

    # Dir that exists and is not empty, with no required files.
    obs = has_results(self.input_dir, required_files=[])
    self.assertTrue(obs)

    # Dir that exists, is not empty, and has the required file.
    obs = has_results(self.input_dir, required_files=['foo.txt'])
    self.assertTrue(obs)

    # Dir that exists and is not empty, but doesn't have required files.
    obs = has_results(self.input_dir,
                      required_files=['foo.txt', 'bar.txt', 'baz.txt'])
    self.assertFalse(obs)
def _build_simulated_data_commands(analysis_type, out_dir, even_otu_table_fp,
                                   map_fp, tree_fp, workflow):
    """Build commands to generate the simulated data sets.

    For every category / trial / sample size / dissimilarity combination,
    produces a chained shell command ('&&'-joined) that simulates samples
    with simsam.py and then, per metric, computes a distance matrix
    (beta_diversity.py), copies the mapping file, and runs
    principal_coordinates.py. For gradient analyses a per-category gradient
    distance matrix is also generated. Work is skipped for any directory
    that already has the expected outputs.

    Two cases, depending on whether the requested sample size fits in the
    real table: subset first then simulate (samp_size <= num_samps), or
    simulate extra replicates first then subset down (samp_size >
    num_samps).

    NOTE(review): the `assert` sanity checks are stripped under
    `python -O`; run_command is executed eagerly here (subsetting), unlike
    the returned commands which the caller presumably runs later.
    """
    cmds = []
    data_type_dir = join(out_dir, 'simulated')
    create_dir(data_type_dir)
    num_samps = get_num_samples_in_table(even_otu_table_fp)
    for category in workflow['categories']:
        category_dir = join(data_type_dir, category[0])
        create_dir(category_dir)
        for trial_num in range(workflow['num_sim_data_trials']):
            trial_num_dir = join(category_dir, '%d' % trial_num)
            create_dir(trial_num_dir)
            for samp_size in workflow['sample_sizes']:
                samp_size_dir = join(trial_num_dir, '%d' % samp_size)
                create_dir(samp_size_dir)
                # Lots of duplicate code between these two blocks...
                # need to refactor and test.
                if samp_size <= num_samps:
                    # Enough real samples: subset the real table/map down to
                    # samp_size, then simulate from the subset (one
                    # simsam replicate).
                    simsam_rep_num = 1
                    subset_otu_table_fp = join(samp_size_dir,
                                               basename(even_otu_table_fp))
                    subset_map_fp = join(samp_size_dir, basename(map_fp))
                    # Subsetting runs immediately (run_command), not as a
                    # deferred command.
                    if not has_results(samp_size_dir,
                                       required_files=[
                                           basename(subset_otu_table_fp),
                                           basename(subset_map_fp)]):
                        run_command('choose_data_subset.py -t %s -i %s '
                                    '-m %s -c %s -n %d -o %s'
                                    % (analysis_type, even_otu_table_fp,
                                       map_fp, category[0], samp_size,
                                       samp_size_dir))
                    assert get_num_samples_in_table(
                            subset_otu_table_fp) == samp_size
                    assert get_num_samples_in_map(subset_map_fp) == samp_size
                    for d in workflow['dissim']:
                        dissim_dir = join(samp_size_dir, repr(d))
                        create_dir(dissim_dir)
                        # Output filenames simsam.py is expected to produce
                        # for this replicate count and dissimilarity.
                        simsam_map_fp = join(
                                dissim_dir,
                                add_filename_suffix(
                                    subset_map_fp,
                                    '_n%d_d%r' % (simsam_rep_num, d)))
                        simsam_otu_table_fp = join(
                                dissim_dir,
                                add_filename_suffix(
                                    subset_otu_table_fp,
                                    '_n%d_d%r' % (simsam_rep_num, d)))
                        # Check for simulated table/map and various
                        # distance matrices / coordinates files.
                        required_simsam_files = [
                                basename(simsam_map_fp),
                                basename(simsam_otu_table_fp)]
                        has_simsam_files = has_results(
                                dissim_dir,
                                required_files=required_simsam_files)
                        has_metric_files = True
                        for metric in workflow['metrics']:
                            required_metric_files = ['dm.txt', 'map.txt',
                                                     'pc.txt']
                            if analysis_type == 'gradient':
                                required_metric_files.append(
                                        '%s_dm.txt' % category[0])
                            metric_dir = join(dissim_dir, metric[0])
                            has_metric_files = has_results(
                                    metric_dir, required_metric_files)
                            if not has_metric_files:
                                break
                        if not (has_simsam_files and has_metric_files):
                            # Chain simsam + per-metric processing into one
                            # '&&'-joined shell command.
                            cmd = ['simsam.py -i %s -t %s -o %s -d %r '
                                   '-n %d -m %s'
                                   % (subset_otu_table_fp, tree_fp,
                                      dissim_dir, d, simsam_rep_num,
                                      subset_map_fp)]
                            for metric in workflow['metrics']:
                                metric_dir = join(dissim_dir, metric[0])
                                create_dir(metric_dir)
                                if analysis_type == 'gradient':
                                    cmd.append(
                                        'distance_matrix_from_mapping.py '
                                        '-i %s -c %s -o %s'
                                        % (simsam_map_fp, category[0],
                                           join(metric_dir,
                                                '%s_dm.txt' % category[0])))
                                cmd.append('beta_diversity.py -i %s -o %s '
                                           '-m %s -t %s'
                                           % (simsam_otu_table_fp,
                                              metric_dir, metric[0],
                                              tree_fp))
                                # beta_diversity.py names its output after
                                # the metric and input table; rename it to
                                # the canonical dm.txt.
                                cmd.append('mv %s %s'
                                           % (join(metric_dir,
                                                   '%s_%s.txt'
                                                   % (metric[0],
                                                      splitext(basename(
                                                          simsam_otu_table_fp
                                                          ))[0])),
                                              join(metric_dir, 'dm.txt')))
                                cmd.append('cp %s %s'
                                           % (simsam_map_fp,
                                              join(metric_dir, 'map.txt')))
                                cmd.append('principal_coordinates.py '
                                           '-i %s -o %s'
                                           % (join(metric_dir, 'dm.txt'),
                                              join(metric_dir, 'pc.txt')))
                            cmds.append(' && '.join(cmd))
                else:
                    # We need to simulate more samples than we originally
                    # have.
                    simsam_rep_num = get_simsam_rep_num(samp_size, num_samps)
                    for d in workflow['dissim']:
                        dissim_dir = join(samp_size_dir, repr(d))
                        create_dir(dissim_dir)
                        simsam_map_fp = join(
                                dissim_dir,
                                add_filename_suffix(
                                    map_fp,
                                    '_n%d_d%r' % (simsam_rep_num, d)))
                        simsam_otu_table_fp = join(
                                dissim_dir,
                                add_filename_suffix(
                                    even_otu_table_fp,
                                    '_n%d_d%r' % (simsam_rep_num, d)))
                        required_simsam_files = [
                                basename(simsam_map_fp),
                                basename(simsam_otu_table_fp)]
                        has_simsam_files = has_results(
                                dissim_dir,
                                required_files=required_simsam_files)
                        # choose_data_subset.py writes files with the same
                        # basenames into the 'subset' subdirectory, so the
                        # same names are checked there.
                        required_subset_files = [
                                basename(simsam_map_fp),
                                basename(simsam_otu_table_fp)]
                        has_subset_files = has_results(
                                join(dissim_dir, 'subset'),
                                required_files=required_subset_files)
                        has_metric_files = True
                        for metric in workflow['metrics']:
                            required_metric_files = ['dm.txt', 'map.txt',
                                                     'pc.txt']
                            if analysis_type == 'gradient':
                                required_metric_files.append(
                                        '%s_dm.txt' % category[0])
                            metric_dir = join(dissim_dir, metric[0])
                            has_metric_files = has_results(
                                    metric_dir, required_metric_files)
                            if not has_metric_files:
                                break
                        if not (has_simsam_files and has_subset_files and
                                has_metric_files):
                            # Simulate from the full table first, then
                            # subset the simulated data down to samp_size.
                            cmd = ['simsam.py -i %s -t %s -o %s -d %r '
                                   '-n %d -m %s'
                                   % (even_otu_table_fp, tree_fp, dissim_dir,
                                      d, simsam_rep_num, map_fp)]
                            subset_dir = join(dissim_dir, 'subset')
                            cmd.append('choose_data_subset.py -t %s -i %s '
                                       '-m %s -c %s -n %d -o %s'
                                       % (analysis_type, simsam_otu_table_fp,
                                          simsam_map_fp, category[0],
                                          samp_size, subset_dir))
                            subset_otu_table_fp = join(
                                    subset_dir,
                                    basename(simsam_otu_table_fp))
                            subset_map_fp = join(subset_dir,
                                                 basename(simsam_map_fp))
                            for metric in workflow['metrics']:
                                metric_dir = join(dissim_dir, metric[0])
                                create_dir(metric_dir)
                                if analysis_type == 'gradient':
                                    cmd.append(
                                        'distance_matrix_from_mapping.py '
                                        '-i %s -c %s -o %s'
                                        % (subset_map_fp, category[0],
                                           join(metric_dir,
                                                '%s_dm.txt' % category[0])))
                                cmd.append('beta_diversity.py -i %s -o %s '
                                           '-m %s -t %s'
                                           % (subset_otu_table_fp,
                                              metric_dir, metric[0],
                                              tree_fp))
                                # Rename beta_diversity.py's metric-named
                                # output to the canonical dm.txt.
                                cmd.append('mv %s %s'
                                           % (join(metric_dir,
                                                   '%s_%s.txt'
                                                   % (metric[0],
                                                      splitext(basename(
                                                          subset_otu_table_fp
                                                          ))[0])),
                                              join(metric_dir, 'dm.txt')))
                                cmd.append('cp %s %s'
                                           % (subset_map_fp,
                                              join(metric_dir, 'map.txt')))
                                cmd.append('principal_coordinates.py '
                                           '-i %s -o %s'
                                           % (join(metric_dir, 'dm.txt'),
                                              join(metric_dir, 'pc.txt')))
                            cmds.append(' && '.join(cmd))
    return cmds