def process_data(in_dir, workflow, ipython_profile=None):
    """Run statistical methods over generated data.

    For real data, creates category and method dirs for original and shuffled
    data. Under each method dir, permutation dirs will also be created, e.g.:

    in_dir/
        ...
            category/
                method/
                    num_perms/
                        <method>_results.txt

    For simulated data, creates method dirs under metric dirs in in_dir,
    e.g.:

    in_dir/
        ...
            metric/
                method/
                    <method>_results.txt
    """
    # Every compare_categories.py/compare_distance_matrices.py invocation is
    # independent of the others, so gather all commands up front and hand
    # them off to the parallel runner in one batch.
    commands = []
    for study_name, study_workflow in workflow.items():
        study_dir = join(in_dir, study_name)

        for depth in study_workflow['depths']:
            depth_dir = join(study_dir, '%d' % depth[0])

            commands += _build_real_data_methods_commands(depth_dir,
                                                          study_workflow)
            commands += _build_simulated_data_methods_commands(depth_dir,
                                                               study_workflow)

    run_parallel_jobs(commands, run_command, ipython_profile=ipython_profile)
def generate_data(analysis_type, in_dir, out_dir, workflow, tree_fp,
                  ipython_profile=None):
    """Generates real and simulated data for each study.

    Distance matrices will be created at each even sampling depth and metric
    using the provided tree if necessary. Shuffled versions of each distance
    matrix will also be created, which can be used as negative controls.
    Additionally, simulated gradient or cluster data will be created at
    varying sample sizes and dissimilarity levels (using simsam.py).

    data_type should be either 'gradient' or 'cluster'.

    Will create the following (heavily nested) output directory structure:

    out_dir/
        study/
            depth/
                even depth otu table (.biom)
                real/
                    metric/
                        original/
                            map.txt
                            dm.txt
                            pc.txt
                            <category>_dm.txt (if gradient)
                        shuff_num
                            map.txt
                            dm.txt
                            pc.txt
                            <category>_dm.txt (if gradient)
                simulated/
                    category/
                        trial_num/
                            samp_size/ (optional)
                                subset files dependent on samp_size
                                dissim/
                                    subset files/dirs dependent on samp_size
                                    metric/
                                        map.txt
                                        dm.txt
                                        pc.txt
                                        <category>_dm.txt (if gradient)
    """
    create_dir(out_dir)

    cmds = []
    for study in workflow:
        study_dir = join(out_dir, study)
        create_dir(study_dir)

        otu_table_fp = join(in_dir, study, 'otu_table.biom')
        map_fp = join(in_dir, study, 'map.txt')
        # Fix: the original opened map_fp here (mode 'U') into a handle that
        # was never read or closed -- a per-study file-descriptor leak. Mode
        # 'U' is also removed in Python 3.11. Only the path (map_fp) is
        # needed by the command builders below, so the open is dropped.

        for depth in workflow[study]['depths']:
            depth_dir = join(study_dir, '%d' % depth[0])
            create_dir(depth_dir)

            # Rarefy the table first since simsam.py's output tables will
            # still have even sampling depth and we don't want to lose
            # simulated samples after the fact.
            even_otu_table_fp = join(depth_dir, basename(otu_table_fp))
            if not exists(even_otu_table_fp):
                run_command('single_rarefaction.py -i %s -o %s -d %d;' % (
                        otu_table_fp, even_otu_table_fp, depth[0]))

            cmds.extend(_build_real_data_commands(analysis_type, depth_dir,
                    even_otu_table_fp, map_fp, tree_fp, workflow[study]))
            cmds.extend(_build_simulated_data_commands(analysis_type,
                    depth_dir, even_otu_table_fp, map_fp, tree_fp,
                    workflow[study]))

    run_parallel_jobs(cmds, run_command, ipython_profile=ipython_profile)
def test_run_parallel_jobs(self):
    """Test running jobs in parallel."""
    # An empty job list must be a silent no-op rather than an error, since
    # a rerun of the workflow can legitimately submit no jobs.
    self.assertIsNone(run_parallel_jobs([], int))