Example #1: _build_real_data_commands
def _build_real_data_commands(analysis_type, out_dir, even_otu_table_fp,
                              map_fp, tree_fp, workflow):
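    """Build commands that generate per-metric results for the real data.

    For each metric, results are expected under out_dir/real/<metric>/ in an
    'original' directory plus one directory per shuffled trial. Commands are
    only emitted for metrics whose directories are missing the required
    results files (dm.txt, map.txt, pc.txt, and per-category gradient
    distance matrices for gradient analyses).
    """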
    cmds = []

    data_type_dir = join(out_dir, 'real')
    create_dir(data_type_dir)

    for metric in workflow['metrics']:
        metric_dir = join(data_type_dir, metric[0])
        create_dir(metric_dir)

        orig_dir = join(metric_dir, 'original')
        create_dir(orig_dir)

        required_files = ['dm.txt', 'map.txt', 'pc.txt']
        if analysis_type == 'gradient':
            for category in workflow['categories']:
                required_files.append('%s_dm.txt' % category[0])

        has_orig_files = has_results(orig_dir, required_files=required_files)

        has_shuff_files = True
        for shuff_num in range(workflow['num_shuffled_trials']):
            shuff_num_dir = join(metric_dir, '%d' % shuff_num)
            has_shuff_files = has_results(shuff_num_dir, required_files)
            if not has_shuff_files:
                break

        if not (has_orig_files and has_shuff_files):
            cmds.append(_build_per_metric_real_data_commands(analysis_type,
                    metric_dir, even_otu_table_fp, map_fp, tree_fp, metric,
                    workflow['categories'], workflow['num_shuffled_trials']))
    return cmds
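
All of the builders in these examples read their settings from a single `workflow` dictionary defined elsewhere in the project. Based only on the keys accessed above, a minimal configuration might look like the following sketch; every concrete value, and the (name, description) shape of the metric and category entries, is an illustrative assumption rather than the project's defaults.

# Hypothetical workflow configuration inferred from the keys used in these
# examples; all values below are placeholders.
example_workflow = {
    'metrics': [('unweighted_unifrac', 'Unweighted UniFrac'),
                ('bray_curtis', 'Bray-Curtis')],
    'categories': [('Latitude', 'Latitude of collection site')],
    'num_shuffled_trials': 2,
    'num_real_data_perms': [99, 999],   # iterated over in Example #2
    'num_sim_data_trials': 3,
    'num_sim_data_perms': 999,          # used as a single count in Example #3
    'sample_sizes': [5, 10, 20],
    'dissim': [0.0, 0.001, 0.01],
    'methods': [],                      # e.g. Mantel(), MoransI(), Best(), ...
    'best_method_env_vars': ['Latitude', 'Longitude'],
}
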
Example #2: _build_real_data_methods_commands
def _build_real_data_methods_commands(out_dir, workflow):
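    """Build commands that run the statistical methods on the real data
    results (the 'original' run and each shuffled trial).

    One output directory is created per metric/category/method, and per
    permutation count where the method uses permutations. Method directories
    that already contain results are skipped. MoransI is run without a
    permutation setting, Best is run once per results directory against the
    configured environmental variables, and PartialMantel is not handled
    here.
    """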
    cmds = []

    data_type_dir = join(out_dir, 'real')

    num_shuffled_trials = workflow['num_shuffled_trials']
    num_perms = workflow['num_real_data_perms']

    for metric in workflow['metrics']:
        metric_dir = join(data_type_dir, metric[0])

        dirs_to_process = ['original'] + map(str, range(num_shuffled_trials))
        for dir_to_process in dirs_to_process:
            dir_to_process = join(metric_dir, dir_to_process)

            dm_fp = join(dir_to_process, 'dm.txt')
            pc_fp = join(dir_to_process, 'pc.txt')
            map_fp = join(dir_to_process, 'map.txt')

            for category in workflow['categories']:
                category_dir = join(dir_to_process, category[0])
                create_dir(category_dir)

                grad_dm_fp = join(dir_to_process, '%s_dm.txt' % category[0])

                for method in workflow['methods']:
                    if type(method) is Best or type(method) is PartialMantel:
                        continue

                    method_dir = join(category_dir, method.DirectoryName)
                    create_dir(method_dir)

                    if type(method) is MoransI:
                        if not has_results(method_dir):
                            cmds.append(
                                'compare_categories.py --method %s -i %s '
                                '-m %s -c %s -o %s' %
                                (method.DirectoryName, dm_fp, map_fp,
                                 category[0], method_dir))
                    else:
                        for perms in num_perms:
                            perms_dir = join(method_dir, '%d' % perms)
                            create_dir(perms_dir)

                            if not has_results(perms_dir):
                                if (type(method) is Mantel or
                                        type(method) is MantelCorrelogram):
                                    in_dm_fps = ','.join((dm_fp, grad_dm_fp))
                                    cmds.append(
                                        'compare_distance_matrices.py '
                                        '--method %s -n %d -i %s -o %s' %
                                        (method.DirectoryName, perms,
                                         in_dm_fps, perms_dir))
                                elif type(method) is PearsonOrdinationCorrelation:
                                    cmds.append(
                                        'ordination_correlation.py -n %d '
                                        '-i %s -m %s -c %s -o %s -t pearson' %
                                        (perms, pc_fp, map_fp, category[0],
                                         perms_dir))
                                elif type(method) is SpearmanOrdinationCorrelation:
                                    cmds.append(
                                        'ordination_correlation.py -n %d '
                                        '-i %s -m %s -c %s -o %s -t spearman' %
                                        (perms, pc_fp, map_fp, category[0],
                                         perms_dir))
                                else:
                                    cmds.append(
                                        'compare_categories.py --method %s '
                                        '-i %s -m %s -c %s -o %s -n %d' %
                                        (method.DirectoryName, dm_fp, map_fp,
                                         category[0], perms_dir, perms))

            if Best() in workflow['methods']:
                best_dir = join(dir_to_process, Best().DirectoryName)

                if not has_results(best_dir):
                    env_vars = ','.join(workflow['best_method_env_vars'])
                    cmds.append(
                        'compare_categories.py --method %s -i %s -m %s '
                        '-c %s -o %s' %
                        (Best().DirectoryName, dm_fp, map_fp, env_vars,
                         best_dir))
    return cmds
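
The `method` objects dispatched on above are assumed to be small marker classes that expose a `DirectoryName` attribute and compare equal by type (Example #2 checks `Best() in workflow['methods']`). Below is a minimal sketch of what such classes might look like; the class set and directory names are assumptions based only on how they are used here.

# Hypothetical marker classes; only the DirectoryName attribute and
# type-based equality are inferred from the builder code above.
class StatMethod(object):
    DirectoryName = None

    def __eq__(self, other):
        # Compare by type so that `Best() in workflow['methods']` works.
        return type(self) is type(other)

    def __ne__(self, other):
        return not self == other


class Best(StatMethod):
    DirectoryName = 'best'        # assumed directory/CLI name


class MoransI(StatMethod):
    DirectoryName = 'morans_i'    # assumed directory/CLI name


class Mantel(StatMethod):
    DirectoryName = 'mantel'      # assumed directory/CLI name

# MantelCorrelogram, PartialMantel, PearsonOrdinationCorrelation, and
# SpearmanOrdinationCorrelation would follow the same pattern.
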
Example #3: _build_simulated_data_methods_commands
def _build_simulated_data_methods_commands(out_dir, workflow):
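    """Build commands that run the statistical methods on the simulated data
    results, for every category/trial/sample size/dissimilarity/metric
    combination.

    Method directories that already contain results are skipped, and Best
    and PartialMantel are not handled here. The number of samples in each
    distance matrix and mapping file is asserted to match the requested
    sample size before any commands are built.
    """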
    cmds = []

    data_type_dir = join(out_dir, 'simulated')

    num_sim_data_trials = workflow['num_sim_data_trials']
    num_sim_data_perms = workflow['num_sim_data_perms']

    for category in workflow['categories']:
        category_dir = join(data_type_dir, category[0])

        for trial_num in range(num_sim_data_trials):
            trial_num_dir = join(category_dir, '%d' % trial_num)

            for samp_size in workflow['sample_sizes']:
                samp_size_dir = join(trial_num_dir, '%d' % samp_size)

                for d in workflow['dissim']:
                    dissim_dir = join(samp_size_dir, repr(d))

                    for metric in workflow['metrics']:
                        metric_dir = join(dissim_dir, metric[0])

                        dm_fp = join(metric_dir, 'dm.txt')
                        pc_fp = join(metric_dir, 'pc.txt')
                        map_fp = join(metric_dir, 'map.txt')
                        grad_dm_fp = join(metric_dir,
                                          '%s_dm.txt' % category[0])
                        assert get_num_samples_in_distance_matrix(dm_fp) == samp_size
                        assert get_num_samples_in_map(map_fp) == samp_size

                        for method in workflow['methods']:
                            if type(method) is Best or type(method) is PartialMantel:
                                continue
                            method_dir = join(metric_dir, method.DirectoryName)
                            create_dir(method_dir)

                            if not has_results(method_dir):
                                if (type(method) is Mantel or
                                        type(method) is MantelCorrelogram):
                                    assert get_num_samples_in_distance_matrix(
                                        grad_dm_fp) == samp_size
                                    in_dm_fps = ','.join((dm_fp, grad_dm_fp))
                                    cmds.append(
                                        'compare_distance_matrices.py '
                                        '--method %s -n %d -i %s -o %s' %
                                        (method.DirectoryName,
                                         num_sim_data_perms, in_dm_fps,
                                         method_dir))
                                elif type(method) is PearsonOrdinationCorrelation:
                                    cmds.append(
                                        'ordination_correlation.py -n %d '
                                        '-i %s -m %s -c %s -o %s -t pearson' %
                                        (num_sim_data_perms, pc_fp, map_fp,
                                         category[0], method_dir))
                                elif type(method) is SpearmanOrdinationCorrelation:
                                    cmds.append(
                                        'ordination_correlation.py -n %d '
                                        '-i %s -m %s -c %s -o %s -t spearman' %
                                        (num_sim_data_perms, pc_fp, map_fp,
                                         category[0], method_dir))
                                else:
                                    cmds.append(
                                        'compare_categories.py --method %s '
                                        '-i %s -m %s -c %s -o %s -n %d' %
                                        (method.DirectoryName, dm_fp, map_fp,
                                         category[0], method_dir,
                                         num_sim_data_perms))
    return cmds
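
Example #3 sanity-checks its inputs with `get_num_samples_in_distance_matrix` and `get_num_samples_in_map`, which are not shown in these examples. A rough sketch of how they could be implemented, assuming tab-delimited QIIME-style files, follows; the real helpers may parse these files differently.

# Rough sketches only; the file formats are assumed, not taken from the
# project's actual helper implementations.
def get_num_samples_in_distance_matrix(dm_fp):
    # Assumes the first line is a tab-separated header whose fields (after
    # the leading empty cell) are the sample IDs.
    with open(dm_fp) as dm_f:
        header = dm_f.readline().rstrip('\n').split('\t')
    return len(header) - 1


def get_num_samples_in_map(map_fp):
    # Assumes a mapping file with a '#SampleID ...' header line followed by
    # one row per sample; comment lines are ignored.
    with open(map_fp) as map_f:
        rows = [line for line in map_f
                if line.strip() and not line.startswith('#')]
    return len(rows)
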
Example #4: test_has_results
    def test_has_results(self):
        """Test checking a directory for results."""
        # Dir that doesn't exist.
        obs = has_results('/foobarbazbazbarfoo1234567890')
        self.assertFalse(obs)

        # Dir that exists but is empty.
        obs = has_results(self.input_dir)
        self.assertFalse(obs)

        # Dir that exists, with no required files, but is empty.
        obs = has_results(self.input_dir, required_files=[])
        self.assertFalse(obs)

        # Dir that exists and is not empty.
        tmp_fp = join(self.input_dir, 'foo.txt')
        tmp_f = open(tmp_fp, 'w')
        tmp_f.write('foo')
        tmp_f.close()
        self.files_to_remove.append(tmp_fp)

        obs = has_results(self.input_dir)
        self.assertTrue(obs)

        # Dir that exists and is not empty, with no required files.
        obs = has_results(self.input_dir, required_files=[])
        self.assertTrue(obs)

        # Dir that exists, is not empty, and has the required file.
        obs = has_results(self.input_dir, required_files=['foo.txt'])
        self.assertTrue(obs)

        # Dir that exists and is not empty, but doesn't have required files.
        obs = has_results(self.input_dir,
                          required_files=['foo.txt', 'bar.txt', 'baz.txt'])
        self.assertFalse(obs)
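
The behavior pinned down by this test suggests a fairly small implementation of `has_results`. The sketch below is consistent with the assertions above, but it is not necessarily the project's actual code.

from os import listdir
from os.path import exists, isdir, join


def has_results(results_dir, required_files=None):
    """Return True if results_dir exists, is non-empty, and contains every
    file named in required_files (when any are given)."""
    if not isdir(results_dir):
        return False
    if not listdir(results_dir):
        # An existing but empty directory never counts as having results.
        return False
    if required_files:
        return all(exists(join(results_dir, f)) for f in required_files)
    return True
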
Example #5: _build_simulated_data_commands
def _build_simulated_data_commands(analysis_type, out_dir, even_otu_table_fp,
                                   map_fp, tree_fp, workflow):
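    """Build commands that create the simulated data sets.

    For each category/trial/sample size/dissimilarity combination, simulated
    samples are generated with simsam.py (subsampling the even OTU table to
    the requested size either before or after simulation, depending on
    whether enough real samples exist), and per-metric distance matrices,
    mapping files, PCoA coordinates, and, for gradient analyses, per-category
    gradient distance matrices are generated. Combinations whose directories
    already contain the required results are skipped.
    """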
    cmds = []

    data_type_dir = join(out_dir, 'simulated')
    create_dir(data_type_dir)

    num_samps = get_num_samples_in_table(even_otu_table_fp)

    for category in workflow['categories']:
        category_dir = join(data_type_dir, category[0])
        create_dir(category_dir)

        for trial_num in range(workflow['num_sim_data_trials']):
            trial_num_dir = join(category_dir, '%d' % trial_num)
            create_dir(trial_num_dir)

            for samp_size in workflow['sample_sizes']:
                samp_size_dir = join(trial_num_dir, '%d' % samp_size)
                create_dir(samp_size_dir)

                # Lots of duplicate code between these two blocks...
                # need to refactor and test.
                if samp_size <= num_samps:
                    simsam_rep_num = 1

                    subset_otu_table_fp = join(samp_size_dir,
                                               basename(even_otu_table_fp))
                    subset_map_fp = join(samp_size_dir, basename(map_fp))

                    if not has_results(
                            samp_size_dir,
                            required_files=[basename(subset_otu_table_fp),
                                            basename(subset_map_fp)]):
                        run_command(
                            'choose_data_subset.py -t %s -i %s -m %s -c %s '
                            '-n %d -o %s' %
                            (analysis_type, even_otu_table_fp, map_fp,
                             category[0], samp_size, samp_size_dir))
                    assert get_num_samples_in_table(subset_otu_table_fp) == samp_size
                    assert get_num_samples_in_map(subset_map_fp) == samp_size

                    for d in workflow['dissim']:
                        dissim_dir = join(samp_size_dir, repr(d))
                        create_dir(dissim_dir)

                        simsam_map_fp = join(dissim_dir, add_filename_suffix(
                            subset_map_fp, '_n%d_d%r' % (simsam_rep_num, d)))
                        simsam_otu_table_fp = join(
                            dissim_dir, add_filename_suffix(
                                subset_otu_table_fp,
                                '_n%d_d%r' % (simsam_rep_num, d)))

                        # Check for simulated table/map and various
                        # distance matrices / coordinates files.
                        required_simsam_files = [basename(simsam_map_fp),
                                                 basename(simsam_otu_table_fp)]
                        has_simsam_files = has_results(
                            dissim_dir, required_files=required_simsam_files)

                        has_metric_files = True
                        for metric in workflow['metrics']:
                            required_metric_files = ['dm.txt', 'map.txt', 'pc.txt']
                            if analysis_type == 'gradient':
                                required_metric_files.append('%s_dm.txt' % category[0])

                            metric_dir = join(dissim_dir, metric[0])
                            has_metric_files = has_results(metric_dir, required_metric_files)
                            if not has_metric_files:
                                break

                        if not (has_simsam_files and has_metric_files):
                            cmd = ['simsam.py -i %s -t %s -o %s -d %r -n %d '
                                   '-m %s' % (subset_otu_table_fp, tree_fp,
                                              dissim_dir, d, simsam_rep_num,
                                              subset_map_fp)]

                            for metric in workflow['metrics']:
                                metric_dir = join(dissim_dir, metric[0])
                                create_dir(metric_dir)

                                if analysis_type == 'gradient':
                                    cmd.append(
                                        'distance_matrix_from_mapping.py '
                                        '-i %s -c %s -o %s' % (
                                            simsam_map_fp, category[0],
                                            join(metric_dir, '%s_dm.txt' %
                                                 category[0])))

                                cmd.append('beta_diversity.py -i %s -o %s '
                                           '-m %s -t %s' %
                                           (simsam_otu_table_fp, metric_dir,
                                            metric[0], tree_fp))
                                cmd.append('mv %s %s' % (
                                    join(metric_dir, '%s_%s.txt' % (
                                        metric[0], splitext(basename(
                                            simsam_otu_table_fp))[0])),
                                    join(metric_dir, 'dm.txt')))
                                cmd.append('cp %s %s' % (
                                    simsam_map_fp, join(metric_dir,
                                                        'map.txt')))
                                cmd.append(
                                    'principal_coordinates.py -i %s -o %s' %
                                    (join(metric_dir, 'dm.txt'),
                                     join(metric_dir, 'pc.txt')))
                            cmds.append(' && '.join(cmd))
                else:
                    # We need to simulate more samples than we originally have.
                    simsam_rep_num = get_simsam_rep_num(samp_size, num_samps)

                    for d in workflow['dissim']:
                        dissim_dir = join(samp_size_dir, repr(d))
                        create_dir(dissim_dir)

                        simsam_map_fp = join(dissim_dir, add_filename_suffix(
                            map_fp, '_n%d_d%r' % (simsam_rep_num, d)))
                        simsam_otu_table_fp = join(
                            dissim_dir, add_filename_suffix(
                                even_otu_table_fp,
                                '_n%d_d%r' % (simsam_rep_num, d)))

                        required_simsam_files = [basename(simsam_map_fp),
                                                 basename(simsam_otu_table_fp)]
                        has_simsam_files = has_results(
                            dissim_dir, required_files=required_simsam_files)

                        required_subset_files = [basename(simsam_map_fp),
                                                 basename(simsam_otu_table_fp)]
                        has_subset_files = has_results(
                            join(dissim_dir, 'subset'),
                            required_files=required_subset_files)

                        has_metric_files = True
                        for metric in workflow['metrics']:
                            required_metric_files = ['dm.txt', 'map.txt', 'pc.txt']
                            if analysis_type == 'gradient':
                                required_metric_files.append('%s_dm.txt' % category[0])

                            metric_dir = join(dissim_dir, metric[0])
                            has_metric_files = has_results(metric_dir, required_metric_files)
                            if not has_metric_files:
                                break

                        if not (has_simsam_files and has_subset_files and
                                has_metric_files):
                            cmd = ['simsam.py -i %s -t %s -o %s -d %r -n %d '
                                   '-m %s' % (even_otu_table_fp, tree_fp,
                                              dissim_dir, d, simsam_rep_num,
                                              map_fp)]

                            subset_dir = join(dissim_dir, 'subset')
                            cmd.append(
                                'choose_data_subset.py -t %s -i %s -m %s '
                                '-c %s -n %d -o %s' %
                                (analysis_type, simsam_otu_table_fp,
                                 simsam_map_fp, category[0], samp_size,
                                 subset_dir))
                            subset_otu_table_fp = join(
                                subset_dir, basename(simsam_otu_table_fp))
                            subset_map_fp = join(subset_dir,
                                                 basename(simsam_map_fp))

                            for metric in workflow['metrics']:
                                metric_dir = join(dissim_dir, metric[0])
                                create_dir(metric_dir)

                                if analysis_type == 'gradient':
                                    cmd.append(
                                        'distance_matrix_from_mapping.py '
                                        '-i %s -c %s -o %s' % (
                                            subset_map_fp, category[0],
                                            join(metric_dir, '%s_dm.txt' %
                                                 category[0])))

                                cmd.append('beta_diversity.py -i %s -o %s '
                                           '-m %s -t %s' %
                                           (subset_otu_table_fp, metric_dir,
                                            metric[0], tree_fp))
                                cmd.append('mv %s %s' % (
                                    join(metric_dir, '%s_%s.txt' % (
                                        metric[0], splitext(basename(
                                            subset_otu_table_fp))[0])),
                                    join(metric_dir, 'dm.txt')))
                                cmd.append('cp %s %s' % (
                                    subset_map_fp, join(metric_dir,
                                                        'map.txt')))
                                cmd.append(
                                    'principal_coordinates.py -i %s -o %s' %
                                    (join(metric_dir, 'dm.txt'),
                                     join(metric_dir, 'pc.txt')))
                            cmds.append(' && '.join(cmd))
    return cmds
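
Each builder returns plain shell command strings (some joined with ' && ' into small pipelines), so a thin driver can execute them in order. The sketch below uses subprocess with shell=True; the run_commands name and the commented-out wiring are illustrative assumptions, not the project's actual driver.

import subprocess


def run_commands(cmds):
    # Execute each generated command line; the ' && '-joined pipelines rely
    # on the shell, hence shell=True. Stop at the first failure.
    for cmd in cmds:
        return_code = subprocess.call(cmd, shell=True)
        if return_code != 0:
            raise RuntimeError('Command failed (%d): %s' % (return_code, cmd))

# Illustrative wiring of the builders above (all paths are assumptions):
# cmds = _build_real_data_commands('gradient', out_dir, even_otu_table_fp,
#                                  map_fp, tree_fp, workflow)
# cmds += _build_real_data_methods_commands(out_dir, workflow)
# run_commands(cmds)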