Example #1
    def test_read_meta_pd(self):
        expected = pd.DataFrame({'sample_id': ['s1'], 'A': [1], 'B': [2]})
        observed = read_meta_pd(self.tsv_a, 'sample_id')
        assert_frame_equal(observed, expected)

        expected = pd.DataFrame({'any_id': ['s1'], 'A': [1], 'B': [2]})
        observed = read_meta_pd(self.tsv_a, 'any_id')
        assert_frame_equal(observed, expected)
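
Note: read_meta_pd itself does not appear on this page. Below is a minimal sketch of what the test above and the call sites further down imply, assuming the helper mirrors the read-header-then-force-str-dtype pattern used in the export_meta_alpha example; treat it as an illustration, not the package's actual implementation.

import pandas as pd


def read_meta_pd(meta_fp: str, index_name: str = 'sample_name') -> pd.DataFrame:
    # Peek at the header to learn the name of the first (identifier) column.
    with open(meta_fp) as f:
        first_col = f.readline().rstrip('\n').split('\t')[0]
    # Keep the identifier column as text; let pandas infer the other dtypes.
    meta_pd = pd.read_csv(meta_fp, header=0, sep='\t', dtype={first_col: str})
    # Standardize the identifier column name (e.g. 'sample_name' or 'sample_id').
    meta_pd.rename(columns={first_col: index_name}, inplace=True)
    return meta_pd
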
Example #2
 def make_train_test(self):
     if self.mmvecs.shape[0]:
         for _, mmvec in self.mmvecs.groupby(['pair', 'filter', 'subset']):
             d = mmvec.iloc[0, :].to_dict()
             fps = [
                 'dataset1', 'dataset2', 'meta_fp', 'new_tsv1', 'new_tsv2',
                 'new_qza1', 'new_qza2'
             ]
             dat1, dat2, meta_fp, tsv1, tsv2, qza1, qza2 = [
                 d[x] for x in fps
             ]
             meta_subset = read_meta_pd(meta_fp)
             train_tests = self.make_train_test_column(
                 meta_fp, self.config.train_test_dict, meta_subset, dat1,
                 dat2)
             # print()
             # print()
             # print("['mmvec']", meta_fp)
             # print("['mmvec']", [x for x in meta_subset.columns
             #                     if 'train' in x])
             rewrite = False
             meta_subset_cols = set(meta_subset.columns)
             for train_col, train_samples in train_tests.items():
                 # print("['mmvec']", train_col, len(train_samples))
                 if train_col not in meta_subset_cols:
                     rewrite = True
                     meta_subset[train_col] = [
                         'Train' if x in set(train_samples) else 'Test'
                         for x in meta_subset.sample_name.tolist()
                     ]
             if self.config.force or rewrite:
                 meta_subset.to_csv(meta_fp, index=False, sep='\t')
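
The Train/Test labelling loop above reduces to the following standalone pattern (sample names and the training subset are made up for illustration):

import pandas as pd

# Made-up metadata subset and a hypothetical set of training samples.
meta_subset = pd.DataFrame({'sample_name': ['s1', 's2', 's3', 's4']})
train_samples = {'s1', 's3'}

meta_subset['traintest'] = [
    'Train' if sample in train_samples else 'Test'
    for sample in meta_subset.sample_name
]
print(meta_subset['traintest'].tolist())  # ['Train', 'Test', 'Train', 'Test']
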
Example #3
def fix_collapsed_data(remove_empty: set, coll_pd: pd.DataFrame, coll_tsv: str,
                       coll_qza: str, coll_meta: str):
    """
    Parameters
    ----------
    remove_empty : set
    coll_pd : pd.DataFrame
    coll_tsv : str
    coll_qza : str
    coll_meta : str

    Returns
    -------
    cmd : str
    """
    cmd = ''
    if len(remove_empty & set(coll_pd.index)):
        coll_pd = coll_pd.drop(index=list(remove_empty & set(coll_pd.index)))
        coll_pd = coll_pd.loc[:, coll_pd.sum() > 0]
        coll_pd.to_csv(coll_tsv, index=True, sep='\t')

        coll_meta_pd = read_meta_pd(coll_meta)
        if coll_meta_pd.index.size != coll_pd.columns.size:
            coll_meta_pd = coll_meta_pd.loc[coll_meta_pd.sample_name.isin(
                coll_pd.columns.tolist())]
            coll_meta_pd.to_csv(coll_meta, index=False, sep='\t')

    if not isfile(coll_qza):
        cmd = run_import(coll_tsv, coll_qza, 'FeatureTable[Frequency]')
    return cmd
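
A hypothetical call of fix_collapsed_data, assuming a collapsed taxonomy table with features as rows and samples as columns (all paths and the set of empty features are made up):

import pandas as pd

coll_pd = pd.read_csv('collapsed_tab.tsv', header=0, sep='\t', index_col=0)
cmd = fix_collapsed_data(
    remove_empty={'Unassigned'},        # hypothetical feature(s) flagged empty
    coll_pd=coll_pd,
    coll_tsv='collapsed_tab.tsv',
    coll_qza='collapsed_tab.qza',
    coll_meta='collapsed_metadata.tsv',
)
if cmd:
    print(cmd)  # qiime import command returned when the .qza does not exist yet
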
Example #4
 def get_common_paths(self):
     cmds = {}
     paths = []
     pfs = ['pair', 'filter', 'subset']
     for (pair, filter, subset), mmvec in self.mmvecs.groupby(pfs):
         data_dir = get_analysis_folder(
             self.config.i_datasets_folder,
             'mmvec/common/data/%s/%s' % (pair, subset))
         meta_dir = get_analysis_folder(
             self.config.i_datasets_folder,
             'mmvec/common/metadata/%s/%s' % (pair, subset))
         mmvec_d = mmvec.iloc[0, :].to_dict()
         dat1, dat2 = mmvec_d['dataset1'], mmvec_d['dataset2']
         prev1, prev2 = mmvec_d['prevalence1'], mmvec_d['prevalence2']
         abun1, abun2 = mmvec_d['abundance1'], mmvec_d['abundance2']
         qza1, meta1 = self.get_dataset_path(dat1, filter, subset)
         qza2, meta2 = self.get_dataset_path(dat2, filter, subset)
         if not isfile(meta1) or not isfile(meta2):
             continue
         meta1_pd, meta2_pd = read_meta_pd(meta1), read_meta_pd(meta2)
         sams = set(meta1_pd.sample_name) & set(meta2_pd.sample_name)
         if len(sams) < 10:
             print('Not enough samples in pair %s: %s (%s) vs %s (%s)' %
                   (pair, mmvec_d['dataset1'], meta1_pd.shape[0],
                    mmvec_d['dataset2'], meta2_pd.shape[0]))
             continue
         meta_fp, new_tsv1, new_qza1, new_tsv2, new_qza2 = self.get_new_fps(
             meta_dir, data_dir, qza1, qza2, dat1, prev1, abun1, dat2,
             prev2, abun2, pair, len(sams), cmds)
         meta_subset = get_meta_subset(meta1_pd, meta2_pd, sams)
         meta_subset.to_csv(meta_fp, index=False, sep='\t')
         paths.append([
             pair, filter, subset, sams, meta_fp, new_tsv1, new_tsv2,
             new_qza1, new_qza2
         ])
         print('\t\t\t* [TODO]', pair, filter, subset, ':', dat1, 'vs',
               dat2, '(%s samples)' % meta_subset.shape[0])
     if paths:
         common_paths_pd = pd.DataFrame(
             paths,
             columns=(pfs + [
                 'common_sams', 'meta_fp', 'new_tsv1', 'new_tsv2',
                 'new_qza1', 'new_qza2'
             ]))
         self.mmvecs = self.mmvecs.merge(common_paths_pd,
                                         on=['pair', 'filter', 'subset'])
     self.register_command('mmvec_paired_imports', cmds)
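
The final merge above attaches the per-pair common paths back onto the mmvec table; the standalone pandas pattern, with made-up values, is:

import pandas as pd

mmvecs = pd.DataFrame({'pair': ['p1'], 'filter': ['f1'], 'subset': ['ALL'],
                       'dataset1': ['datA'], 'dataset2': ['datB']})
common_paths_pd = pd.DataFrame({'pair': ['p1'], 'filter': ['f1'],
                                'subset': ['ALL'], 'meta_fp': ['meta.tsv']})
mmvecs = mmvecs.merge(common_paths_pd, on=['pair', 'filter', 'subset'])
print(mmvecs.columns.tolist())
# ['pair', 'filter', 'subset', 'dataset1', 'dataset2', 'meta_fp']
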
Example #5
def run_distance_decay(i_datasets_folder: str, betas: dict,
                       p_distance_decay: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                       noloc: bool, slurm: bool, split: bool, run_params: dict,
                       filt_raref: str, jobs: bool,
                       chunkit: int) -> (dict, list):

    job_folder2 = get_job_folder(i_datasets_folder, 'decay/chunks')
    decay_config = read_yaml_file(p_distance_decay)
    subsets, modes, params = get_decay_config(decay_config)

    all_sh_pbs = {}
    decay_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_decay_%s_%s%s.sh' % (job_folder2, prjct_nm, dat,
                                                  filt_raref)
        decay_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            decay_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'decay/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_decay_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for metric, groups_metas_qzas_dms_trees in metrics_groups_metas_qzas_dms_trees.items(
            ):
                for group, metas_qzas_mat_qzas_trees in groups_metas_qzas_dms_trees.items(
                ):
                    for (meta, qza, mat_qza,
                         tree) in metas_qzas_mat_qzas_trees:
                        meta_pd = read_meta_pd(meta).set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(subsets), 'decay')
                        for case_var, case_vals_list in cases_dict.items():
                            for case_vals in case_vals_list:
                                case = get_case(case_vals,
                                                case_var).replace(' ', '_')
                                cur_sh = '%s/run_decay_%s%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_raref, metric, group,
                                    case, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh),
                                                      []).append(cur_sh)
                                new_meta_pd = get_new_meta_pd(
                                    meta_pd, case, case_var, case_vals)
                                res = run_single_decay(
                                    odir, group, new_meta_pd, cur_sh, mat_qza,
                                    case, modes, force, run_params["n_nodes"],
                                    run_params["n_procs"],
                                    int(params['iteration']),
                                    int(params['step']))
                                decay_raref[(metric, group, case)] = res
            decay_res[dat].append(decay_raref)
Example #6
 def make_train_test(self):
     if self.songbirds.shape[0]:
         for _, sb in self.songbirds.groupby(
             ['dataset', 'filter', 'subset']):
             d = sb.iloc[0, :].to_dict()
             fps = ['dataset', 'tsv', 'qza', 'meta']
             dat, tsv, qza, meta_fp = [d[x] for x in fps]
             meta_subset = read_meta_pd(meta_fp)
             train_tests = self.make_train_test_column(
                 meta_fp, self.config.train_test_dict, meta_subset, dat)
             rewrite = False
             meta_subset_cols = set(meta_subset.columns)
             for train_col, train_samples in train_tests.items():
                 if train_col not in meta_subset_cols:
                     rewrite = True
                     meta_subset[train_col] = [
                         'Train' if x in set(train_samples) else 'Test'
                         for x in meta_subset.sample_name.tolist()
                     ]
             if self.config.force or rewrite:
                 meta_subset.to_csv(meta_fp, index=False, sep='\t')
Example #7
def run_nestedness(i_datasets_folder: str, betas: dict,
                   datasets_collapsed_map: dict, p_nestedness_groups: str,
                   datasets_rarefs: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str, jobs: bool,
                   chunkit: int) -> (dict, list, dict):

    job_folder2 = get_job_folder(i_datasets_folder, 'nestedness/chunks')

    nestedness_config = read_yaml_file(p_nestedness_groups)
    if 'soft' not in nestedness_config:
        print(
            'Must provide the path to the Nestedness soft (containing bin/Autocorrelation.jar)'
        )
        return {}
    if nestedness_config['soft'].endswith('Autocorrelation.jar') and isfile(
            nestedness_config['soft']):
        binary = nestedness_config['soft']
    else:
        binary = '%s/bin/Autocorrelation.jar' % nestedness_config['soft']
        if not isfile(binary):
            print(
                'Must provide the path to the Nestedness soft (containing bin/Autocorrelation.jar)'
            )
            return {}

    subsets, nodfs, colors, nulls, modes, params = get_nestedness_config(
        nestedness_config)

    nodfs_fps = {}
    all_sh_pbs = {}
    nestedness_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_nestedness_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                       dat, filt_raref)

        stats_tax_dat, level = get_stats_tax_dat(dat, datasets_collapsed_map)

        nestedness_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            nestedness_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'nestedness/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_nestedness_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for _, groups_metas_qzas_dms_trees in metrics_groups_metas_qzas_dms_trees.items(
            ):
                for group, metas_qzas_mat_qzas_trees in groups_metas_qzas_dms_trees.items(
                ):

                    meta, qza, mat_qza, tree = metas_qzas_mat_qzas_trees[0]
                    meta_pd = read_meta_pd(meta).set_index('sample_name')
                    cases_dict = check_metadata_cases_dict(
                        meta, meta_pd, dict(subsets), 'nestedness')
                    for case_var, case_vals_list in cases_dict.items():
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            cur_sh = '%s/run_nestedness_%s%s_%s_%s%s.sh' % (
                                job_folder2, dat, cur_raref, group, case,
                                filt_raref)
                            cur_sh = cur_sh.replace(' ', '-')
                            # print("case", case)
                            all_sh_pbs.setdefault((dat, out_sh),
                                                  []).append(cur_sh)
                            res, group_case_nodfs = run_single_nestedness(
                                odir, cur_raref, level, group, meta_pd, nodfs,
                                nulls, modes, cur_sh, qza, case, case_var,
                                case_vals, binary, params, force)
                            nodfs_fps.setdefault(stats_tax_dat,
                                                 []).extend(group_case_nodfs)
                            nestedness_raref[(group, case)] = res
                break
            nestedness_res[dat].append(nestedness_raref)
Example #8
def run_mantel(i_datasets_folder: str, datasets_filt: dict, p_mantel: str,
               betas: dict, force: bool, prjct_nm: str, qiime_env: str,
               chmod: str, noloc: bool, slurm: bool, split: bool,
               run_params: dict, filt_raref: str, filt_only: bool,
               eval_depths: dict, jobs: bool, chunkit: int) -> None:
    """
    """
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        mantel_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths, key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                mantel_pairs['%s_%s' % (n0, n1)] = [x, y]
        mantel_subsets = {'ALL': [[]]}
    else:
        mantel_pairs, mantel_subsets = get_procrustes_mantel_dicts(p_mantel)

    get_job_folder(i_datasets_folder, 'mantel%s' % evaluation)

    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in mantel_pairs.items():

        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt, filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt, filt_only)

        if check_dat_exists(betas, dat1, missing_dats) or check_dat_exists(
                betas, dat2, missing_dats):
            continue

        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]

        job_folder2 = get_job_folder(
            i_datasets_folder,
            'mantel%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_mantel_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)

        for metric, groups_metas_qzas_dms_trees1 in metrics_groups_metas_qzas_dms_trees1.items(
        ):
            if split:
                out_sh = '%s/run_mantel_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = metrics_groups_metas_qzas_dms_trees2[
                metric]
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(*[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_

                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]

                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(dat1, dm1, meta1, raref1,
                                                 metric, i_datasets_folder,
                                                 skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(dat2, dm2, meta2, raref2,
                                                 metric, i_datasets_folder,
                                                 skip)
                if skip:
                    print(
                        '[Mantels] One desired rarefaction depth not run (pair %s)'
                        % pair)
                    continue

                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(
                    set(meta_pd1.sample_name) & set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue

                meta_pd = meta_pd1.loc[meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(meta1, meta_pd,
                                                       dict(mantel_subsets),
                                                       'mantel')
                odir = get_analysis_folder(
                    i_datasets_folder, 'mantel%s/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder, 'mantel%s/chunks/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))

                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(case_vals, case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_mantel%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((pair, out_sh),
                                              []).append(cur_sh)

                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_, cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_, cur)
                        mantel_out = '%s/mantel%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel('mantel', odir, dm1, dm2,
                                                     meta_pd, dm_out1, dm_out2,
                                                     mantel_out, cur_sh, cur,
                                                     case_var, case_vals,
                                                     force)

    job_folder = get_job_folder(i_datasets_folder, 'mantel%s' % evaluation)
    main_sh = write_main_sh(
        job_folder, '4_run_mantel_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs, '%s.mntl%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit)
    if main_sh:
        if p_mantel and p_mantel != 1:
            if p_mantel.startswith('/panfs'):
                p_mantel = p_mantel.replace(os.getcwd(), '')
            print('# Mantels (pairs and samples subsets config in %s)' %
                  p_mantel)
        else:
            print('# Mantels')
        print_message('', 'sh', main_sh, jobs)
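
In evaluation mode above, adjacent rarefaction depths are paired before running the Mantel tests; a standalone sketch of that pairing, with a made-up dataset name and depths:

eval_depths = {'dat': ['dat_raref_1000', 'dat_raref_500', 'dat_raref_2000']}
mantel_pairs = {}
for dat, depths in eval_depths.items():
    # Sort by the numeric depth suffix, then pair each depth with the next one.
    sorted_depths = sorted(depths, key=lambda x: int(x.split('_')[-1]))
    for idx, x in enumerate(sorted_depths[:-1]):
        y = sorted_depths[idx + 1]
        mantel_pairs['%s_%s' % (x.split('_')[-1], y.split('_')[-1])] = [x, y]
print(mantel_pairs)
# {'500_1000': ['dat_raref_500', 'dat_raref_1000'],
#  '1000_2000': ['dat_raref_1000', 'dat_raref_2000']}
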
Example #9
def export_meta_alpha(datasets: dict, filt_raref: str, datasets_rarefs: dict,
                      to_export: dict, dropout: bool) -> None:
    """
    Export the alpha diversity vectors and merge them into each dataset's
    metadata.

    :param datasets: per-dataset lists of (table .tsv, metadata .tsv) paths.
    :param filt_raref: filtering/rarefaction suffix added to the new column names.
    :param datasets_rarefs: per-dataset rarefaction depth labels.
    :param to_export: alpha diversity files to export per dataset.
    :param dropout: whether subsets kept sample dropouts (columns are tagged
        'noDropout' otherwise).
    """
    first_print = True

    for dat, meta_alphas_fps_ in to_export.items():

        all_meta_alphas_pds = []
        for idx, meta_alphas_fps in enumerate(meta_alphas_fps_):
            tsv, meta = datasets[dat][idx]
            cur_raref = datasets_rarefs[dat][idx]
            meta_alphas_fps_exist = [x for x in meta_alphas_fps if isfile(x)]
            if len(meta_alphas_fps_exist) != len(meta_alphas_fps):
                if first_print:
                    print(
                        '\nWarning: First make sure you run alpha -> alpha merge/export (2_run_merge_alphas.sh) '
                        ' before running volatility\n\t(if you need the alpha as a response variable)!'
                    )
                    first_print = False
                continue

            meta_alphas_pds = []
            for meta_alpha_fp in meta_alphas_fps_exist:
                with open(meta_alpha_fp) as f:
                    for line in f:
                        break
                meta_alpha_pd = pd.read_csv(meta_alpha_fp,
                                            header=0,
                                            sep='\t',
                                            dtype={line.split('\t')[0]: str})
                meta_alpha_pd.rename(
                    columns={line.split('\t')[0]: 'sample_name'}, inplace=True)
                meta_alpha_pd.set_index('sample_name', inplace=True)

                if filt_raref:
                    fr = filt_raref.replace('_rrf', '')
                    replace_cols = dict((x, '%s%s%s' % (x, fr, cur_raref))
                                        for x in meta_alpha_pd.columns)
                    meta_alpha_pd.rename(columns=replace_cols, inplace=True)

                group = meta_alpha_fp.split('_alphas__')[-1].split('.tsv')[0]
                if group != '':
                    if dropout:
                        replace_cols = dict((x, '__'.join([x, group]))
                                            for x in meta_alpha_pd.columns)
                    else:
                        replace_cols = dict(
                            (x, '__'.join([x, group, 'noDropout']))
                            for x in meta_alpha_pd.columns)
                    meta_alpha_pd.rename(columns=replace_cols, inplace=True)
                meta_alphas_pds.append(meta_alpha_pd)
            meta_alphas_pd = pd.concat(meta_alphas_pds, axis=1, sort=False)
            if meta_alphas_pd.index.tolist()[0] == '#q2:types':
                meta_alphas_pd = meta_alphas_pd.iloc[1:, :]
            meta_alphas_pd = meta_alphas_pd.reset_index()
            meta_alphas_pd.rename(
                columns={meta_alphas_pd.columns[0]: 'sample_name'},
                inplace=True)
            all_meta_alphas_pds.append(meta_alphas_pd.set_index('sample_name'))

            meta_alpha_fpo = '%s_alphas.tsv' % splitext(meta)[0]
            if isfile(meta_alpha_fpo):
                meta_pd = read_meta_pd(meta_alpha_fpo)
            else:
                meta_pd = read_meta_pd(meta)
            col_to_remove = meta_alphas_pd.columns.tolist()[1:]
            shared_cols = list(
                set(col_to_remove) & set(meta_pd.columns.tolist()))
            if len(shared_cols):
                meta_pd.drop(columns=shared_cols, inplace=True)
            meta_alphas_pd = meta_pd.merge(meta_alphas_pd,
                                           on='sample_name',
                                           how='left')
            meta_alphas_pd.to_csv(meta_alpha_fpo, index=False, sep='\t')
            if os.getcwd().startswith('/panfs'):
                meta_alpha_fpo = meta_alpha_fpo.replace(os.getcwd(), '')
            print(' -> Written:', meta_alpha_fpo)

        if all_meta_alphas_pds:
            all_meta_alphas_pd = pd.concat(all_meta_alphas_pds,
                                           axis=1,
                                           sort=False)
            main_meta = datasets[dat][0][1]
            meta_alpha_fpo = '%s_alphas_full.tsv' % splitext(main_meta)[0]
            if isfile(meta_alpha_fpo):
                meta_pd = read_meta_pd(meta_alpha_fpo)
            else:
                meta_pd = read_meta_pd(main_meta)
            col_to_remove = all_meta_alphas_pd.columns.tolist()
            shared_cols = list(
                set(col_to_remove) & set(meta_pd.columns.tolist()))
            if len(shared_cols):
                meta_pd.drop(columns=shared_cols, inplace=True)
            all_meta_alphas_pd = all_meta_alphas_pd.reset_index()
            all_meta_alphas_pd.rename(
                columns={all_meta_alphas_pd.columns[0]: 'sample_name'},
                inplace=True)
            all_meta_alphas_pd = meta_pd.merge(all_meta_alphas_pd,
                                               on='sample_name',
                                               how='left')
            all_meta_alphas_pd.to_csv(meta_alpha_fpo, index=False, sep='\t')
            if os.getcwd().startswith('/panfs'):
                meta_alpha_fpo = meta_alpha_fpo.replace(os.getcwd(), '')
            print(' -> Written:', meta_alpha_fpo)
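
The column renaming in export_meta_alpha tags every alpha metric with the filtering/rarefaction suffix and, when a subset group applies, with that group; standalone, with made-up suffixes and group:

import pandas as pd

meta_alpha_pd = pd.DataFrame({'shannon': [2.1], 'observed_otus': [50]})
fr, cur_raref = '_flt', '_raref1000'        # hypothetical suffixes
meta_alpha_pd.rename(
    columns=dict((x, '%s%s%s' % (x, fr, cur_raref))
                 for x in meta_alpha_pd.columns), inplace=True)
group = 'sex'                               # hypothetical subset group
meta_alpha_pd.rename(
    columns=dict((x, '__'.join([x, group]))
                 for x in meta_alpha_pd.columns), inplace=True)
print(meta_alpha_pd.columns.tolist())
# ['shannon_flt_raref1000__sex', 'observed_otus_flt_raref1000__sex']
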
Example #10
                    meta = meta_
                    if not first_print:
                        print('\nWarning: Make sure you first run alpha -> '
                              'alpha merge -> alpha export\n'
                              '\t(if you have alpha diversity as factors '
                              'in the models)!')
                        first_print += 1
            if pair:
                dat_pair = '%s_%s' % (dat, pair)
                dat_pair_path = '%s/%s' % (dat, pair)
            else:
                dat_pair = dat
                dat_pair_path = dat

            qza = '%s.qza' % splitext(tsv)[0]
            meta_pd = read_meta_pd(meta)
            meta_pd = rename_duplicate_columns(meta_pd)
            meta_pd = meta_pd.set_index('sample_name')
            meta_pd.columns = [x.lower() for x in meta_pd.columns]

            if dat in songbird_models:
                models = check_metadata_models(meta, meta_pd,
                                               songbird_models[dat])
            else:
                continue
            # print("models")
            # print(models)
            #####################################################################
            # snakemake here: config to organise the inputs/depedencies (joblib)
            #####################################################################
            for idx, it in enumerate(
Example #11
def run_adonis(p_formulas: str, i_datasets_folder: str, betas: dict,
               datasets_rarefs: dict, p_perm_groups: str, force: bool,
               prjct_nm: str, qiime_env: str, chmod: str, noloc: bool,
               slurm: bool, split: bool, run_params: dict, filt_raref: str,
               jobs: bool, chunkit: int) -> None:
    """
    Run ADONIS tests (PERMANOVA with a formula, as in R vegan's adonis) on the
    beta diversity distance matrices.
    Main per-dataset looper for the ADONIS tests on beta diversity matrices.

    :param p_formulas: formulas to test.
    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param betas: beta diversity matrices.
    :param datasets_rarefs: per-dataset rarefaction depth labels.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """

    job_folder2 = get_job_folder(i_datasets_folder, 'adonis/chunks')

    main_cases_dict = get_main_cases_dict(p_perm_groups)
    formulas = get_formulas_dict(p_formulas)

    metric_check = set()
    all_sh_pbs = {}
    first_print = 0

    for dat, metric_groups_metas_qzas_dms_trees_ in betas.items():
        if dat not in formulas:
            continue
        if not split:
            out_sh = '%s/run_adonis_%s_%s%s.sh' % (job_folder2, prjct_nm, dat, filt_raref)
        for idx, metric_groups_metas_qzas_dms_trees in enumerate(metric_groups_metas_qzas_dms_trees_):
            cur_depth = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder, 'adonis/%s%s' % (dat, cur_depth))
            for metric, subset_files in metric_groups_metas_qzas_dms_trees.items():
                if split:
                    out_sh = '%s/run_adonis_%s_%s_%s%s.sh' % (job_folder2, prjct_nm, dat, metric, filt_raref)

                for subset, metas_qzas_mat_qzas_trees in subset_files.items():
                    for meta, qza, mat_qza, tree in metas_qzas_mat_qzas_trees:
                        if not isfile(mat_qza):
                            if not first_print:
                                print('Beta diversity, distances matrices must be generated already to automatise PERMANOVA\n'
                                      '\t(re-run this after steps "2_run_beta.sh" and "2x_run_beta_export.pbs" are done)')
                                first_print += 1
                            continue

                        if (dat, subset) not in metric_check:
                            meta_pd = read_meta_pd(meta).set_index('sample_name')
                            cases_dict = check_metadata_cases_dict(meta, meta_pd, dict(main_cases_dict), 'ADONIS')
                            formulas = check_metadata_formulas(meta, meta_pd, formulas[dat], 'ADONIS')
                            metric_check.add((dat, subset))

                        for fdx, form in enumerate(formulas[dat].keys()):
                            formula = formulas[dat][form]
                            for cdx, case_var in enumerate(cases_dict.keys()):
                                case_vals_list = cases_dict[case_var]
                                cur_sh = '%s/run_adonis_%s%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_depth, metric, fdx, cdx, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                                run_single_adonis(odir, subset, case_vals_list, metric, case_var,
                                                  form, formula, qza, mat_qza, meta_pd, cur_sh, force)

    job_folder = get_job_folder(i_datasets_folder, 'adonis')
    main_sh = write_main_sh(job_folder, '3_run_adonis_%s%s' % (prjct_nm, filt_raref), all_sh_pbs,
                            '%s.dns%s' % (prjct_nm, filt_raref),
                            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
                            run_params["mem_num"], run_params["mem_dim"],
                            qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            print("# Run Adonis (groups config in %s)" % p_perm_groups)
        else:
            print("# Run Adonis")
        print_message('', 'sh', main_sh, jobs)
Example #12
def run_doc(i_datasets_folder: str, datasets: dict, p_doc_config: str,
            datasets_rarefs: dict, force: bool, prjct_nm: str, qiime_env: str,
            chmod: str, noloc: bool, slurm: bool, run_params: dict,
            filt_raref: str, phates: dict, doc_phate: bool, split: bool,
            jobs: bool, chunkit: int) -> None:

    job_folder2 = get_job_folder(i_datasets_folder, 'doc/chunks')
    doc_filtering, doc_params, main_cases_dict = get_doc_config(p_doc_config)

    all_sh_pbs = {}
    all_import_sh_pbs = {}
    dat_cases_tabs = {}
    need_to_run_phate = []
    need_to_run_less_phate = []
    for dat, tsv_meta_pds_ in datasets.items():
        dat_cases_tabs[dat] = {}
        if dat in doc_filtering:
            filters = doc_filtering[dat]
        else:
            filters = {'0-0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            dat_phates = []
            if dat in phates:
                dat_phates = phates[dat][idx]
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'DOC')
            cur_raref = datasets_rarefs[dat][idx]
            dat_cases_tabs[dat][cur_raref] = {}
            if not split:
                out_sh = '%s/run_doc_%s%s%s.sh' % (job_folder2, dat,
                                                   filt_raref, cur_raref)
                out_import_sh = '%s/run_import_doc_%s%s%s.sh' % (
                    job_folder2, dat, filt_raref, cur_raref)
            odir = get_analysis_folder(i_datasets_folder, 'doc/%s' % dat)
            for filt, (fp, fa) in filters.items():
                if split:
                    out_sh = '%s/run_doc_%s%s%s_%s.sh' % (
                        job_folder2, dat, filt_raref, cur_raref, filt)
                    out_import_sh = '%s/run_import_doc_%s%s%s_%s.sh' % (
                        job_folder2, dat, filt_raref, cur_raref, filt)
                for case_var, case_vals_list in cases_dict.items():
                    cur_sh = '%s/run_doc_%s_%s%s%s_%s.sh' % (
                        job_folder2, dat, case_var, filt_raref, cur_raref,
                        filt)
                    cur_sh = cur_sh.replace(' ', '-')
                    cur_import_sh = '%s/run_import_doc_%s_%s%s%s_%s.sh' % (
                        job_folder2, dat, case_var, filt_raref, cur_raref,
                        filt)
                    cur_import_sh = cur_import_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                    all_import_sh_pbs.setdefault((dat, out_import_sh),
                                                 []).append(cur_import_sh)
                    cases = run_single_doc(
                        i_datasets_folder, odir, tsv, meta_pd, case_var,
                        doc_params, case_vals_list, cur_sh, cur_import_sh,
                        force, filt, cur_raref, fp, fa, run_params["n_nodes"],
                        run_params["n_procs"], dat_phates, doc_phate,
                        need_to_run_phate, need_to_run_less_phate)
                    dat_cases_tabs[dat][cur_raref].setdefault(case_var,
                                                              []).extend(cases)

    for need_to_run in need_to_run_phate:
        print(' -', need_to_run)

    job_folder = get_job_folder(i_datasets_folder, 'doc')
    main_sh = write_main_sh(job_folder, '3_run_import_doc%s' % filt_raref,
                            all_import_sh_pbs,
                            '%s.doc.mpt%s' % (prjct_nm, filt_raref), "4", "1",
                            "1", "500", "mb", qiime_env, chmod, noloc, slurm,
                            jobs, chunkit)
    if main_sh:
        if p_doc_config:
            if p_doc_config.startswith('/panfs'):
                p_doc_config = p_doc_config.replace(os.getcwd(), '')
            print('# Import for DOC (groups config in %s)' % p_doc_config)
        else:
            print('# Import DOC')
        print_message('', 'sh', main_sh, jobs)

    main_sh = write_main_sh(job_folder, '3_run_doc%s' % filt_raref, all_sh_pbs,
                            '%s.doc%s' % (prjct_nm, filt_raref),
                            run_params["time"], run_params["n_nodes"],
                            run_params["n_procs"], run_params["mem_num"],
                            run_params["mem_dim"], qiime_env, chmod, noloc,
                            jobs, slurm, chunkit, '~/.')
    if main_sh:
        if p_doc_config:
            if p_doc_config.startswith('/panfs'):
                p_doc_config = p_doc_config.replace(os.getcwd(), '')
            print('# DOC (groups config in %s)' % p_doc_config)
        else:
            print('# DOC')
        print_message('', 'sh', main_sh, jobs)

    do_r = 1
    if do_r:
        job_folder = get_job_folder(i_datasets_folder, 'doc/R')
        job_folder2 = get_job_folder(i_datasets_folder, 'doc/R/chunks')
        main_written = 0
        main_sh = '%s/run_R_doc%s.sh' % (job_folder, filt_raref)
        with open(main_sh, 'w') as main_o:
            for dat, raref_case_var_cases in dat_cases_tabs.items():

                shs = []
                written = 0
                odir = get_analysis_folder(i_datasets_folder, 'doc/%s' % dat)
                log_error = '%s/log.error' % odir
                for raref, case_var_cases in raref_case_var_cases.items():
                    for case_var, cases in case_var_cases.items():
                        for cdx, case in enumerate(cases):
                            plot = '%s_%s_%s_%s' % (dat, raref, case_var, cdx)
                            case_r = '%s/R' % case
                            pdf = '%s/plot.pdf' % case_r
                            do = '%s/DO.tsv' % case_r
                            if not isfile(pdf):
                                cur_r = '%s/run_R_doc_%s_%s_%s_vanilla.R' % (
                                    job_folder2, dat, case_var, cdx)
                                cur_sh = 'echo "*** %s" >> %s\n' % (plot,
                                                                    log_error)
                                cur_sh += 'R -f %s --vanilla 2>> %s\n' % (
                                    cur_r, log_error)
                                cur_sh += 'echo "end" >> %s\n' % log_error
                                shs.append(cur_sh)
                                with open(cur_r, 'w') as o:
                                    o.write("library(DOC)\n")
                                    o.write("library(ggplot2)\n")
                                    if not isfile(do):
                                        o.write(
                                            "otu <- read.table('%s/tab.tsv', header=T, sep='\\t', comment.char='', check.names=F, nrows=2)\n"
                                            % case)
                                        o.write(
                                            "index_name <- colnames(otu)[1]\n")
                                        o.write(
                                            "otu <- read.table('%s/tab.tsv', header=T, sep='\\t', comment.char='', check.names=F, row.names=index_name)\n"
                                            % case)
                                        o.write("if (dim(otu)[1] > 100) {\n")
                                        o.write("    res <- DOC(otu)\n")
                                        o.write(
                                            "    res.null <- DOC.null(otu)\n")
                                        o.write(
                                            "    write.table(x=res$DO, file='%s/DO.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res$LME, file='%s/LME.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    colnames(res$NEG) <- c('Neg_Slope', 'Data')\n"
                                        )
                                        o.write(
                                            "    write.table(x=res$NEG, file='%s/NEG.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res$FNS, file='%s/FNS.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res$BOOT, file='%s/BOOT.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res$CI, file='%s/CI.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$DO, file='%s/null_DO.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$LME, file='%s/null_LME.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    colnames(res.null$NEG) <- c('Neg_Slope', 'Data')\n"
                                        )
                                        o.write(
                                            "    write.table(x=res.null$NEG, file='%s/null_NEG.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$FNS, file='%s/null_FNS.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$BOOT, file='%s/null_BOOT.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write(
                                            "    write.table(x=res.null$CI, file='%s/null_CI.tsv', sep='\\t', quote=F, row.names=F)\n"
                                            % case_r)
                                        o.write("}\n")
                                    o.write(
                                        "res = list(BOOT=read.table('%s/BOOT.tsv', h=T, sep='\\t'), CI=read.table('%s/CI.tsv', h=T, sep='\\t'), DO=read.table('%s/DO.tsv', h=T, sep='\\t'), LME=read.table('%s/LME.tsv', h=T, sep='\\t'), FNS=read.table('%s/FNS.tsv', h=T, sep='\\t'), NEG=read.table('%s/NEG.tsv', h=T, sep='\\t'))\n"
                                        % (case_r, case_r, case_r, case_r,
                                           case_r, case_r))
                                    o.write(
                                        "res.null = list(BOOT=read.table('%s/null_BOOT.tsv', h=T, sep='\\t'), CI=read.table('%s/null_CI.tsv', h=T, sep='\\t'), DO=read.table('%s/null_DO.tsv', h=T, sep='\\t'), LME=read.table('%s/null_LME.tsv', h=T, sep='\\t'), FNS=read.table('%s/null_FNS.tsv', h=T, sep='\\t'), NEG=read.table('%s/null_NEG.tsv', h=T, sep='\\t'))\n"
                                        % (case_r, case_r, case_r, case_r,
                                           case_r, case_r))
                                    o.write(
                                        "colnames(res$NEG) <- c('Neg.Slope', 'Data')\n"
                                    )
                                    o.write(
                                        "colnames(res.null$NEG) <- c('Neg.Slope', 'Data')\n"
                                    )
                                    o.write(
                                        "res$DO <- res$DO[which(res$DO$Overlap <= 1),]\n"
                                    )
                                    o.write(
                                        "res.null$DO <- res.null$DO[which(res.null$DO$Overlap <= 1),]\n"
                                    )
                                    o.write("pdf('%s')\n" % pdf)
                                    o.write(
                                        "merged <- DOC.merge(list(s_%s = res, s_%s=res.null))\n"
                                        % (plot, plot))
                                    o.write("plot(merged)\n")
                                    o.write("dev.off()\n")
                                    main_written += 1
                                    written += 1
                if written:
                    if chunkit and len(shs) >= chunkit:
                        chunks = [
                            list(x)
                            for x in np.array_split(np.array(shs), chunkit)
                        ]
                    elif split and len(shs) >= 3:
                        chunks = [
                            list(x) for x in np.array_split(np.array(shs), 3)
                        ]
                    else:
                        chunks = [shs]
                    for cdx, chunk in enumerate(chunks):
                        out_sh = '%s/run_R_doc_%s%s_%s.sh' % (job_folder2, dat,
                                                              filt_raref, cdx)
                        out_pbs = '%s.pbs' % splitext(out_sh)[0]
                        with open(out_sh, 'w') as o:
                            for c in chunk:
                                o.write('echo "%s"\n\n' % c)
                                o.write('%s\n\n' % c)
                        run_xpbs(
                            out_sh, out_pbs, '%s.doc.R.%s%s_%s' %
                            (prjct_nm, dat, filt_raref, cdx), 'xdoc',
                            run_params["time"], run_params["n_nodes"],
                            run_params["n_procs"], run_params["mem_num"],
                            run_params["mem_dim"], chmod, written, 'single',
                            main_o, noloc, slurm, jobs)
        if main_written:
            print_message('# DOC (R)', 'sh', main_sh, jobs)
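
The per-dataset R scripts above are split into batches with numpy's array_split; a standalone sketch with made-up commands and chunkit=2:

import numpy as np

shs = ['cmd_1', 'cmd_2', 'cmd_3', 'cmd_4', 'cmd_5']
chunkit = 2
chunks = [list(x) for x in np.array_split(np.array(shs), chunkit)]
print([len(c) for c in chunks])  # [3, 2]
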
Example #13
def run_permanova(i_datasets_folder: str, betas: dict,
                  main_testing_groups: tuple, p_perm_tests_min: int,
                  p_beta_type: tuple, datasets_rarefs: dict,
                  p_perm_groups: str, force: bool, prjct_nm: str,
                  qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                  split: bool, run_params: dict, filt_raref: str, jobs: bool,
                  chunkit: int) -> dict:
    """
    Run beta-group-significance: Beta diversity group significance.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/beta-group-significance/
    Main per-dataset looper for the PERMANOVA tests on beta diversity matrices.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets_rarefs: per-dataset rarefaction depth labels.
    :param betas: beta diversity matrices.
    :param main_testing_groups: groups to test.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    permanovas = {}
    job_folder2 = get_job_folder(i_datasets_folder, 'permanova/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)

    npermutations = 999

    metric_check = set()
    all_sh_pbs = {}
    first_print = 0
    for dat, metric_groups_metas_qzas_dms_trees_ in betas.items():
        permanovas[dat] = []
        if not split:
            out_sh = '%s/run_beta_group_significance_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        for idx, metric_groups_metas_qzas_dms_trees in enumerate(
                metric_groups_metas_qzas_dms_trees_):
            cur_depth = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'permanova/%s%s' % (dat, cur_depth))
            for metric, subset_files in metric_groups_metas_qzas_dms_trees.items(
            ):
                permanovas.setdefault(dat, []).append(metric)
                if split:
                    out_sh = '%s/run_beta_group_significance_%s_%s_%s%s.sh' % (
                        job_folder2, prjct_nm, dat, metric, filt_raref)
                for subset, metas_qzas_mat_qzas_trees in subset_files.items():
                    (meta, qza, mat_qza, tree) = metas_qzas_mat_qzas_trees[0]
                    if not isfile(mat_qza):
                        if not first_print:
                            print(
                                'Beta diversity, distances matrices must be generated already to automatise PERMANOVA\n'
                                '\t(re-run this after steps "2_run_beta.sh" and "2x_run_beta_export.pbs" are done)'
                            )
                            first_print += 1
                        continue
                    if (dat, subset) not in metric_check:
                        meta_pd = read_meta_pd(meta)
                        meta_pd = meta_pd.set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(main_cases_dict), 'PERMANOVA')
                        testing_groups = check_metadata_testing_groups(
                            meta, meta_pd, main_testing_groups,
                            p_perm_tests_min, 'PERMANOVA')
                        metric_check.add((dat, subset))

                    for case_var, case_vals_list in cases_dict.items():
                        testing_groups_case_var = list(
                            set(testing_groups + [case_var]))
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            for testing_group in testing_groups_case_var:
                                if testing_group == 'ALL':
                                    continue
                                cur_sh = '%s/run_beta_group_significance_%s%s_%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_depth, metric,
                                    subset, case, testing_group, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh),
                                                      []).append(cur_sh)
                                run_single_perm(odir, subset, meta_pd, cur_sh,
                                                metric, case, testing_group,
                                                p_perm_tests_min, p_beta_type,
                                                qza, mat_qza, case_var,
                                                case_vals, npermutations,
                                                force)
Example #14
def run_deicode(i_datasets_folder: str, datasets: dict, datasets_rarefs: dict,
                p_perm_groups: str, force: bool, prjct_nm: str, qiime_env: str,
                chmod: str, noloc: bool, slurm: bool, run_params: dict,
                filt_raref: str, jobs: bool, chunkit: int) -> None:
    """
    Performs a robust centered log-ratio (rclr) transform and robust PCA, and
    ranks the features by the loadings of the resulting SVD.
    https://library.qiime2.org/plugins/deicode/19/
    Main per-dataset looper for the DEICODE analyses.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: list of datasets.
    :param datasets_rarefs: per-dataset rarefaction depth labels.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """
    job_folder2 = get_job_folder(i_datasets_folder, 'deicode/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)
    # jobs = []
    all_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        out_sh = '%s/run_deicode_%s_%s%s.sh' % (job_folder2, prjct_nm, dat,
                                                filt_raref)
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            cur_raref = datasets_rarefs[dat][idx]
            tsv, meta = tsv_meta_pds
            meta_alphas = meta.replace('.tsv', '_alphas.tsv')
            meta_alphas_full = meta.replace('.tsv', '_alphas_full.tsv')
            if isfile(meta_alphas_full):
                meta = meta_alphas_full
            elif isfile(meta_alphas):
                meta = meta_alphas
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'DEICODE')
            odir = get_analysis_folder(i_datasets_folder,
                                       'deicode/%s%s' % (dat, cur_raref))
            for case_var, case_vals_list in cases_dict.items():
                cur_sh = '%s/run_beta_deicode_%s_%s%s_%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, case_var,
                    filt_raref)
                cur_sh = cur_sh.replace(' ', '-')
                all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                run_single_deicode(odir, tsv, meta_pd, case_var,
                                   case_vals_list, cur_sh, force)

    job_folder = get_job_folder(i_datasets_folder, 'deicode')
    main_sh = write_main_sh(
        job_folder, '3_run_beta_deicode_%s%s' % (filt_raref, prjct_nm),
        all_sh_pbs, '%s.dcd%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            if p_perm_groups.startswith('/panfs'):
                p_perm_groups = p_perm_groups.replace(os.getcwd(), '')
            print('# DEICODE (groups config in %s)' % p_perm_groups)
        else:
            print('# DEICODE')
        print_message('', 'sh', main_sh, jobs)
Example #15
 def read_meta_pd(self):
     meta_pd = read_meta_pd(self.meta[0], 'sample_name')
     self.metadata.append(meta_pd)
Example #16
    def songbird(self) -> None:
        """Main script for the creation of songbird jobs.
        It iterates over the rows of the table created
        upfront and over each combination of parameters
        and collects the output info for potential reuse
        in figure generation and post-analysis.

        Parameters
        ----------
        config : Class instance of AnalysesConfig
            Contains all the routine analyses config info.
        project
            Datasets.
        """
        cmds = {}
        mess = set()
        songbird = []
        dat_cmds, dat_bcmds = {}, {}
        params_pd = self.get_params_combinations()
        for r, row in self.songbirds.iterrows():
            qza, pair, meta_fp = row['qza'], row['pair'], row['meta']
            dat, filt, subset = row['dataset'], row['filter'], row['subset']
            if dat not in self.songbird_models:
                continue
            dat_pair, pair_dir = self.get_dat_pair_dir(dat, pair)
            meta_pd = read_meta_pd(meta_fp)
            models = self.check_metadata_models(meta_fp, meta_pd,
                                                self.songbird_models[dat])
            row_params_pd = params_pd.copy()
            self.process_params_combinations(dat, meta_pd, row_params_pd, mess)
            for p, params in row_params_pd.iterrows():
                params_dir = self.get_params_dir(params)
                baselines, model_baselines = {}, {'1': '1'}
                for modx, model in enumerate(models.keys()):
                    formula, meta_vars, drop = models[model]
                    datdir, odir, new_qza, new_meta = self.get_main_dirs(
                        pair_dir, filt, subset, params_dir, model, self.config)
                    self.write_new_meta(meta_pd, new_meta, meta_vars, drop,
                                        params)
                    if dat in self.models_baselines and model in \
                            self.models_baselines[dat]:
                        model_baselines = self.models_baselines[dat][model]
                    for mdx, model_baseline in enumerate(model_baselines):
                        bformula = model_baselines[model_baseline]
                        bodir = get_analysis_folder(
                            self.config.i_datasets_folder,
                            'songbird/%s/b-%s' % (datdir, model_baseline))
                        out_paths = self.get_out_paths(odir, bodir,
                                                       model_baseline,
                                                       baselines)
                        # convergence = self.check_stats_convergence(out_paths)
                        cmd, bcmd = songbird_cmd(qza, new_qza, new_meta,
                                                 params, formula, bformula,
                                                 out_paths)
                        songbird.append([
                            dat, filt,
                            '%s_%s' % (params_dir.replace('/', '__'), model),
                            subset, out_paths['diff'], model_baseline,
                            out_paths['html'], pair
                        ])
                        if cmd:
                            dat_cmds.setdefault(dat, []).append(cmd)
                        if bcmd:
                            dat_bcmds.setdefault(dat, []).append(bcmd)

        for dat in dat_bcmds:
            # first come the scripts generating (reused) baselines models
            if dat_bcmds[dat]:
                cmds.setdefault(dat, []).extend(dat_bcmds[dat])
        for dat in dat_cmds:
            # and then the scripts generating the actual models
            if dat_cmds[dat]:
                cmds.setdefault(dat, []).extend(dat_cmds[dat])
        if songbird:
            self.get_songbird_pd(songbird)

        self.show_models_issues()
        self.register_command('songbird', cmds)
        self.summarize_songbirds()
        self.create_songbird_feature_metadata()
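The docstring above says songbird() iterates over every combination of parameters. As a minimal, stand-alone sketch of that step (not the package's own get_params_combinations, whose internals are not shown here), a parameter grid can be expanded into one row per run with itertools.product:

import itertools
import pandas as pd

# Hypothetical songbird-style parameter grid; real values come from the config.
params = {
    'batches': [20, 40],
    'learns': [1e-3, 1e-4],
    'epochs': [2000],
    'diff_priors': [0.5, 1.0],
}
# One dict per combination, then a DataFrame with one row per songbird run.
rows = [dict(zip(params, combo))
        for combo in itertools.product(*params.values())]
params_pd = pd.DataFrame(rows)
print(params_pd.shape)  # (8, 4): 2 x 2 x 1 x 2 combinations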
def run_procrustes(i_datasets_folder: str, datasets_filt: dict,
                   p_procrustes: str, betas: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str,
                   filt_only: bool, eval_depths: dict, jobs: bool,
                   chunkit: int) -> None:
    """
    """
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        procrustes_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths, key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                procrustes_pairs['%s_%s' % (n0, n1)] = [x, y]
        procrustes_subsets = {'ALL': [[]]}
    else:
        procrustes_pairs, procrustes_subsets = get_procrustes_mantel_dicts(
            p_procrustes)
    get_job_folder(i_datasets_folder, 'procrustes%s' % evaluation)
    dms_tab = []
    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in procrustes_pairs.items():

        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt, filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt, filt_only)

        if check_dat_exists(betas, dat1, missing_dats) or check_dat_exists(
                betas, dat2, missing_dats):
            continue

        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]

        job_folder2 = get_job_folder(
            i_datasets_folder,
            'procrustes%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_procrustes_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)

        for metric, groups_metas_qzas_dms_trees1 in metrics_groups_metas_qzas_dms_trees1.items(
        ):
            if split:
                out_sh = '%s/run_procrustes_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = metrics_groups_metas_qzas_dms_trees2[
                metric]
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(*[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_

                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]

                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(dat1, dm1, meta1, raref1,
                                                 metric, i_datasets_folder,
                                                 skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(dat2, dm2, meta2, raref2,
                                                 metric, i_datasets_folder,
                                                 skip)
                if skip:
                    print(
                        '[Procrustes] One desired rarefaction depth not run (pair %s)'
                        % pair)
                    continue

                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(
                    set(meta_pd1.sample_name) & set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue

                meta_pd = meta_pd1.loc[meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(
                    meta1, meta_pd, dict(procrustes_subsets), 'procrustes')
                odir = get_analysis_folder(
                    i_datasets_folder, 'procrustes%s/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder, 'procrustes%s/chunks/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))
                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(case_vals, case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_procrustes%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((pair, out_sh),
                                              []).append(cur_sh)

                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_, cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_, cur)
                        dm_out1_tsv = '%s.tsv' % splitext(dm_out1)[0]
                        dm_out2_tsv = '%s.tsv' % splitext(dm_out2)[0]
                        biplot = '%s/procrustes%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel('procrustes', odir, dm1,
                                                     dm2, meta_pd, dm_out1,
                                                     dm_out2, biplot, cur_sh,
                                                     cur, case_var, case_vals,
                                                     force)
                        dms_tab.append([
                            pair, dat1_, dat2_, group1, group2, case_, metric,
                            dm_out1_tsv, dm_out2_tsv
                        ])

    job_folder = get_job_folder(i_datasets_folder, 'procrustes%s' % evaluation)
    main_sh = write_main_sh(
        job_folder,
        '4_run_procrustes_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs, '%s.prcst%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit)
    if main_sh:
        if p_procrustes and p_procrustes != 1:
            if p_procrustes.startswith('/panfs'):
                p_procrustes = p_procrustes.replace(os.getcwd(), '')
            print('# Procrustes (pairs and samples subsets config in %s)' %
                  p_procrustes)
        else:
            print('# Procrustes')
        print_message('', 'sh', main_sh, jobs)

    # column order matches the rows appended to dms_tab above
    dms_tab_pd = pd.DataFrame(dms_tab,
                              columns=[
                                  'pair',
                                  'dat1',
                                  'dat2',
                                  'group1',
                                  'group2',
                                  'case',
                                  'metric',
                                  'dm_out1',
                                  'dm_out2',
                              ])

    odir = get_analysis_folder(i_datasets_folder,
                               'procrustes%s/R' % evaluation)
    out_Rs = glob.glob('%s/pairs_proscrustes_results%s%s*.tsv' %
                       (odir, evaluation, filt_raref))
    if len(out_Rs):
        done_R = pd.concat([pd.read_table(x, sep=' ') for x in out_Rs])
        dms_tab_pd = dms_tab_pd.loc[~dms_tab_pd[['dm_out1', 'dm_out2']].sum(1).
                                    isin(done_R[['f1', 'f2']].sum(1))]

    if dms_tab_pd.shape[0]:
        fp_num = 0
        if len(out_Rs):
            last = sorted(
                out_Rs, key=lambda fp: int(fp.split('.tsv')[0].split('_')[-1]))
            fp_num = int(last[-1].split('.tsv')[0].split('_')[-1]) + 1

        dms_tab_fp = '%s/pairs%s%s_%s.tsv' % (odir, evaluation, filt_raref,
                                              fp_num)
        dms_tab_pd.to_csv(dms_tab_fp, index=False, sep='\t')
        out_R = '%s/pairs_proscrustes_results%s%s_%s.tsv' % (
            odir, evaluation, filt_raref, fp_num)
        job_folder = get_job_folder(i_datasets_folder, 'procrustes/R')
        R_script = '%s/4_run_procrustes_%s%s.R' % (job_folder, prjct_nm,
                                                   filt_raref)
        with open(R_script, 'w') as o:
            o.write("library(vegan)\n")
            o.write("dms_files <- read.table('%s', h=T)\n" % dms_tab_fp)
            o.write(
                "cols <- c('pair', 'd1', 'd2', 'g1', 'g2', 'case', 'metric', 'f1', 'f2', 'samples', 'M2', 'p-value')\n"
            )
            o.write(
                "res <- setNames(data.frame(matrix(ncol = 12, nrow = 0)), cols)\n"
            )
            o.write("for (i in seq(1, dim(dms_files)[1])) {\n")
            o.write("    row <- as.vector(unlist(dms_files[i,]))\n")
            o.write("    pair <- row[1]\n")
            o.write("    d1 <- row[2]\n")
            o.write("    d2 <- row[3]\n")
            o.write("    group1 <- row[4]\n")
            o.write("    group2 <- row[5]\n")
            o.write("    case <- row[6]\n")
            o.write("    metric <- row[7]\n")
            o.write("    f1 <- row[8]\n")
            o.write("    f2 <- row[9]\n")
            o.write("    if (sum(file.exists(f1, f2)) == 2) {\n")
            o.write(
                "        filin_tsv_pd1 <- read.csv(f1, header = TRUE, check.names=FALSE,\n"
            )
            o.write(
                "                                  row.names = 1, colClasses = 'character', sep = '\\t')\n"
            )
            o.write(
                "        filin_tsv_pd2 <- read.csv(f2, header = TRUE, check.names=FALSE,\n"
            )
            o.write(
                "                                  row.names = 1, colClasses = 'character', sep = '\\t')\n"
            )
            o.write("        filin_tsv_pd1 <- data.matrix(filin_tsv_pd1)\n")
            o.write("        filin_tsv_pd2 <- data.matrix(filin_tsv_pd2)\n")
            o.write(
                "        filin_tsv_pd1 <- filin_tsv_pd1[rownames(filin_tsv_pd2), rownames(filin_tsv_pd2)]\n"
            )
            o.write(
                "        # procrustes12 <- procrustes(filin_tsv_pd1, filin_tsv_pd2, kind=2, permutations=999)\n"
            )
            o.write(
                "        prtst <- protest(filin_tsv_pd1, filin_tsv_pd2, permutations = 999)\n"
            )
            o.write("        n <- dim(filin_tsv_pd1)[1]\n")
            o.write(
                "        res[i,] <- c(pair, d1, d2, group1, group2, case, metric, f1, f2, n, prtst$ss, prtst$signif)\n"
            )
            o.write("    }\n")
            o.write("}\n")
            o.write("write.table(x = res, file = '%s')\n" % out_R)

        out_sh = '%s/4_run_procrustes_%s%s_R%s.sh' % (job_folder, prjct_nm,
                                                      evaluation, filt_raref)
        out_pbs = '%s.pbs' % splitext(out_sh)[0]
        with open(out_sh, 'w') as o:
            o.write('R -f %s --vanilla\n' % R_script)

        run_xpbs(
            out_sh, out_pbs,
            '%s.prcrt%s.R%s' % (prjct_nm, evaluation, filt_raref), 'renv',
            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
            run_params["mem_num"], run_params["mem_dim"], chmod, 1,
            '# Procrustes for stats in R (pairs and samples subsets config in %s)'
            % p_procrustes, None, False, jobs)
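The R script written above calls vegan's protest() on each pair of distance matrices and records the Procrustes M2 and its permutation p-value. As a rough Python counterpart (a sketch only: it uses scipy, which returns the disparity but no permutation test, and it assumes the two TSVs written by this pipeline share sample identifiers), the same comparison could be run as:

import pandas as pd
from scipy.spatial import procrustes

def procrustes_disparity(dm_tsv1: str, dm_tsv2: str) -> float:
    # Hypothetical helper: read two distance matrices, align the first on the
    # second's sample order (assumes the first contains all of those samples),
    # and return the Procrustes disparity (analogous to vegan's M2, without
    # the permutation p-value).
    dm1 = pd.read_csv(dm_tsv1, sep='\t', index_col=0)
    dm2 = pd.read_csv(dm_tsv2, sep='\t', index_col=0)
    samples = dm2.index.tolist()
    dm1 = dm1.loc[samples, samples]
    _, _, disparity = procrustes(dm1.values, dm2.values)
    return disparity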
def run_phate(p_phate_config: str, i_datasets_folder: str, datasets: dict,
              datasets_rarefs: dict, force: bool, prjct_nm: str,
              qiime_env: str, chmod: str, noloc: bool, slurm: bool,
              split: bool, run_params: dict, filt_raref: str, jobs: bool,
              chunkit: int) -> dict:

    job_folder2 = get_job_folder(i_datasets_folder, 'phate/chunks')
    phate_dicts = get_phate_dicts(p_phate_config)
    phate_filtering, phate_labels, phate_params, main_cases_dict = phate_dicts

    phates = {}
    all_sh_pbs = {}
    all_import_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        phates[dat] = []
        if dat in phate_filtering:
            filters = phate_filtering[dat]
        else:
            filters = {'0_0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'phate')
            cur_raref = datasets_rarefs[dat][idx]
            if not split:
                out_sh = '%s/run_phate_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref)
                out_import_sh = '%s/run_import_phate_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref)
            odir = get_analysis_folder(i_datasets_folder, 'phate/%s' % dat)
            raref_phates = {}
            for filt, (fp, fa) in filters.items():
                raref_phates[filt] = {}
                if split:
                    out_sh = '%s/run_phate_%s_%s%s%s%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        filt)
                    out_import_sh = '%s/run_import_phate_%s_%s%s%s%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        filt)
                for case_var, case_vals_list in cases_dict.items():
                    cur_sh = '%s/run_phate_%s_%s%s%s_%s_%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        case_var, filt)
                    cur_sh = cur_sh.replace(' ', '-')
                    cur_import_sh = '%s/run_import_phate_%s_%s%s%s_%s_%s.sh' % (
                        job_folder2, prjct_nm, dat, filt_raref, cur_raref,
                        case_var, filt)
                    cur_import_sh = cur_import_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                    all_import_sh_pbs.setdefault((dat, out_import_sh),
                                                 []).append(cur_import_sh)
                    phate = run_single_phate(dat, odir, tsv, meta_pd, case_var,
                                             phate_labels, phate_params,
                                             run_params, case_vals_list,
                                             cur_sh, cur_import_sh, force,
                                             filt, cur_raref, fp, fa)
                    raref_phates[filt][case_var] = phate
            phates[dat].append(raref_phates)

    job_folder = get_job_folder(i_datasets_folder, 'phate')
    main_sh = write_main_sh(
        job_folder, '3_run_import_phate_%s%s' % (prjct_nm, filt_raref),
        all_import_sh_pbs, '%s.mrt.pht%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit)
    if main_sh:
        if p_phate_config:
            if p_phate_config.startswith('/panfs'):
                p_phate_config = p_phate_config.replace(os.getcwd(), '')
            print('# Import for PHATE (groups config in %s)' % p_phate_config)
        else:
            print('# Import for PHATE')
        print_message('', 'sh', main_sh, jobs)

    main_sh = write_main_sh(
        job_folder, '3_run_phate_%s%s' % (prjct_nm, filt_raref), all_sh_pbs,
        '%s.pht%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], 'xphate', chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_phate_config:
            if p_phate_config.startswith('/panfs'):
                p_phate_config = p_phate_config.replace(os.getcwd(), '')
            print('# PHATE (groups config in %s)' % p_phate_config)
        else:
            print('# PHATE')
        print_message('', 'sh', main_sh, jobs)
    return phates
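run_phate() above only writes the import and run scripts; the embedding itself is left to the jobs it generates. For orientation, a generic PHATE embedding with the phate Python package looks roughly as follows (a sketch under the assumption of a samples-by-features table; it is not necessarily how this pipeline's run_single_phate invokes PHATE, and the file path and parameter values are illustrative):

import pandas as pd
import phate  # https://github.com/KrishnaswamyLab/PHATE

# Hypothetical input: a features-by-samples table, transposed to samples-by-features.
table = pd.read_csv('table.tsv', sep='\t', index_col=0).T
phate_op = phate.PHATE(n_components=2, knn=5, decay=15, t='auto')
embedding = phate_op.fit_transform(table)  # (n_samples, 2) coordinates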
def run_alpha_group_significance(i_datasets_folder: str, datasets: dict,
                                 diversities: dict, datasets_rarefs: dict,
                                 p_perm_groups: str, force: bool,
                                 prjct_nm: str, qiime_env: str, chmod: str,
                                 noloc: bool, slurm: bool, As: tuple,
                                 split: bool, run_params: dict,
                                 filt_raref: str, jobs: bool,
                                 chunkit: int) -> None:
    """
    Run alpha-group-significance: Alpha diversity comparisons.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/alpha-group-significance/
    Main per-dataset looper for the Kruskal-Wallis tests on alpha diversity vectors.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: datasets to process (dict).
    :param diversities: alpha diversity QIIME 2 Artifacts per dataset.
    :param p_perm_groups: path to the subsets file.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nickname for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (default: 775).
    """

    job_folder2 = get_job_folder(i_datasets_folder,
                                 'alpha_group_significance/chunks')
    # alpha_metrics = get_metrics('alpha_metrics', As)
    main_cases_dict = get_main_cases_dict(p_perm_groups)

    procs = []  # multiprocessing workers (kept distinct from the boolean `jobs` flag)
    all_sh_pbs = {}
    first_print = 0

    for dat, tsv_meta_pds_ in datasets.items():
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            meta = tsv_meta_pds[1]
            cur_raref = datasets_rarefs[dat][idx]
            raref_diversities = diversities[dat][idx]

            presence_mat = [
                1 for (qza, metric) in raref_diversities[''] if isfile(qza)
            ]
            if not presence_mat:
                if not first_print:
                    print(
                        'Alpha diversity must be measured already to automatise Kruskal-Wallis tests\n'
                        '\t(re-run this after step "1_run_alpha.sh" is done)')
                    first_print += 1
                continue

            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'alpha Kruskal-Wallis')

            odir = get_analysis_folder(
                i_datasets_folder,
                'alpha_group_significance/%s%s' % (dat, cur_raref))
            for (qza, metric) in raref_diversities['']:
                # metric = get_metric(alpha_metrics, qza)
                div_tsv = '%s.tsv' % splitext(qza)[0]
                if not isfile(div_tsv):
                    print(
                        '  [KRUSKAL-WALLIS] metric %s not calculated\nSkipping it...'
                        % metric)
                    continue
                out_sh = '%s/run_alpha_group_significance_%s%s_%s%s.sh' % (
                    job_folder2, dat, cur_raref, metric, filt_raref)
                for case_var, case_vals_list in cases_dict.items():
                    cur_sh = '%s/run_alpha_group_significance_%s%s_%s_%s%s.sh' % (
                        job_folder2, dat, cur_raref, metric, case_var,
                        filt_raref)
                    cur_sh = cur_sh.replace(' ', '-')
                    all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                    p = multiprocessing.Process(target=run_multi_kw,
                                                args=(odir, meta_pd, qza,
                                                      case_vals_list, case_var,
                                                      cur_sh, force))
                    p.start()
                    procs.append(p)
    for proc in procs:
        proc.join()

    job_folder = get_job_folder(i_datasets_folder, 'alpha_group_significance')
    main_sh = write_main_sh(
        job_folder,
        '6_run_alpha_group_significance_%s%s' % (filt_raref, prjct_nm),
        all_sh_pbs, '%s.kv%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit)
    if main_sh:
        if p_perm_groups:
            print("# Kruskal-Wallis on alpha diversity (groups config in %s)" %
                  p_perm_groups)
        else:
            print("# Kruskal-Wallis on alpha diversity")
        print_message('', 'sh', main_sh, jobs)
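The function above only assembles the qiime diversity alpha-group-significance jobs; the statistic behind them is a Kruskal-Wallis test of an alpha-diversity vector across the levels of a metadata column. A minimal stand-alone sketch with scipy (hypothetical file layout and helper name, not this package's code):

import pandas as pd
from scipy.stats import kruskal

def alpha_kruskal(alpha_tsv: str, meta_tsv: str, group_col: str) -> float:
    # Hypothetical helper: alpha_tsv has sample IDs and one diversity column
    # (e.g. shannon); meta_tsv is a sample metadata table.
    alpha = pd.read_csv(alpha_tsv, sep='\t', index_col=0).iloc[:, 0]
    meta = pd.read_csv(meta_tsv, sep='\t', index_col=0)
    merged = meta.join(alpha.rename('alpha'), how='inner')
    groups = [grp['alpha'].astype(float).values
              for _, grp in merged.groupby(group_col) if len(grp) > 1]
    stat, pval = kruskal(*groups)  # requires at least two groups
    return pval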
Example #20
0
def run_sourcetracking(i_datasets_folder: str, datasets: dict,
                       p_sourcetracking_config: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                       noloc: bool, slurm: bool, run_params: dict,
                       filt_raref: str, split: bool, jobs: bool,
                       chunkit: int) -> None:

    job_folder2 = get_job_folder(i_datasets_folder, 'sourcetracking/chunks')
    sourcetracking_dicts = get_sourcetracking_config(p_sourcetracking_config)
    sourcetracking_sourcesink = sourcetracking_dicts[0]
    sourcetracking_filtering = sourcetracking_dicts[1]
    sourcetracking_params = sourcetracking_dicts[2]
    main_cases_dict = sourcetracking_dicts[3]

    all_sh_pbs = {}
    all_import_sh_pbs = {}
    for dat, tsv_meta_pds_ in datasets.items():
        if dat in sourcetracking_filtering:
            filters = sourcetracking_filtering[dat]
        else:
            filters = {'0_0': ['0', '0']}
        for idx, tsv_meta_pds in enumerate(tsv_meta_pds_):
            tsv, meta = tsv_meta_pds
            meta_pd = read_meta_pd(meta)
            meta_pd = meta_pd.set_index('sample_name')
            cases_dict = check_metadata_cases_dict(meta, meta_pd,
                                                   dict(main_cases_dict),
                                                   'sourcetracking')
            cur_raref = datasets_rarefs[dat][idx]
            out_import_sh = '%s/run_import_sourcetracking_%s_%s%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref, cur_raref)
            imports = set()
            odir = get_analysis_folder(i_datasets_folder,
                                       'sourcetracking/%s' % dat)
            for method in sourcetracking_params['method']:
                out_sh = '%s/run_sourcetracking_%s_%s%s%s_%s.sh' % (
                    job_folder2, prjct_nm, dat, filt_raref, cur_raref, method)
                for case_var, case_vals_list in cases_dict.items():
                    for filt, (fp, fa) in filters.items():
                        cur_sh = '%s/run_sourcetracking_%s_%s_%s%s%s_%s_%s.sh' % (
                            job_folder2, prjct_nm, dat, case_var, filt_raref,
                            cur_raref, method, filt)
                        cur_sh = cur_sh.replace(' ', '-')
                        cur_import_sh = '%s/run_import_sourcetracking_%s_%s_%s%s%s_%s_%s.sh' % (
                            job_folder2, prjct_nm, dat, case_var, filt_raref,
                            cur_raref, method, filt)
                        cur_import_sh = cur_import_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((dat, out_sh), []).append(cur_sh)
                        all_import_sh_pbs.setdefault((dat, out_import_sh),
                                                     []).append(cur_import_sh)
                        run_single_sourcetracking(
                            odir, tsv, meta_pd, case_var,
                            sourcetracking_params, method, imports,
                            sourcetracking_sourcesink, case_vals_list, cur_sh,
                            cur_import_sh, force, filt, cur_raref, fp, fa,
                            run_params["n_nodes"], run_params["n_procs"])

    job_folder = get_job_folder(i_datasets_folder, 'sourcetracking')
    main_sh = write_main_sh(
        job_folder,
        '3_run_import_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_import_sh_pbs, '%s.mpt.srctrk%s' % (prjct_nm, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit, '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# import sourcetracking (groups config in %s)' %
                  p_sourcetracking_config)
        else:
            print('# import sourcetracking')
        print_message('', 'sh', main_sh, jobs)

    main_sh = write_main_sh(
        job_folder, '3_run_sourcetracking_%s%s' % (prjct_nm, filt_raref),
        all_sh_pbs, '%s.srctrk%s' % (prjct_nm, filt_raref), run_params["time"],
        run_params["n_nodes"], run_params["n_procs"], run_params["mem_num"],
        run_params["mem_dim"], qiime_env, chmod, noloc, slurm, jobs, chunkit,
        '~/.')
    if main_sh:
        if p_sourcetracking_config:
            if p_sourcetracking_config.startswith('/panfs'):
                p_sourcetracking_config = p_sourcetracking_config.replace(
                    os.getcwd(), '')
            print('# sourcetracking (groups config in %s)' %
                  p_sourcetracking_config)
        else:
            print('# sourcetracking')
        print_message('', 'sh', main_sh, jobs)
 def read_data_pd(self):
     data_pd = read_meta_pd(self.tsv[0], '#OTU ID').set_index('#OTU ID')
     self.data.append(data_pd)