# Imports assumed by the snippets below; project helpers (get_case,
# get_new_meta_pd, read_meta_pd, the write_* command builders, etc.) come
# from the pipeline's own modules.
import os
import glob
import random
import itertools
import pandas as pd
from os.path import basename, isdir, isfile, splitext


def run_single_phate(dat: str, odir: str, tsv: str, meta_pd: pd.DataFrame,
                     case_var: str, phate_labels: list, phate_params: dict,
                     run_params: dict, case_vals_list: list, cur_sh: str,
                     cur_import_sh: str, force: bool, filt: str,
                     cur_raref: str, fp: str, fa: str) -> dict:

    remove = True
    qza = '%s.qza' % splitext(tsv)[0]
    cases = {}
    with open(cur_sh, 'w') as cur_sh_o, open(cur_import_sh,
                                             'w') as cur_import_sh_o:
        for case_vals in case_vals_list:
            case = get_case(case_vals, '', case_var)
            cur_rad = '%s/%s_%s%s' % (odir, case.strip('_'), filt, cur_raref)
            if not isdir(cur_rad):
                os.makedirs(cur_rad)
            new_meta = '%s/meta.tsv' % cur_rad
            new_qza = '%s/tab.qza' % cur_rad
            new_tsv = '%s/tab.tsv' % cur_rad
            phate_html = '%s/phate_%s_%s_%s.html' % (cur_rad, dat, filt, case)
            phate_tsv = '%s_xphate.tsv' % splitext(phate_html)[0]
            if len(glob.glob('%s/TOO_FEW.*' % cur_rad)):
                continue
            cases[case] = phate_tsv
            if force or not isfile(phate_html) or not isfile(phate_tsv):
                new_meta_pd = get_new_meta_pd(meta_pd, case, case_var,
                                              case_vals)
                new_meta_pd.reset_index().to_csv(new_meta,
                                                 index=False,
                                                 sep='\t')
                write_phate_cmd(qza, new_qza, new_tsv, new_meta, fp, fa,
                                phate_html, phate_labels, phate_params,
                                run_params["n_nodes"], run_params["n_procs"],
                                cur_sh_o, cur_import_sh_o)
                remove = False
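
A hedged usage sketch for run_single_phate above: all paths and values are hypothetical placeholders, and the phate_params keys (knn/decay/t) are assumed from the PHATE-cluster handling in run_single_doc further down.

meta_pd = pd.read_csv('datasetA/meta.tsv', sep='\t').set_index('sample_name')
cases = run_single_phate(
    dat='datasetA', odir='analysis/phate/datasetA', tsv='datasetA/tab.tsv',
    meta_pd=meta_pd, case_var='body_site',
    phate_labels=['body_site'],                          # metadata columns to label points
    phate_params={'knn': [5], 'decay': [15], 't': [5]},  # assumed parameter grid
    run_params={'n_nodes': '1', 'n_procs': '4'},
    case_vals_list=[['gut'], ['skin']],                  # one PHATE run per subset
    cur_sh='run_phate_datasetA.sh',
    cur_import_sh='import_phate_datasetA.sh',
    force=False, filt='0-0', cur_raref='_raref1000',
    fp='fp', fa='fa')                                    # opaque tokens passed to write_phate_cmd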
def run_distance_decay(i_datasets_folder: str, betas: dict,
                       p_distance_decay: str, datasets_rarefs: dict,
                       force: bool, prjct_nm: str, qiime_env: str, chmod: str,
                       noloc: bool, slurm: bool, split: bool, run_params: dict,
                       filt_raref: str, jobs: bool,
                       chunkit: int) -> (dict, list):

    job_folder2 = get_job_folder(i_datasets_folder, 'decay/chunks')
    decay_config = read_yaml_file(p_distance_decay)
    subsets, modes, params = get_decay_config(decay_config)

    all_sh_pbs = {}
    decay_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_decay_%s_%s%s.sh' % (job_folder2, prjct_nm, dat,
                                                  filt_raref)
        decay_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            decay_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'decay/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_decay_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for metric, groups_metas_qzas_dms_trees in metrics_groups_metas_qzas_dms_trees.items(
            ):
                for group, metas_qzas_mat_qzas_trees in groups_metas_qzas_dms_trees.items(
                ):
                    for (meta, qza, mat_qza,
                         tree) in metas_qzas_mat_qzas_trees:
                        meta_pd = read_meta_pd(meta).set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(subsets), 'decay')
                        for case_var, case_vals_list in cases_dict.items():
                            for case_vals in case_vals_list:
                                case = get_case(case_vals,
                                                case_var).replace(' ', '_')
                                cur_sh = '%s/run_decay_%s%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_raref, metric, group,
                                    case, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh),
                                                      []).append(cur_sh)
                                new_meta_pd = get_new_meta_pd(
                                    meta_pd, case, case_var, case_vals)
                                res = run_single_decay(
                                    odir, group, new_meta_pd, cur_sh, mat_qza,
                                    case, modes, force, run_params["n_nodes"],
                                    run_params["n_procs"],
                                    int(params['iteration']),
                                    int(params['step']))
                                decay_raref[(metric, group, case)] = res
            decay_res[dat].append(decay_raref)
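
A minimal sketch (not from the source) of the nested `betas` structure that run_distance_decay and the other beta-diversity loopers iterate over, inferred from the loops above; every name and path is a hypothetical placeholder.

betas_example = {
    'datasetA': [                                 # one entry per rarefaction
        {'braycurtis': {                          # beta diversity metric
            '': [('datasetA/meta.tsv',            # '' = full dataset, no group subset
                  'datasetA/tab.qza',
                  'datasetA/braycurtis_DM.qza',   # distance matrix artifact
                  None)]}}                        # phylogenetic tree (if any)
    ]
}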
Example #3
def merge_subsets_apply(self):
    subsets_fp = [[pair, var, subset,
                   get_case(subset, var)]
                  for var, subsets in self.subsets.items()
                  for subset in subsets
                  for pair in self.mmvecs.pair.unique()]
    if subsets_fp:
        subsets = pd.DataFrame(
            subsets_fp, columns=['pair', 'variable', 'factors', 'subset'])
        self.mmvecs = self.mmvecs.merge(subsets, on=['pair'], how='outer')
Example #4
def run_single_deicode(odir: str, tsv: str, meta_pd: pd.DataFrame,
                       case_var: str, case_vals_list: list, cur_sh: str,
                       force: bool) -> None:
    """
    Performs robust center log-ratio transform robust PCA and
    ranks the features by the loadings of the resulting SVD.
    https://library.qiime2.org/plugins/deicode/19/
    (in-loop function).

    :param odir: output analysis directory.
    :param tsv: features table input to the beta diversity matrix.
    :param meta_pd: metadata table.
    :param case_var: metadata variable to make groups from.
    :param case_vals_list: groups for the metadata variable.
    :param cur_sh: input bash script file.
    :param force: Force the re-writing of scripts for all commands.
    """

    remove = True
    qza = '%s.qza' % splitext(tsv)[0]
    with open(cur_sh, 'w') as cur_sh_o:
        for case_vals in case_vals_list:
            case = get_case(case_vals, '', case_var)
            cur_rad = odir + '/' + basename(tsv).replace('.tsv', '_%s' % case)
            new_meta = '%s.meta' % cur_rad
            new_mat_qza = '%s_DM.qza' % cur_rad
            new_qza = '%s.qza' % cur_rad
            ordi_qza = '%s_deicode_ordination.qza' % cur_rad
            ordi_qzv = '%s_deicode_ordination_biplot.qzv' % cur_rad
            if force or not isfile(ordi_qzv):
                new_meta_pd = get_new_meta_pd(meta_pd, case, case_var,
                                              case_vals)
                if new_meta_pd.shape[0] < 10:
                    continue
                new_meta_pd.reset_index().to_csv(new_meta,
                                                 index=False,
                                                 sep='\t')
                write_deicode_biplot(qza, new_meta, new_qza, ordi_qza,
                                     new_mat_qza, ordi_qzv, cur_sh_o)
                remove = False
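
A minimal usage sketch for run_single_deicode above; file paths and the subset definition are hypothetical.

meta_pd = pd.read_csv('datasetA/meta.tsv', sep='\t').set_index('sample_name')
run_single_deicode(
    odir='analysis/deicode/datasetA',
    tsv='datasetA/tab.tsv',                  # a matching datasetA/tab.qza is expected
    meta_pd=meta_pd,
    case_var='body_site',
    case_vals_list=[['gut'], ['skin']],      # one RPCA biplot per subset (>= 10 samples)
    cur_sh='run_deicode_datasetA.sh',
    force=False)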
def merge_subsets_apply(self):
    subsets_fp = [[dataset, var, subset,
                   get_case(subset, var), '']
                  for var, subsets in self.songbird_subsets.items()
                  for subset in subsets
                  for dataset in self.songbirds.dataset.unique()]
    if subsets_fp:
        subsets = pd.DataFrame(
            subsets_fp,
            columns=['dataset', 'variable', 'factors', 'subset', 'pair'])
        self.songbirds = self.songbirds.merge(subsets,
                                              on=['dataset'],
                                              how='outer')
def run_multi_kw(odir: str, meta_pd: pd.DataFrame, div_qza: str,
                 case_vals_list: list, case_var: str, cur_sh: str,
                 force: bool) -> None:
    """
    Run alpha-group-significance: Alpha diversity comparisons.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/alpha-group-significance/
    (in-loop function).

    :param odir: output analysis directory.
    :param meta_pd: metadata table.
    :param div_qza:
    :param case_vals_list:
    :param metric:
    :param case_var:
    :param cur_sh: input bash script file.
    :param force: Force the re-writing of scripts for all commands.
    """
    remove = True
    with open(cur_sh, 'w') as cur_sh_o:
        for case_vals in case_vals_list:
            case = get_case(case_vals, case_var)
            cur_rad = odir + '/' + basename(div_qza).replace(
                '.qza', '_%s' % case)
            new_qzv = '%s_kruskal-wallis.qzv' % cur_rad
            if force or not isfile(new_qzv):
                new_meta = '%s.meta' % cur_rad
                new_meta_pd = get_new_meta_pd(meta_pd, case, case_var,
                                              case_vals)
                new_meta_pd.reset_index().to_csv(new_meta,
                                                 index=False,
                                                 sep='\t')
                new_div = get_new_alpha_div(case, div_qza, cur_rad,
                                            new_meta_pd, cur_sh_o)
                write_alpha_group_significance_cmd(new_div, new_meta, new_qzv,
                                                   cur_sh_o)
                remove = False
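
A minimal usage sketch for run_multi_kw above; paths and subset values are hypothetical.

meta_pd = pd.read_csv('datasetA/meta.tsv', sep='\t').set_index('sample_name')
run_multi_kw(
    odir='analysis/alpha_group_significance/datasetA',
    meta_pd=meta_pd,
    div_qza='datasetA/alpha/shannon.qza',    # alpha diversity vector artifact
    case_vals_list=[['gut'], ['skin']],
    case_var='body_site',
    cur_sh='run_kruskal_wallis_datasetA.sh',
    force=False)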
Example #7
def run_single_adonis(odir: str, subset: str, case_vals_list: list, metric: str,
                      case_var: str, form: str, formula: str, qza: str, mat_qza: str,
                      meta_pd: pd.DataFrame, cur_sh: str, force: bool) -> None:
    """
    Run adonis: adonis PERMANOVA test for beta group significance.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/adonis/
    (in-loop function).

    :param odir: output analysis directory.
    :param case_vals_list:
    :param metric:
    :param case_var:
    :param form:
    :param formula:
    :param tsv: features table input to the beta diversity matrix.
    :param mat_qza:
    :param meta_pd: metadata table.
    :param cur_sh: input bash script file.
    :param force: Force the re-writing of scripts for all commands.
    """
    remove = True
    with open(cur_sh, 'w') as cur_sh_o:
        for case_vals in case_vals_list:
            case = '%s__%s' % (metric, get_case(case_vals, case_var, form))
            if subset:
                cur_rad = '%s/%s_%s_%s' % (odir, splitext(basename(qza))[0], subset, case)
            else:
                cur_rad = '%s/%s_%s' % (odir, splitext(basename(qza))[0], case)
            new_meta = '%s.meta' % cur_rad
            new_qzv = '%s_adonis.qzv' % cur_rad
            new_mat_qza = '%s/%s' % (odir, basename(mat_qza).replace('.qza', '_%s.qza' % case))
            new_meta_pd = get_new_meta_pd(meta_pd, case, case_var, case_vals)
            new_meta_pd.reset_index().to_csv(new_meta, index=False, sep='\t')
            if force or not isfile(new_qzv):
                write_diversity_adonis(new_meta, mat_qza, new_mat_qza,
                                       formula, new_qzv, cur_sh_o)
                remove = False
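
A minimal usage sketch for run_single_adonis above; paths, formula and subset values are hypothetical.

meta_pd = pd.read_csv('datasetA/meta.tsv', sep='\t').set_index('sample_name')
run_single_adonis(
    odir='analysis/adonis/datasetA',
    subset='',                               # no extra subset label
    case_vals_list=[['gut', 'skin']],
    metric='braycurtis',
    case_var='body_site',
    form='site_sex',                         # label used in the case name
    formula='body_site+sex',                 # adonis formula
    qza='datasetA/tab.qza',
    mat_qza='datasetA/braycurtis_DM.qza',
    meta_pd=meta_pd,
    cur_sh='run_adonis_datasetA.sh',
    force=False)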
def get_common_datasets(i_datasets_folder: str, mmvec_pairs: dict,
                        filtering: dict, filt_datasets: dict,
                        common_datasets_done: dict, input_to_filtered: dict,
                        train_test_dict: dict, force: bool,
                        subsets: dict) -> (dict, list):
    """
    :param i_datasets_folder:
    :param mmvec_pairs:
    :param filt_datasets:
    :param force: Force the re-writing of scripts for all commands.
    :return:
    """
    common_jobs = []
    common_datasets = {}
    for pair, pair_datasets in mmvec_pairs.items():
        # print("pair, pair_datasets")
        # print(pair, pair_datasets)
        (omic1_, bool1), (omic2_, bool2) = pair_datasets
        if omic1_ not in input_to_filtered or omic2_ not in input_to_filtered:
            continue
        omic1 = input_to_filtered[omic1_]
        omic2 = input_to_filtered[omic2_]
        if (omic1, bool1) not in filt_datasets or (omic2,
                                                   bool2) not in filt_datasets:
            continue
        pair_filtering = filtering[pair]
        for case_var, case_vals_list in subsets.items():
            for case_vals in case_vals_list:
                case = get_case(case_vals, case_var)
                data_dir = get_analysis_folder(
                    i_datasets_folder,
                    'mmvec/common/data/%s/%s' % (pair, case))
                meta_dir = get_analysis_folder(
                    i_datasets_folder,
                    'mmvec/common/metadata/%s/%s' % (pair, case))
                for preval_abund, preval_abund_dats in sorted(
                        pair_filtering.items()):
                    preval_filt1, abund_filter1 = preval_abund_dats[(omic1_,
                                                                     bool1)]
                    preval_filt2, abund_filter2 = preval_abund_dats[(omic2_,
                                                                     bool2)]
                    filt1 = '%s_%s' % (preval_filt1, abund_filter1)
                    filt2 = '%s_%s' % (preval_filt2, abund_filter2)
                    if (case, preval_abund) not in filt_datasets[(omic1,
                                                                  bool1)]:
                        continue
                    if (case, preval_abund) not in filt_datasets[(omic2,
                                                                  bool2)]:
                        continue
                    tsv1, qza1, meta1, meta_pd1, sams1 = filt_datasets[(
                        omic1, bool1)][(case, preval_abund)]
                    tsv2, qza2, meta2, meta_pd2, sams2 = filt_datasets[(
                        omic2, bool2)][(case, preval_abund)]
                    common_sams = sorted(set(sams1) & set(sams2))
                    len_common_sams = len(common_sams)
                    if len_common_sams < 10:
                        print(
                            'Not enough samples: %s (%s) vs %s (%s) -> skipping'
                            % (omic1, filt1, omic2, filt2))
                        continue

                    meta_fp = '%s/meta_%s_%s_%s__%s_%s_%s__%s_%ss.tsv' % (
                        meta_dir, omic1, preval_filt1, abund_filter1, omic2,
                        preval_filt2, abund_filter2, pair, len_common_sams)
                    new_tsv1 = '%s/tab_%s_%s_%s__%s_%ss.tsv' % (
                        data_dir, omic1, preval_filt1, abund_filter1, pair,
                        len_common_sams)
                    new_qza1 = '%s.qza' % splitext(new_tsv1)[0]
                    new_tsv2 = '%s/tab_%s_%s_%s__%s_%ss.tsv' % (
                        data_dir, omic2, preval_filt2, abund_filter2, pair,
                        len_common_sams)
                    new_qza2 = '%s.qza' % splitext(new_tsv2)[0]
                    common_datasets.setdefault(pair, []).append([
                        meta_fp, omic1, omic2, filt1, filt2, new_tsv1,
                        new_tsv2, new_qza1, new_qza2, len_common_sams, case
                    ])
                    meta_subset1 = get_meta_common_sorted(
                        meta_pd1, common_sams)
                    meta_subset2 = get_meta_common_sorted(
                        meta_pd2, common_sams)
                    merge_and_write_metas(meta_subset1, meta_subset2, meta_fp,
                                          omic1, omic2, train_test_dict)
                    if meta_fp in common_datasets_done[pair]:
                        print('\t\t\t* [DONE]', pair, ':', omic1, filt1, omic2,
                              filt2)
                        continue
                    if force or not isfile(new_qza1):
                        cmd = filter_feature_table(qza1, new_qza1, meta_fp)
                        common_jobs.append(cmd)
                    if force or not isfile(new_tsv1):
                        cmd = run_export(new_qza1, new_tsv1, 'FeatureTable')
                        common_jobs.append(cmd)
                    if force or not isfile(new_qza2):
                        cmd = filter_feature_table(qza2, new_qza2, meta_fp)
                        common_jobs.append(cmd)
                    if force or not isfile(new_tsv2):
                        cmd = run_export(new_qza2, new_tsv2, 'FeatureTable')
                        common_jobs.append(cmd)
                    print('\t\t\t* [TODO]', pair, ':', omic1,
                          '[%s: %s]' % (filt1, meta_subset1.shape[0]), omic2,
                          '[%s: %s]' % (filt2, meta_subset2.shape[0]))
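
A sketch of the input structures get_common_datasets unpacks above (mmvec_pairs, filtering, filt_datasets); the key and value shapes are inferred from the code, while the actual names and thresholds are hypothetical.

mmvec_pairs_example = {
    'pair1': [('16S', False), ('metabolome', False)]}       # (dataset name, flag)
filtering_example = {
    'pair1': {'0-0': {('16S', False): ('0', '0'),           # (prevalence, abundance)
                      ('metabolome', False): ('0', '0')}}}
filt_datasets_example = {
    ('16S', False): {('case', '0-0'): (
        'tab.tsv', 'tab.qza', 'meta.tsv',
        pd.DataFrame(),                                      # filtered metadata table
        ['sample.1', 'sample.2'])}}                          # retained sample names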
def check_common_datasets(i_datasets_folder: str, mmvec_pairs: dict,
                          mmvec_filtering: dict, filt_datasets_pass: dict,
                          input_to_filtered: dict,
                          mmvec_subsets: dict) -> (dict, list):
    """
    :param i_datasets_folder:
    :param mmvec_pairs:
    :param force: Force the re-writing of scripts for all commands.
    :return:
    """
    common_datasets_pass = {}
    for pair, pair_datasets in mmvec_pairs.items():
        pair_filtering = mmvec_filtering[pair]
        common_datasets_pass[pair] = []
        data_dir_ = get_analysis_folder(i_datasets_folder,
                                        'mmvec/common/data/%s' % pair)
        meta_dir_ = get_analysis_folder(i_datasets_folder,
                                        'mmvec/common/metadata/%s' % pair)
        (omic1_, bool1), (omic2_, bool2) = pair_datasets
        if omic1_ not in input_to_filtered or omic2_ not in input_to_filtered:
            continue
        omic1 = input_to_filtered[omic1_]
        omic2 = input_to_filtered[omic2_]
        if (omic1, bool1) not in filt_datasets_pass or (
                omic2, bool2) not in filt_datasets_pass:
            continue
        for case_var, case_vals_list in mmvec_subsets.items():
            for case_vals in case_vals_list:
                case = get_case(case_vals, case_var)
                data_dir = data_dir_ + '/' + case
                meta_dir = meta_dir_ + '/' + case
                for preval_abund in sorted(pair_filtering):
                    preval_filt1, abund_filter1 = pair_filtering[preval_abund][
                        (omic1_, bool1)]
                    preval_filt2, abund_filter2 = pair_filtering[preval_abund][
                        (omic2_, bool2)]
                    if not filt_datasets_pass[(omic1, bool1)][(case,
                                                               preval_abund)]:
                        continue
                    if not filt_datasets_pass[(omic2, bool2)][(case,
                                                               preval_abund)]:
                        continue
                    filt1 = '_'.join([preval_filt1, abund_filter1])
                    filt2 = '_'.join([preval_filt2, abund_filter2])
                    tsv1, qza1, meta1, meta_pd1, sams1 = filt_datasets_pass[(
                        omic1, bool1)][(case, preval_abund)]
                    tsv2, qza2, meta2, meta_pd2, sams2 = filt_datasets_pass[(
                        omic2, bool2)][(case, preval_abund)]
                    common_sams = sorted(set(sams1) & set(sams2))
                    if len(common_sams) < 10:
                        print(
                            'Not enough samples: %s (%s) vs %s (%s) -> skipping'
                            % (omic1, filt1, omic2, filt2))
                        continue
                    meta_fp = '%s/meta_%s_%s_%s__%s_%s_%s__%s_%ss.tsv' % (
                        meta_dir, omic1, preval_filt1, abund_filter1, omic2,
                        preval_filt2, abund_filter2, pair, len(common_sams))
                    new_tsv1 = '%s/tab_%s_%s_%s__%s_%ss.tsv' % (
                        data_dir, omic1, preval_filt1, abund_filter1, pair,
                        len(common_sams))
                    new_qza1 = '%s.qza' % splitext(new_tsv1)[0]
                    new_tsv2 = '%s/tab_%s_%s_%s__%s_%ss.tsv' % (
                        data_dir, omic2, preval_filt2, abund_filter2, pair,
                        len(common_sams))
                    new_qza2 = '%s.qza' % splitext(new_tsv2)[0]
                    if isfile(meta_fp) and isfile(new_qza1) and isfile(
                            new_qza2):
                        common_datasets_pass[pair].append(meta_fp)
Example #10
def run_single_sourcetracking(odir: str, tsv: str, meta_pd: pd.DataFrame,
                              case_var: str, sourcetracking_params: dict,
                              method: str, imports: set,
                              sourcetracking_sourcesink: dict,
                              case_vals_list: list, cur_sh: str,
                              cur_import_sh: str, force: bool, filt: str,
                              cur_raref: str, fp: str, fa: str, n_nodes: str,
                              n_procs: str) -> list:

    cases = []
    remove = True
    qza = '%s.qza' % splitext(tsv)[0]
    with open(cur_sh, 'w') as cur_sh_o, open(cur_import_sh,
                                             'w') as cur_import_sh_o:
        for case_vals in case_vals_list:
            case = get_case(case_vals, '', case_var)
            for sourcesink_name, sourcesink_d in sourcetracking_sourcesink.items(
            ):
                column = sourcesink_d['column']
                sink = sourcesink_d['sink']
                sources = ['']
                if 'source' in sourcesink_d:
                    sources = sourcesink_d['source']
                new_meta_pd = get_new_meta_pd(meta_pd, case, case_var,
                                              case_vals)
                if column not in new_meta_pd.columns:
                    raise IOError('"%s" not in metadata...' % column)
                if sink not in set(new_meta_pd[column].unique()):
                    raise IOError(
                        'Sink "%s" not in metadata column "%s"' %
                        (sink, column))
                if sources != [''] and not set(sources).issubset(
                        set(new_meta_pd[column].unique())):
                    raise IOError(
                        'Not all sources "%s" are in metadata column "%s"' %
                        (sources, column))

                cur_rad = '%s/%s_%s%s/%s' % (odir, case.strip('_'), filt,
                                             cur_raref, sourcesink_name)
                if not isdir(cur_rad):
                    os.makedirs(cur_rad)

                replacements = {
                    sink:
                    sink.replace('/',
                                 '').replace('(',
                                             '').replace(')',
                                                         '').replace(' ', '_')
                }
                for source in sources:
                    replacements.update({
                        source:
                        source.replace('/', '').replace('(', '').replace(
                            ')', '').replace(' ', '_')
                    })

                sink = replacements[sink]
                sources = [replacements[source] for source in sources]

                folder = '%s/%s-%s' % (cur_rad, column, sink)
                if sources != ['']:
                    folder = '%s_%s' % (folder, '_'.join(sources))

                new_meta = '%s/meta.tsv' % cur_rad
                new_qza = '%s/tab.qza' % cur_rad
                new_tsv = '%s/tab.tsv' % cur_rad
                new_meta_pd = new_meta_pd[[column]].reset_index()
                new_meta_pd.replace({column: replacements}, inplace=True)
                new_meta_pd.to_csv(new_meta, index=False, sep='\t')

                loo = False
                missing = False
                folder_method = folder + '/' + method
                if method == 'q2':
                    for root, dirs, files in os.walk(folder_method):
                        if len(root.split(folder_method)[-1].split('/')) == 4:
                            print(method,
                                  root.split(folder_method)[-1].split('/'))
                            if 'predictions.tsv' not in files:
                                print('\n'.join(files))
                                missing = True
                    outs = folder_method + '/t0/r*/*/predictions.tsv'
                elif method == 'feast':
                    outs = folder_method + '/t0/out.r0*'
                elif method == 'sourcetracker':
                    if 'loo' in sourcetracking_params and sourcetracking_params[
                            'loo']:
                        loo = True
                        outs = folder_method + '/t0/loo/mixing_proportions.txt'
                    else:
                        outs = folder_method + '/t0/r0/mixing_proportions.txt'
                    for root, dirs, files in os.walk(folder_method):
                        if len(root.split(folder_method)[-1].split('/')) == 3:
                            print(method,
                                  root.split(folder_method)[-1].split('/'))
                            if 'mixing_proportions.txt' not in files:
                                print('\n'.join(files))
                                missing = True

                if force or not len(glob.glob(outs)) or missing:
                    write_sourcetracking(qza, new_qza, new_tsv, new_meta,
                                         method, fp, fa, cur_rad, column, sink,
                                         sources, sourcetracking_params, loo,
                                         n_nodes, n_procs, cur_sh_o,
                                         cur_import_sh_o, imports)
                    cur_sh_o.write('echo "sh %s/cmd_%s.sh"\n' %
                                   (folder_method, method))
                    cur_sh_o.write('sh %s/cmd_%s.sh\n\n\n' %
                                   (folder_method, method))
                    remove = False
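
The sourcetracking_sourcesink dict expected above, keyed by a name for each source/sink design; the 'source' key is optional (the column and values here are hypothetical).

sourcetracking_sourcesink_example = {
    'gut_from_environment': {
        'column': 'sample_type',          # metadata column holding sources and sinks
        'sink': 'gut',
        'source': ['soil', 'water']}}     # optional; omitted -> sources left unspecified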
Example #11
def run_nestedness(i_datasets_folder: str, betas: dict,
                   datasets_collapsed_map: dict, p_nestedness_groups: str,
                   datasets_rarefs: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str, jobs: bool,
                   chunkit: int) -> (dict, list, dict):

    job_folder2 = get_job_folder(i_datasets_folder, 'nestedness/chunks')

    nestedness_config = read_yaml_file(p_nestedness_groups)
    if 'soft' not in nestedness_config:
        print(
            'Must provide the path to the Nestedness software (containing bin/Autocorrelation.jar)'
        )
        return {}
    if nestedness_config['soft'].endswith('Autocorrelation.jar') and isfile(
            nestedness_config['soft']):
        binary = nestedness_config['soft']
    else:
        binary = '%s/bin/Autocorrelation.jar' % nestedness_config['soft']
        if not isfile(binary):
            print(
                'Must provide the path to the Nestedness software (containing bin/Autocorrelation.jar)'
            )
            return {}

    subsets, nodfs, colors, nulls, modes, params = get_nestedness_config(
        nestedness_config)

    nodfs_fps = {}
    all_sh_pbs = {}
    nestedness_res = {}
    for dat, rarefs_metrics_groups_metas_qzas_dms_trees in betas.items():
        if not split:
            out_sh = '%s/run_nestedness_%s_%s%s.sh' % (job_folder2, prjct_nm,
                                                       dat, filt_raref)

        stats_tax_dat, level = get_stats_tax_dat(dat, datasets_collapsed_map)

        nestedness_res[dat] = []
        for idx, metrics_groups_metas_qzas_dms_trees in enumerate(
                rarefs_metrics_groups_metas_qzas_dms_trees):
            nestedness_raref = {}
            cur_raref = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'nestedness/%s%s' % (dat, cur_raref))
            if split:
                out_sh = '%s/run_nestedness_%s_%s%s%s.sh' % (
                    job_folder2, prjct_nm, dat, cur_raref, filt_raref)
            for _, groups_metas_qzas_dms_trees in metrics_groups_metas_qzas_dms_trees.items(
            ):
                for group, metas_qzas_mat_qzas_trees in groups_metas_qzas_dms_trees.items(
                ):

                    meta, qza, mat_qza, tree = metas_qzas_mat_qzas_trees[0]
                    meta_pd = read_meta_pd(meta).set_index('sample_name')
                    cases_dict = check_metadata_cases_dict(
                        meta, meta_pd, dict(subsets), 'nestedness')
                    for case_var, case_vals_list in cases_dict.items():
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            cur_sh = '%s/run_nestedness_%s%s_%s_%s%s.sh' % (
                                job_folder2, dat, cur_raref, group, case,
                                filt_raref)
                            cur_sh = cur_sh.replace(' ', '-')
                            # print("case", case)
                            all_sh_pbs.setdefault((dat, out_sh),
                                                  []).append(cur_sh)
                            res, group_case_nodfs = run_single_nestedness(
                                odir, cur_raref, level, group, meta_pd, nodfs,
                                nulls, modes, cur_sh, qza, case, case_var,
                                case_vals, binary, params, force)
                            nodfs_fps.setdefault(stats_tax_dat,
                                                 []).extend(group_case_nodfs)
                            nestedness_raref[(group, case)] = res
                break
            nestedness_res[dat].append(nestedness_raref)
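
A sketch of the nestedness configuration read above; only the 'soft' key is checked explicitly in this snippet, and the remaining fields are parsed by get_nestedness_config, so their exact key names are not shown here.

nestedness_config_example = {
    # either the software folder (bin/Autocorrelation.jar is appended to it)
    # or the path to the Autocorrelation.jar file itself
    'soft': '/path/to/nestedness_software',
}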
Example #12
def run_single_doc(i_dataset_folder: str, odir: str, tsv: str,
                   meta_pd: pd.DataFrame, case_var: str, doc_params: dict,
                   case_vals_list: list, cur_sh: str, cur_import_sh: str,
                   force: bool, filt: str, cur_raref: str, fp: str, fa: str,
                   n_nodes: str, n_procs: str, dat_phates: dict,
                   doc_phate: bool, need_to_run_phate: list,
                   need_to_run_less_phate: list) -> list:
    remove = True
    qza = '%s.qza' % splitext(tsv)[0]
    cases = []
    with open(cur_sh, 'w') as cur_sh_o, open(cur_import_sh,
                                             'w') as cur_import_sh_o:
        for case_vals in case_vals_list:
            token = ''.join([str(random.choice(range(100))) for x in range(3)])
            case = get_case(case_vals, '', case_var)
            cur_rad = '%s/%s%s/%s' % (odir, case.strip('_'), cur_raref, filt)
            cases.append(cur_rad)
            cur_rad_r = '%s/R' % cur_rad
            cur_rad_token = '%s/tmp/%s' % (i_dataset_folder, token)
            if not isdir(cur_rad_r):
                os.makedirs(cur_rad_r)
            new_meta = '%s/meta.tsv' % cur_rad
            new_qza = '%s/tab.qza' % cur_rad
            new_tsv = '%s/tab.tsv' % cur_rad
            new_tsv_token = '%s/tab.tsv' % cur_rad_token
            if force or not isfile('%s/DO.tsv' % cur_rad):
                new_meta_pd = get_new_meta_pd(meta_pd, case, case_var,
                                              case_vals)
                new_meta_pd.reset_index().to_csv(new_meta,
                                                 index=False,
                                                 sep='\t')
                write_doc(qza, fp, fa, new_meta, new_qza, new_tsv, cur_rad,
                          new_tsv_token, cur_rad_token, n_nodes, n_procs,
                          doc_params, cur_sh_o, cur_import_sh_o)
                remove = False

            # run DOC on each cluster from PHATE
            if doc_phate and filt in dat_phates and case_var in dat_phates[
                    filt] and case in dat_phates[filt][case_var]:
                # get the clusters
                xphate_tsv = dat_phates[filt][case_var][case]
                if not isfile(xphate_tsv):
                    if not need_to_run_phate:
                        print('Unable to run DOC on a set of PHATE clusters:\n'
                              '(Be sure to run PHATE first...)')
                    need_to_run_phate.append(
                        xphate_tsv.replace('%s/qiime/phate' % i_dataset_folder,
                                           '...'))
                    continue
                xphate_pd = pd.read_csv(xphate_tsv,
                                        header=0,
                                        sep='\t',
                                        dtype={'sample_name': str})
                xphate_pd = xphate_pd.loc[xphate_pd['variable'].str.contains(
                    'cluster_k')]
                if len(xphate_pd[['knn', 'decay', 't']].drop_duplicates()) > 5:
                    if not need_to_run_less_phate:
                        print(
                            'Warning: PHATE has been run for multiple parameter combinations:\n'
                            ' --> It may be unwise to let DOC run on every combination...\n'
                            ' --> Be sure to run PHATE using few, desired sets of parameters!'
                        )
                    need_to_run_less_phate.append(
                        xphate_tsv.replace('%s/qiime/phate' % i_dataset_folder,
                                           '...'))
                cols = [
                    'sample_name', 'knn', 'decay', 't', 'variable', 'factor'
                ]
                xphate_clusters = dict(xphate_pd[cols].groupby(
                    ['knn', 'decay', 't', 'variable',
                     'factor']).apply(func=lambda x: x.sample_name.tolist()))
                new_meta_pd = get_new_meta_pd(meta_pd, case, case_var,
                                              case_vals)
                # repeat DOC command for the clusters
                cur_rad_phate = '%s/phate' % cur_rad
                if not isdir(cur_rad_phate):
                    os.makedirs(cur_rad_phate)
                doc_phate_processed = []
                for (knn, decay, t, k,
                     cluster), samples_phate in xphate_clusters.items():
                    if len(samples_phate) < 50:
                        doc_phate_processed.append([
                            knn, decay, t, k, cluster,
                            len(samples_phate), 'TOO FEW'
                        ])
                        continue
                    token = ''.join(
                        [str(random.choice(range(100))) for x in range(3)])
                    cur_rad_phate_clust = '%s/%s_%s_%s_k%s_clust%s' % (
                        cur_rad_phate, knn, decay, t, k, cluster)
                    doc_phate_processed.append([
                        knn, decay, t, k, cluster,
                        len(samples_phate), cur_rad_phate_clust
                    ])
                    cases.append(cur_rad_phate_clust)
                    cur_rad_phate_clust_r = '%s/R' % cur_rad_phate_clust
                    cur_rad_token = '%s/tmp/%s' % (i_dataset_folder, token)
                    if not isdir(cur_rad_phate_clust_r):
                        os.makedirs(cur_rad_phate_clust_r)
                    new_meta = '%s/meta.tsv' % cur_rad_phate_clust
                    new_qza = '%s/tab.qza' % cur_rad_phate_clust
                    new_tsv = '%s/tab.tsv' % cur_rad_phate_clust
                    new_tsv_token = '%s/tab.tsv' % cur_rad_phate_clust
                    if force or not isfile('%s/DO.tsv' % cur_rad_phate_clust):
                        new_meta_pd_phate = new_meta_pd.loc[
                            samples_phate, :].copy()
                        new_meta_pd_phate.reset_index().to_csv(new_meta,
                                                               index=False,
                                                               sep='\t')
                        write_doc(qza, fp, fa, new_meta, new_qza, new_tsv,
                                  cur_rad_phate_clust, new_tsv_token,
                                  cur_rad_token, n_nodes, n_procs, doc_params,
                                  cur_sh_o, cur_import_sh_o)
                        remove = False
                phate_doc_out = '%s/phate_processed.txt' % cur_rad_phate
                with open(phate_doc_out, 'w') as o:
                    o.write('knn\tdecay\tt\tk\tcluster\tsamples\tfate\n')
                    for doc_phate_proc in doc_phate_processed:
                        o.write('%s\n' % '\t'.join(map(str, doc_phate_proc)))
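
A sketch of the long-format "*_xphate.tsv" table that run_single_doc reads back to repeat DOC per PHATE cluster, built here with pandas only to show the expected columns; all values are hypothetical.

xphate_example = pd.DataFrame({
    'sample_name': ['sample.1', 'sample.2'],
    'knn': [5, 5],
    'decay': [15, 15],
    't': [5, 5],
    'variable': ['cluster_k3', 'cluster_k3'],  # rows kept if 'variable' contains 'cluster_k'
    'factor': [1, 2]})                         # cluster membership per sample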
def run_permanova(i_datasets_folder: str, betas: dict,
                  main_testing_groups: tuple, p_perm_tests_min: int,
                  p_beta_type: tuple, datasets_rarefs: dict,
                  p_perm_groups: str, force: bool, prjct_nm: str,
                  qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                  split: bool, run_params: dict, filt_raref: str, jobs: bool,
                  chunkit: int) -> dict:
    """
    Run beta-group-significance: Beta diversity group significance.
    https://docs.qiime2.org/2019.10/plugins/available/diversity/beta-group-significance/
    Main per-dataset looper for the PERMANOVA tests on beta diversity matrices.

    :param i_datasets_folder: Path to the folder containing the data/metadata subfolders.
    :param datasets: list of datasets.
    :param betas: beta diversity matrices.
    :param main_testing_groups: groups to test.
    :param p_perm_groups: groups to subset.
    :param force: Force the re-writing of scripts for all commands.
    :param prjct_nm: Nick name for your project.
    :param qiime_env: qiime2-xxxx.xx conda environment.
    :param chmod: whether to change permission of output files (defalt: 775).
    """
    permanovas = {}
    job_folder2 = get_job_folder(i_datasets_folder, 'permanova/chunks')
    main_cases_dict = get_main_cases_dict(p_perm_groups)

    npermutations = 999

    metric_check = set()
    all_sh_pbs = {}
    first_print = 0
    for dat, metric_groups_metas_qzas_dms_trees_ in betas.items():
        permanovas[dat] = []
        if not split:
            out_sh = '%s/run_beta_group_significance_%s_%s%s.sh' % (
                job_folder2, prjct_nm, dat, filt_raref)
        for idx, metric_groups_metas_qzas_dms_trees in enumerate(
                metric_groups_metas_qzas_dms_trees_):
            cur_depth = datasets_rarefs[dat][idx]
            odir = get_analysis_folder(i_datasets_folder,
                                       'permanova/%s%s' % (dat, cur_depth))
            for metric, subset_files in metric_groups_metas_qzas_dms_trees.items(
            ):
                permanovas.setdefault(dat, []).append(metric)
                if split:
                    out_sh = '%s/run_beta_group_significance_%s_%s_%s%s.sh' % (
                        job_folder2, prjct_nm, dat, metric, filt_raref)
                for subset, metas_qzas_mat_qzas_trees in subset_files.items():
                    (meta, qza, mat_qza, tree) = metas_qzas_mat_qzas_trees[0]
                    if not isfile(mat_qza):
                        if not first_print:
                            print(
                                'Beta diversity distance matrices must already be generated to automate PERMANOVA\n'
                                '\t(re-run this after steps "2_run_beta.sh" and "2x_run_beta_export.pbs" are done)'
                            )
                            first_print += 1
                        continue
                    if (dat, subset) not in metric_check:
                        meta_pd = read_meta_pd(meta)
                        meta_pd = meta_pd.set_index('sample_name')
                        cases_dict = check_metadata_cases_dict(
                            meta, meta_pd, dict(main_cases_dict), 'PERMANOVA')
                        testing_groups = check_metadata_testing_groups(
                            meta, meta_pd, main_testing_groups,
                            p_perm_tests_min, 'PERMANOVA')
                        metric_check.add((dat, subset))

                    for case_var, case_vals_list in cases_dict.items():
                        testing_groups_case_var = list(
                            set(testing_groups + [case_var]))
                        for case_vals in case_vals_list:
                            case = get_case(case_vals,
                                            case_var).replace(' ', '_')
                            for testing_group in testing_groups_case_var:
                                if testing_group == 'ALL':
                                    continue
                                cur_sh = '%s/run_beta_group_significance_%s%s_%s_%s_%s_%s%s.sh' % (
                                    job_folder2, dat, cur_depth, metric,
                                    subset, case, testing_group, filt_raref)
                                cur_sh = cur_sh.replace(' ', '-')
                                all_sh_pbs.setdefault((dat, out_sh),
                                                      []).append(cur_sh)
                                run_single_perm(odir, subset, meta_pd, cur_sh,
                                                metric, case, testing_group,
                                                p_perm_tests_min, p_beta_type,
                                                qza, mat_qza, case_var,
                                                case_vals, npermutations,
                                                force)
def run_mantel(i_datasets_folder: str, datasets_filt: dict, p_mantel: str,
               betas: dict, force: bool, prjct_nm: str, qiime_env: str,
               chmod: str, noloc: bool, slurm: bool, split: bool,
               run_params: dict, filt_raref: str, filt_only: bool,
               eval_depths: dict, jobs: bool, chunkit: int) -> None:
    """
    """
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        mantel_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths, key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                mantel_pairs['%s_%s' % (n0, n1)] = [x, y]
        mantel_subsets = {'ALL': [[]]}
    else:
        mantel_pairs, mantel_subsets = get_procrustes_mantel_dicts(p_mantel)

    get_job_folder(i_datasets_folder, 'mantel%s' % evaluation)

    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in mantel_pairs.items():

        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt, filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt, filt_only)

        if check_dat_exists(betas, dat1, missing_dats) or check_dat_exists(
                betas, dat2, missing_dats):
            continue

        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]

        job_folder2 = get_job_folder(
            i_datasets_folder,
            'mantel%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_mantel_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)

        for metric, groups_metas_qzas_dms_trees1 in metrics_groups_metas_qzas_dms_trees1.items(
        ):
            if split:
                out_sh = '%s/run_mantel_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = metrics_groups_metas_qzas_dms_trees2[
                metric]
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(*[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_

                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]

                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(dat1, dm1, meta1, raref1,
                                                 metric, i_datasets_folder,
                                                 skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(dat2, dm2, meta2, raref2,
                                                 metric, i_datasets_folder,
                                                 skip)
                if skip:
                    print(
                        '[Mantels] One desired rarefaction depth not run (pair %s)'
                        % pair)
                    continue

                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(
                    set(meta_pd1.sample_name) & set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue

                meta_pd = meta_pd1.loc[meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(meta1, meta_pd,
                                                       dict(mantel_subsets),
                                                       'mantel')
                odir = get_analysis_folder(
                    i_datasets_folder, 'mantel%s/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder, 'mantel%s/chunks/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))

                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(case_vals, case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_mantel%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((pair, out_sh),
                                              []).append(cur_sh)

                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_, cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_, cur)
                        mantel_out = '%s/mantel%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel('mantel', odir, dm1, dm2,
                                                     meta_pd, dm_out1, dm_out2,
                                                     mantel_out, cur_sh, cur,
                                                     case_var, case_vals,
                                                     force)

    job_folder = get_job_folder(i_datasets_folder, 'mantel%s' % evaluation)
    main_sh = write_main_sh(
        job_folder, '4_run_mantel_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs, '%s.mntl%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit)
    if main_sh:
        if p_mantel and p_mantel != 1:
            if p_mantel.startswith('/panfs'):
                p_mantel = p_mantel.replace(os.getcwd(), '')
            print('# Mantels (pairs and samples subsets config in %s)' %
                  p_mantel)
        else:
            print('# Mantels')
        print_message('', 'sh', main_sh, jobs)
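
A small sketch of how eval_depths is turned into consecutive-depth pairs in the evaluation branches of run_mantel and run_procrustes; dataset names and depths are hypothetical.

eval_depths_example = {
    'datasetA': ['datasetA_raref_1000',
                 'datasetA_raref_5000',
                 'datasetA_raref_10000']}
# sorted by trailing depth, then paired consecutively:
# {'1000_5000': ['datasetA_raref_1000', 'datasetA_raref_5000'],
#  '5000_10000': ['datasetA_raref_5000', 'datasetA_raref_10000']}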
def run_procrustes(i_datasets_folder: str, datasets_filt: dict,
                   p_procrustes: str, betas: dict, force: bool, prjct_nm: str,
                   qiime_env: str, chmod: str, noloc: bool, slurm: bool,
                   split: bool, run_params: dict, filt_raref: str,
                   filt_only: bool, eval_depths: dict, jobs: bool,
                   chunkit: int) -> None:
    """
    """
    evaluation = ''
    if eval_depths:
        evaluation = '_eval'
        procrustes_pairs = {}
        for dat, depths in eval_depths.items():
            sorted_depths = sorted(depths, key=lambda x: int(x.split('_')[-1]))
            for idx, x in enumerate(sorted_depths[:-1]):
                y = sorted_depths[(idx + 1)]
                n0 = x.split('_')[-1]
                n1 = y.split('_')[-1]
                procrustes_pairs['%s_%s' % (n0, n1)] = [x, y]
        procrustes_subsets = {'ALL': [[]]}
    else:
        procrustes_pairs, procrustes_subsets = get_procrustes_mantel_dicts(
            p_procrustes)
    get_job_folder(i_datasets_folder, 'procrustes%s' % evaluation)
    dms_tab = []
    all_sh_pbs = {}
    missing_dats = set()
    for pair, (dat1_, dat2_) in procrustes_pairs.items():

        dat1, raref1 = get_dat_idx(dat1_, evaluation, datasets_filt, filt_only)
        dat2, raref2 = get_dat_idx(dat2_, evaluation, datasets_filt, filt_only)

        if check_dat_exists(betas, dat1, missing_dats) or check_dat_exists(
                betas, dat2, missing_dats):
            continue

        if evaluation:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2]
        else:
            metrics_groups_metas_qzas_dms_trees1 = betas[dat1][0]
            metrics_groups_metas_qzas_dms_trees2 = betas[dat2][0]

        job_folder2 = get_job_folder(
            i_datasets_folder,
            'procrustes%s/chunks/%s%s' % (evaluation, pair, filt_raref))
        if not split:
            out_sh = '%s/run_procrustes_%s%s_%s%s.sh' % (
                job_folder2, prjct_nm, evaluation, pair, filt_raref)

        for metric, groups_metas_qzas_dms_trees1 in metrics_groups_metas_qzas_dms_trees1.items(
        ):
            if split:
                out_sh = '%s/run_procrustes_%s%s_%s_%s%s.sh' % (
                    job_folder2, prjct_nm, evaluation, pair, metric,
                    filt_raref)
            if metric not in metrics_groups_metas_qzas_dms_trees2:
                continue
            groups_metas_qzas_dms_trees2 = metrics_groups_metas_qzas_dms_trees2[
                metric]
            groups1 = sorted(groups_metas_qzas_dms_trees1.keys())
            groups2 = sorted(groups_metas_qzas_dms_trees2.keys())
            for (group1_, group2_) in itertools.product(*[groups1, groups2]):
                if group1_ == '':
                    group1 = 'full'
                else:
                    group1 = group1_
                if group2_ == '':
                    group2 = 'full'
                else:
                    group2 = group2_

                meta1, qza1, dm1, tree1 = groups_metas_qzas_dms_trees1[
                    group1_][0]
                meta2, qza2, dm2, tree2 = groups_metas_qzas_dms_trees2[
                    group2_][0]

                skip = 0
                if not evaluation:
                    if '__raref' in dat1_:
                        dm1, meta1 = get_dm_meta(dat1, dm1, meta1, raref1,
                                                 metric, i_datasets_folder,
                                                 skip)
                    if '__raref' in dat2_:
                        dm2, meta2 = get_dm_meta(dat2, dm2, meta2, raref2,
                                                 metric, i_datasets_folder,
                                                 skip)
                if skip:
                    print(
                        '[Procrustes] One desired rarefaction depth not run (pair %s)'
                        % pair)
                    continue

                meta_pd1 = read_meta_pd(meta1)
                meta_pd2 = read_meta_pd(meta2)
                common_sams = list(
                    set(meta_pd1.sample_name) & set(meta_pd2.sample_name))
                if len(common_sams) < 3:
                    continue

                meta_pd = meta_pd1.loc[meta_pd1.sample_name.isin(common_sams)]
                cases_dict = check_metadata_cases_dict(
                    meta1, meta_pd, dict(procrustes_subsets), 'procrustes')
                odir = get_analysis_folder(
                    i_datasets_folder, 'procrustes%s/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))
                job_folder3 = get_job_folder(
                    i_datasets_folder, 'procrustes%s/chunks/%s%s/%s_vs_%s' %
                    (evaluation, pair, filt_raref, group1, group2))
                for case_var, case_vals_list in cases_dict.items():
                    for case_vals in case_vals_list:
                        case_ = get_case(case_vals, case_var).replace(' ', '_')
                        cur = '%s__%s' % (metric, case_)
                        cur_sh = '%s/run_procrustes%s_%s%s.sh' % (
                            job_folder3, evaluation, cur, filt_raref)
                        cur_sh = cur_sh.replace(' ', '-')
                        all_sh_pbs.setdefault((pair, out_sh),
                                              []).append(cur_sh)

                        dm_out1 = '%s/dm_%s__%s_DM.qza' % (odir, dat1_, cur)
                        dm_out2 = '%s/dm_%s__%s_DM.qza' % (odir, dat2_, cur)
                        dm_out1_tsv = '%s.tsv' % splitext(dm_out1)[0]
                        dm_out2_tsv = '%s.tsv' % splitext(dm_out2)[0]
                        biplot = '%s/procrustes%s_%s__%s__%s.qzv' % (
                            odir, evaluation, dat1_, dat2_, cur)
                        run_single_procrustes_mantel('procrustes', odir, dm1,
                                                     dm2, meta_pd, dm_out1,
                                                     dm_out2, biplot, cur_sh,
                                                     cur, case_var, case_vals,
                                                     force)
                        dms_tab.append([
                            pair, dat1_, dat2_, group1, group2, case_, metric,
                            dm_out1_tsv, dm_out2_tsv
                        ])

    job_folder = get_job_folder(i_datasets_folder, 'procrustes%s' % evaluation)
    main_sh = write_main_sh(
        job_folder,
        '4_run_procrustes_%s%s%s' % (prjct_nm, evaluation, filt_raref),
        all_sh_pbs, '%s.prcst%s%s' % (prjct_nm, evaluation, filt_raref),
        run_params["time"], run_params["n_nodes"], run_params["n_procs"],
        run_params["mem_num"], run_params["mem_dim"], qiime_env, chmod, noloc,
        slurm, jobs, chunkit)
    if main_sh:
        if p_procrustes and p_procrustes != 1:
            if p_procrustes.startswith('/panfs'):
                p_procrustes = p_procrustes.replace(os.getcwd(), '')
            print('# Procrustes (pairs and samples subsets config in %s)' %
                  p_procrustes)
        else:
            print('# Procrustes')
        print_message('', 'sh', main_sh, jobs)

    # column order must match the rows appended to dms_tab above
    dms_tab_pd = pd.DataFrame(dms_tab,
                              columns=[
                                  'pair',
                                  'dat1',
                                  'dat2',
                                  'group1',
                                  'group2',
                                  'case',
                                  'metric',
                                  'dm_out1',
                                  'dm_out2',
                              ])

    odir = get_analysis_folder(i_datasets_folder,
                               'procrustes%s/R' % evaluation)
    out_Rs = glob.glob('%s/pairs_proscrustes_results%s%s*.tsv' %
                       (odir, evaluation, filt_raref))
    if len(out_Rs):
        done_R = pd.concat([pd.read_table(x, sep=' ') for x in out_Rs])
        dms_tab_pd = dms_tab_pd.loc[~dms_tab_pd[['dm_out1', 'dm_out2']].sum(1).
                                    isin(done_R[['f1', 'f2']].sum(1))]

    if dms_tab_pd.shape[0]:
        fp_num = 0
        if len(out_Rs):
            last = sorted(
                out_Rs, key=lambda fp: int(fp.split('.tsv')[0].split('_')[-1]))
            fp_num = int(last[-1].split('.tsv')[0].split('_')[-1]) + 1

        dms_tab_fp = '%s/pairs%s%s_%s.tsv' % (odir, evaluation, filt_raref,
                                              fp_num)
        dms_tab_pd.to_csv(dms_tab_fp, index=False, sep='\t')
        out_R = '%s/pairs_proscrustes_results%s%s_%s.tsv' % (
            odir, evaluation, filt_raref, fp_num)
        job_folder = get_job_folder(i_datasets_folder, 'procrustes/R')
        R_script = '%s/4_run_procrustes_%s%s.R' % (job_folder, prjct_nm,
                                                   filt_raref)
        with open(R_script, 'w') as o:
            o.write("library(vegan)\n")
            o.write("dms_files <- read.table('%s', h=T)\n" % dms_tab_fp)
            o.write(
                "cols <- c('pair', 'd1', 'd2', 'g1', 'g2', 'case', 'metric', 'f1', 'f2', 'samples', 'M2', 'p-value')\n"
            )
            o.write(
                "res <- setNames(data.frame(matrix(ncol = 12, nrow = 0)), cols)\n"
            )
            o.write("for (i in seq(1, dim(dms_files)[1])) {\n")
            o.write("    row <- as.vector(unlist(dms_files[i,]))\n")
            o.write("    pair <- row[1]\n")
            o.write("    d1 <- row[2]\n")
            o.write("    d2 <- row[3]\n")
            o.write("    group1 <- row[4]\n")
            o.write("    group2 <- row[5]\n")
            o.write("    case <- row[6]\n")
            o.write("    metric <- row[7]\n")
            o.write("    f1 <- row[8]\n")
            o.write("    f2 <- row[9]\n")
            o.write("    if (sum(file.exists(f1, f2)) == 2) {\n")
            o.write(
                "        filin_tsv_pd1 <- read.csv(f1, header = TRUE, check.names=FALSE,\n"
            )
            o.write(
                "                                  row.names = 1, colClasses = 'character', sep = '\\t')\n"
            )
            o.write(
                "        filin_tsv_pd2 <- read.csv(f2, header = TRUE, check.names=FALSE,\n"
            )
            o.write(
                "                                  row.names = 1, colClasses = 'character', sep = '\\t')\n"
            )
            o.write("        filin_tsv_pd1 <- data.matrix(filin_tsv_pd1)\n")
            o.write("        filin_tsv_pd2 <- data.matrix(filin_tsv_pd2)\n")
            o.write(
                "        filin_tsv_pd1 <- filin_tsv_pd1[rownames(filin_tsv_pd2), rownames(filin_tsv_pd2)]\n"
            )
            o.write(
                "        # procrustes12 <- procrustes(filin_tsv_pd1, filin_tsv_pd2, kind=2, permutations=999)\n"
            )
            o.write(
                "        prtst <- protest(filin_tsv_pd1, filin_tsv_pd2, permutations = 999)\n"
            )
            o.write("        n <- dim(filin_tsv_pd1)[1]\n")
            o.write(
                "        res[i,] <- c(pair, d1, d2, group1, group2, case, metric, f1, f2, n, prtst$ss, prtst$signif)\n"
            )
            o.write("    }\n")
            o.write("}\n")
            o.write("write.table(x = res, file = '%s')\n" % out_R)

        out_sh = '%s/4_run_procrustes_%s%s_R%s.sh' % (job_folder, prjct_nm,
                                                      evaluation, filt_raref)
        out_pbs = '%s.pbs' % splitext(out_sh)[0]
        with open(out_sh, 'w') as o:
            o.write('R -f %s --vanilla\n' % R_script)

        run_xpbs(
            out_sh, out_pbs,
            '%s.prcrt%s.R%s' % (prjct_nm, evaluation, filt_raref), 'renv',
            run_params["time"], run_params["n_nodes"], run_params["n_procs"],
            run_params["mem_num"], run_params["mem_dim"], chmod, 1,
            '# Procrustes for stats in R (pairs and samples subsets config in %s)'
            % p_procrustes, None, False, jobs)