def fastspar_correlation(table: Table,
                         verbose: bool = False,
                         calc_pvalues=False,
                         bootstraps=1000,
                         nprocs=1,
                         p_adjust_method='fdr_bh') -> pd.DataFrame:
    """Run FastSpar on ``table`` and return its correlations in long form.

    The table is written as a TSV into a temporary directory, FastSpar is run
    on it through ``run_fastspar``, and the resulting correlation matrix is
    converted with ``df_to_correls``.  When ``calc_pvalues`` is true, bootstrap
    count tables are generated, FastSpar is run on each of them, and
    ``fastspar_pvalues`` is used to derive p-values, which are then adjusted
    with ``p_adjust``.

    Args:
        table: Count table; must provide ``to_dataframe()`` returning a
            sparse-backed pandas DataFrame.
        verbose: If true, the external tools inherit this process's stdout;
            otherwise their stdout is discarded.
        calc_pvalues: If true, add ``p`` and ``p_adjusted`` columns.
        bootstraps: Number of bootstrap tables passed to
            ``fastspar_bootstrap`` / ``fastspar_pvalues``.
        nprocs: Thread count handed to the FastSpar tools and number of
            bootstrap runs executed concurrently.
        p_adjust_method: Method name forwarded to ``p_adjust``.

    Returns:
        The frame produced by ``df_to_correls`` (plus the p-value columns when
        requested), with each index entry replaced by its sorted pair.
    """
    with tempfile.TemporaryDirectory(prefix='fastspar') as temp:
        otu_path = path.join(temp, 'otu_table.tsv')
        # Joined once: nesting path.join(temp, path.join(temp, ...)) would
        # double the directory if ``temp`` were ever a relative path.
        correl_path = path.join(temp, 'correl_table.tsv')
        covar_path = path.join(temp, 'covar_table.tsv')

        # DataFrame.to_dense() is gone from current pandas; the sparse accessor
        # is the replacement. See:
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sparse.to_dense.html
        table.to_dataframe().sparse.to_dense().to_csv(otu_path,
                                                      sep='\t',
                                                      index_label='#OTU ID')
        stdout = None if verbose else subprocess.DEVNULL

        run_fastspar(otu_path, correl_path, covar_path, stdout, nprocs)
        cor = pd.read_csv(correl_path, sep='\t', index_col=0)
        correls = df_to_correls(cor)

        if calc_pvalues:
            subprocess.run([
                'fastspar_bootstrap', '-c', otu_path,
                '-n', str(bootstraps),
                '-p', path.join(temp, 'boot'),
                '-t', str(nprocs)
            ],
                           stdout=stdout)

            # Infer correlations for every bootstrap table, ``nprocs`` at a
            # time.  Output names are built from the file's basename so that a
            # temp directory whose own path contains "boot" is left intact
            # (str.replace on the full path would rewrite it too).
            with ThreadPoolExecutor(max_workers=nprocs) as executor:
                futures = []
                for boot_file in glob(path.join(temp, 'boot*')):
                    boot_name = path.basename(boot_file)
                    # Same positional form as the main run above; one thread
                    # per job because the pool already supplies parallelism.
                    futures.append(
                        executor.submit(run_fastspar, boot_file,
                                        path.join(temp, 'cor_' + boot_name),
                                        path.join(temp, 'cov_' + boot_name),
                                        stdout, 1))
            # Re-raise anything a worker raised instead of silently dropping it.
            for future in futures:
                future.result()

            # Calculate p-values for the correlation table.
            pvalues_path = path.join(temp, 'pvalues.tsv')
            subprocess.run([
                'fastspar_pvalues', '-c', otu_path,
                '-r', correl_path,
                '-p', path.join(temp, 'cor_boot'),
                '-t', str(nprocs),
                '-n', str(bootstraps),
                '-o', pvalues_path
            ],
                           stdout=stdout)
            pvals = pd.read_csv(pvalues_path, sep='\t', index_col=0)
            pvals = df_to_correls(pvals, col_label='p')
            correls = pd.concat([correls, pvals], axis=1, join='inner')
            correls['p_adjusted'] = p_adjust(correls.p, p_adjust_method)

        # Order each pair so (a, b) and (b, a) share a single index key.
        correls.index = pd.MultiIndex.from_tuples(
            [tuple(sorted(pair)) for pair in correls.index])
        return correls
# Example #2
0
def fastspar_correlation(table: Table,
                         verbose: bool = False,
                         nprocs=1) -> pd.DataFrame:
    """Run the ``fastspar`` executable on ``table`` and return its correlations.

    The table is written as a TSV into a temporary directory, ``fastspar`` is
    invoked on it, and the correlation matrix it writes is read back and
    converted with ``df_to_correls``.

    Args:
        table: Count table; must provide ``to_dataframe()`` returning a
            sparse-backed pandas DataFrame.
        verbose: If true, ``fastspar`` inherits this process's stdout;
            otherwise its stdout is discarded.
        nprocs: Value passed to ``fastspar -t``.

    Returns:
        Whatever ``df_to_correls`` produces for the correlation matrix.
    """
    # TODO: multiprocess support
    with tempfile.TemporaryDirectory(prefix='fastspar') as temp:
        otu_path = path.join(temp, 'otu_table.tsv')
        # Joined once: nesting path.join(temp, path.join(temp, ...)) would
        # double the directory if ``temp`` were ever a relative path.
        correl_path = path.join(temp, 'correl_table.tsv')
        covar_path = path.join(temp, 'covar_table.tsv')

        # DataFrame.to_dense() is gone from current pandas and raises
        # AttributeError; the sparse accessor is the replacement.
        table.to_dataframe().sparse.to_dense().to_csv(otu_path,
                                                      sep='\t',
                                                      index_label='#OTU ID')
        stdout = None if verbose else subprocess.DEVNULL
        subprocess.run([
            'fastspar', '-c', otu_path,
            '-r', correl_path,
            '-a', covar_path,
            '-t', str(nprocs)
        ],
                       stdout=stdout)
        cor = pd.read_table(correl_path, index_col=0)
        return df_to_correls(cor)
# Example #3
0
def fastspar_correlation_permutation(table: Table,
                                     verbose: bool = False,
                                     nprocs=1,
                                     bootstraps=1000) -> pd.DataFrame:
    """Run FastSpar with bootstrap-based p-values on ``table``.

    Steps, all inside a temporary directory:

    1. write the table as a TSV and run ``fastspar`` on it;
    2. generate ``bootstraps`` resampled tables with ``fastspar_bootstrap``;
    3. run ``fastspar`` on every resampled table via GNU ``parallel``;
    4. compute p-values with ``fastspar_exactpvalues``.

    Args:
        table: Count table; must provide ``to_dataframe()`` returning a
            sparse-backed pandas DataFrame.
        verbose: If true, the external tools inherit this process's stdout;
            otherwise their stdout is discarded.
        nprocs: Thread count passed to the FastSpar tools and job count
            passed to ``parallel -j``.
        bootstraps: Number of bootstrap tables to generate and evaluate.

    Returns:
        Inner join (column-wise) of ``df_to_correls`` applied to the
        correlation matrix and to the p-value matrix (column label
        ``'pvalue'``).
    """
    # TODO: multiprocess support
    with tempfile.TemporaryDirectory(prefix='fastspar') as temp:
        otu_path = path.join(temp, 'otu_table.tsv')
        # Joined once: nesting path.join(temp, path.join(temp, ...)) would
        # double the directory if ``temp`` were ever a relative path.
        correl_path = path.join(temp, 'correl_table.tsv')
        covar_path = path.join(temp, 'covar_table.tsv')
        pvalues_path = path.join(temp, 'pvalues.tsv')

        # DataFrame.to_dense() is gone from current pandas and raises
        # AttributeError; the sparse accessor is the replacement.
        table.to_dataframe().sparse.to_dense().to_csv(otu_path,
                                                      sep='\t',
                                                      index_label='#OTU ID')
        stdout = None if verbose else subprocess.DEVNULL

        # Generate the correlation table (-r) to compare the bootstraps against.
        subprocess.run([
            'fastspar', '-c', otu_path,
            '-r', correl_path,
            '-a', covar_path,
            '-t', str(nprocs)
        ],
                       stdout=stdout)

        # Generate bootstrap tables whose file names start with "boot".
        subprocess.run([
            'fastspar_bootstrap', '-c', otu_path,
            '-n', str(bootstraps),
            '-p', path.join(temp, 'boot'),
            '-t', str(nprocs)
        ],
                       stdout=stdout)

        # Infer correlations for each bootstrap table, ``nprocs`` jobs at a
        # time.  "{}" is the input path and "{/}" its basename, both expanded
        # by GNU parallel.
        # TODO specify number of dedicated processes
        subprocess.run([
            'parallel', '-j', str(nprocs),
            'fastspar', '-c', '{}',
            '-r', path.join(temp, 'cor_{/}'),
            '-a', path.join(temp, 'cov_{/}'),
            '-i', str(5), ':::'
        ] + glob(path.join(temp, 'boot*')),
                       stdout=stdout)

        # Calculate p-values for the correlation table.
        subprocess.run([
            'fastspar_exactpvalues', '-c', otu_path,
            '-r', correl_path,
            '-p', path.join(temp, 'cor_boot'),
            '-t', str(nprocs),
            '-n', str(bootstraps),
            '-o', pvalues_path
        ],
                       stdout=stdout)

        cor = pd.read_table(correl_path, index_col=0)
        pvals = pd.read_table(pvalues_path, index_col=0)
        return pd.concat(
            [df_to_correls(cor),
             df_to_correls(pvals, col_label='pvalue')],
            axis=1,
            join='inner')