def fastspar_correlation(table: Table, verbose: bool = False,
                         calc_pvalues=False, bootstraps=1000, nprocs=1,
                         p_adjust_method='fdr_bh') -> pd.DataFrame:
    """Correlate the features of ``table`` using the FastSpar command line tools.

    The table is written as a TSV into a temporary directory, ``fastspar`` is
    run on it (through ``run_fastspar``), and the resulting correlation matrix
    is converted with ``df_to_correls``.  When ``calc_pvalues`` is true,
    bootstrap tables are generated with ``fastspar_bootstrap``, correlated one
    by one, and passed to ``fastspar_pvalues``; the p-values and an adjusted
    p-value column are then joined onto the result.

    Parameters
    ----------
    table
        Feature table; must provide ``to_dataframe()`` returning a pandas
        DataFrame with sparse columns.
    verbose
        If true, the external tools inherit this process's stdout; otherwise
        their stdout is discarded.
    calc_pvalues
        If true, also compute bootstrap p-values (columns ``p`` and
        ``p_adjusted``).
    bootstraps
        Number of bootstrap tables to generate; only used with
        ``calc_pvalues``.
    nprocs
        Thread count handed to the external tools and the number of worker
        threads used to correlate the bootstrap tables.
    p_adjust_method
        Forwarded unchanged to ``p_adjust``.

    Returns
    -------
    pd.DataFrame
        The output of ``df_to_correls`` (plus the p-value columns when
        requested), re-indexed by a MultiIndex whose entries are sorted.
    """
    with tempfile.TemporaryDirectory(prefix='fastspar') as temp:
        otu_loc = path.join(temp, 'otu_table.tsv')
        correl_loc = path.join(temp, 'correl_table.tsv')
        covar_loc = path.join(temp, 'covar_table.tsv')

        # To fix AttributeError: 'DataFrame' object has no attribute
        # 'to_dense'. See: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sparse.to_dense.html
        table.to_dataframe().sparse.to_dense().to_csv(
            otu_loc, sep='\t', index_label='#OTU ID')

        stdout = None if verbose else subprocess.DEVNULL

        run_fastspar(otu_loc, correl_loc, covar_loc, stdout, nprocs)
        cor = pd.read_csv(correl_loc, sep='\t', index_col=0)
        correls = df_to_correls(cor)

        if calc_pvalues:
            # generate the bootstrap tables, written as <temp>/boot*
            subprocess.run([
                'fastspar_bootstrap',
                '-c', otu_loc,
                '-n', str(bootstraps),
                '-p', path.join(temp, 'boot'),
                '-t', str(nprocs)
            ], stdout=stdout)

            # infer correlations for each bootstrap table
            with ThreadPoolExecutor(max_workers=nprocs) as executor:
                futures = []
                for boot_loc in glob(path.join(temp, 'boot*')):
                    # Build output names from the file name only: a plain
                    # str.replace on the full path would also rewrite any
                    # 'boot' occurring in the temporary directory's path.
                    boot_name = path.basename(boot_loc)
                    futures.append(executor.submit(
                        run_fastspar,
                        boot_loc,
                        path.join(temp, 'cor_' + boot_name),
                        path.join(temp, 'cov_' + boot_name),
                        stdout))
                # Surface worker exceptions instead of silently dropping them.
                for future in futures:
                    future.result()

            # calculate p-values for the correlation table
            pvalues_loc = path.join(temp, 'pvalues.tsv')
            subprocess.run([
                'fastspar_pvalues',
                '-c', otu_loc,
                '-r', correl_loc,
                '-p', path.join(temp, 'cor_boot'),
                '-t', str(nprocs),
                '-n', str(bootstraps),
                '-o', pvalues_loc
            ], stdout=stdout)
            pvals = pd.read_csv(pvalues_loc, sep='\t', index_col=0)
            pvals = df_to_correls(pvals, col_label='p')
            correls = pd.concat([correls, pvals], axis=1, join='inner')
            correls['p_adjusted'] = p_adjust(correls.p, p_adjust_method)

        # Normalise every index entry to sorted order so a pair is keyed the
        # same way regardless of the order it was produced in.
        correls.index = pd.MultiIndex.from_tuples(
            [tuple(sorted(pair)) for pair in correls.index])
        return correls
def fastspar_correlation(table: Table, verbose: bool = False,
                         nprocs=1) -> pd.DataFrame:
    """Correlate the features of ``table`` using the ``fastspar`` executable.

    The table is written as a TSV into a temporary directory, ``fastspar`` is
    run on it, and the correlation matrix it writes is read back and converted
    with ``df_to_correls``.

    Parameters
    ----------
    table
        Feature table; must provide ``to_dataframe()`` returning a pandas
        DataFrame with sparse columns.
    verbose
        If true, ``fastspar`` inherits this process's stdout; otherwise its
        stdout is discarded.
    nprocs
        Value passed to ``fastspar`` through its ``-t`` option.

    Returns
    -------
    pd.DataFrame
        The output of ``df_to_correls`` for the correlation matrix.
    """
    with tempfile.TemporaryDirectory(prefix='fastspar') as temp:
        otu_loc = path.join(temp, 'otu_table.tsv')
        correl_loc = path.join(temp, 'correl_table.tsv')
        covar_loc = path.join(temp, 'covar_table.tsv')

        # DataFrame.to_dense() raises AttributeError on current pandas; the
        # dense conversion is reached through the .sparse accessor instead.
        table.to_dataframe().sparse.to_dense().to_csv(
            otu_loc, sep='\t', index_label='#OTU ID')

        stdout = None if verbose else subprocess.DEVNULL

        subprocess.run([
            'fastspar',
            '-c', otu_loc,
            '-r', correl_loc,
            '-a', covar_loc,
            '-t', str(nprocs)
        ], stdout=stdout)

        cor = pd.read_csv(correl_loc, sep='\t', index_col=0)
        return df_to_correls(cor)
def fastspar_correlation_permutation(table: Table, verbose: bool = False,
                                     nprocs=1,
                                     bootstraps=1000) -> pd.DataFrame:
    """Correlate the features of ``table`` with FastSpar and attach p-values.

    Steps, all performed inside a temporary directory:

    1. write ``table`` as a TSV and run ``fastspar`` on it;
    2. generate ``bootstraps`` bootstrap tables with ``fastspar_bootstrap``;
    3. run ``fastspar`` on every bootstrap table via GNU ``parallel``;
    4. run ``fastspar_exactpvalues`` on the bootstrap correlations;
    5. read the correlation and p-value matrices back and join them.

    Parameters
    ----------
    table
        Feature table; must provide ``to_dataframe()`` returning a pandas
        DataFrame with sparse columns.
    verbose
        If true, the external tools inherit this process's stdout; otherwise
        their stdout is discarded.
    nprocs
        Value passed to the FastSpar tools through ``-t`` and to ``parallel``
        through ``-j``.
    bootstraps
        Number of bootstrap tables to generate and evaluate.

    Returns
    -------
    pd.DataFrame
        Inner join (on the index) of ``df_to_correls`` applied to the
        correlation matrix and to the p-value matrix, the latter with
        ``col_label='pvalue'``.
    """
    with tempfile.TemporaryDirectory(prefix='fastspar') as temp:
        otu_loc = path.join(temp, 'otu_table.tsv')
        correl_loc = path.join(temp, 'correl_table.tsv')
        covar_loc = path.join(temp, 'covar_table.tsv')
        pvalues_loc = path.join(temp, 'pvalues.tsv')

        # DataFrame.to_dense() raises AttributeError on current pandas; the
        # dense conversion is reached through the .sparse accessor instead.
        table.to_dataframe().sparse.to_dense().to_csv(
            otu_loc, sep='\t', index_label='#OTU ID')

        stdout = None if verbose else subprocess.DEVNULL

        # generate correlation table, -r, to compare against bootstraps
        subprocess.run([
            'fastspar',
            '-c', otu_loc,
            '-r', correl_loc,
            '-a', covar_loc,
            '-t', str(nprocs)
        ], stdout=stdout)

        # generate bootstraps with prefix, boot
        subprocess.run([
            'fastspar_bootstrap',
            '-c', otu_loc,
            '-n', str(bootstraps),
            '-p', path.join(temp, 'boot'),
            '-t', str(nprocs)
        ], stdout=stdout)

        # list the temporary directory (only visible when verbose)
        subprocess.run(['ls', temp], stdout=stdout)

        # infer correlations for each bootstrap table, nprocs jobs at a time;
        # '{}' and '{/}' are filled in by parallel for each input file
        # (NOTE(review): '{/}' is expected to be the input's basename --
        # confirm against the installed GNU parallel)
        subprocess.run([
            'parallel',
            '-j', str(nprocs),
            'fastspar',
            '-c', '{}',
            '-r', path.join(temp, 'cor_{/}'),
            '-a', path.join(temp, 'cov_{/}'),
            '-i', str(5),
            ':::'
        ] + glob(path.join(temp, 'boot*')), stdout=stdout)

        # calculate p-values for the correlation table
        subprocess.run([
            'fastspar_exactpvalues',
            '-c', otu_loc,
            '-r', correl_loc,
            '-p', path.join(temp, 'cor_boot'),
            '-t', str(nprocs),
            '-n', str(bootstraps),
            '-o', pvalues_loc
        ], stdout=stdout)

        cor = pd.read_csv(correl_loc, sep='\t', index_col=0)
        pvals = pd.read_csv(pvalues_loc, sep='\t', index_col=0)
        return pd.concat(
            [df_to_correls(cor), df_to_correls(pvals, col_label='pvalue')],
            axis=1, join='inner')