def heatmap(output_dir: str,
            ranks: pd.DataFrame,
            microbe_metadata: qiime2.CategoricalMetadataColumn = None,
            metabolite_metadata: qiime2.CategoricalMetadataColumn = None,
            method: str = 'average',
            metric: str = 'euclidean',
            color_palette: str = 'seismic',
            margin_palette: str = 'cubehelix',
            x_labels: bool = False,
            y_labels: bool = False,
            level: int = -1) -> None:
    """Render a clustered heatmap of feature ranks as a QIIME 2 visualization.

    Writes heatmap.pdf and heatmap.png into ``output_dir`` and renders the
    q2templates index page that links to both.
    """
    # Unwrap the optional metadata columns into plain pandas Series, which is
    # what the plotting helper expects.
    if microbe_metadata is not None:
        microbe_metadata = microbe_metadata.to_series()
    if metabolite_metadata is not None:
        metabolite_metadata = metabolite_metadata.to_series()

    fig = ranks_heatmap(ranks, microbe_metadata, metabolite_metadata, method,
                        metric, color_palette, margin_palette, x_labels,
                        y_labels, level)

    # Save both a vector (pdf) and raster (png) rendering of the figure.
    fig.savefig(join(output_dir, 'heatmap.pdf'), bbox_inches='tight')
    fig.savefig(join(output_dir, 'heatmap.png'), bbox_inches='tight')

    q2templates.render(join(TEMPLATES, 'index.html'), output_dir, context={
        'title': 'Rank Heatmap',
        'pdf_fp': 'heatmap.pdf',
        'png_fp': 'heatmap.png'})
def subsample_longitudinal(dates: qiime2.CategoricalMetadataColumn,
                           start_date: str = None,
                           samples_per_interval: int = 7,
                           days_per_interval: int = 7,
                           seed: int = None) -> IDSelection:
    """Select at most ``samples_per_interval`` sample IDs from each
    ``days_per_interval``-day window of the collection-date timeline.

    Parameters
    ----------
    dates : qiime2.CategoricalMetadataColumn
        Per-sample collection dates. Unparseable dates become NaT
        (``errors='coerce'``) and are never selected.
    start_date : str, optional
        If given, samples collected before this date are excluded and the
        first window is anchored at this date.
    samples_per_interval : int
        Maximum number of samples retained per window.
    days_per_interval : int
        Width of each window, in days.
    seed : int, optional
        Seed forwarded to the per-window sampler for reproducibility.

    Returns
    -------
    IDSelection
        Boolean inclusion mask over all input IDs, plus the date metadata.
    """
    window_size = '%dD' % days_per_interval
    dt_series = pd.to_datetime(dates.to_series(), errors='coerce')
    df = pd.DataFrame({'ids': dates.to_series().index}, index=dt_series)

    if start_date is not None:
        filter_before = pd.Timestamp(start_date)
        df = df.iloc[np.where(dt_series >= filter_before)]
        if filter_before not in df.index:
            # this will be stripped in _sample_group::_sampler
            # the purpose is to force Pandas to begin the window at this
            # time instead of the first observation (by making NaN the first
            # observation)
            df.loc[filter_before] = float('nan')

    grouped = df.groupby(pd.Grouper(freq=window_size, convention='start',
                                    closed='left'), group_keys=False)
    filtered_df = grouped.apply(_sample_group(samples_per_interval, seed))
    # NOTE: a dead `df = df.dropna(axis=0)` statement was removed here; `df`
    # was never read again, only `filtered_df` is used below.

    selection = pd.Series(False, index=dates.to_series().index)
    selection[filtered_df['ids']] = True

    md = qiime2.Metadata(dates.to_dataframe())
    return IDSelection(selection, md, 'subsample_longitudinal')
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn, mode: str) -> biom.Table:
    """Collapse a feature table along one axis according to a metadata
    grouping, aggregating collapsed vectors with the ``mode`` function."""
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    # biom calls the feature axis 'observation'.
    biom_axis = 'observation' if axis == 'feature' else axis

    metadata = _munge_metadata_column(metadata, table.ids(axis=biom_axis),
                                      axis)

    grouped_table = table.collapse(
        lambda axis_id, _: metadata.get_value(axis_id),
        collapse_f=_mode_lookup[mode],
        axis=biom_axis,
        norm=False,
        include_collapsed_metadata=False)

    # Reorder axis by first unique appearance of each group value in metadata
    # (makes it stable for identity mappings and easier to test)
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    return grouped_table.sort_order(metadata.to_series().unique(),
                                    axis=biom_axis)
def test_typical(self):
    """Paired-end demux: each barcode routes its reads to the right sample."""
    barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC'], name='Barcode',
                  index=pd.Index(['sample_a', 'sample_b'], name='id')))

    expected = [
        # sample a, fwd
        '@id1\nACGTACGT\n+\nzzzzzzzz\n'
        '@id3\nACGTACGT\n+\nzzzzzzzz\n',
        # sample a, rev
        '@id1\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n'
        '@id3\nGGGGTGCATGCA\n+\nzzzzzzzzzzzz\n',
        # sample b, fwd
        '@id2\nACGTACGT\n+\nzzzzzzzz\n'
        '@id4\nACGTACGT\n+\nzzzzzzzz\n'
        '@id5\nACGTACGT\n+\nzzzzzzzz\n',
        # sample b, rev
        '@id2\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
        '@id4\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n'
        '@id5\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n',
    ]
    expected_untrimmed = [
        '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
        '@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n',
    ]

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_paired_fn(self.muxed_sequences,
                                                  barcodes)

    self.assert_demux_results(barcodes.to_series(), expected, demuxed)
    self.assert_untrimmed_results(expected_untrimmed, untrimmed)
def test_mixed_orientation_success(self):
    """Paired-end demux succeeds when reads appear in mixed orientations."""
    forward_barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC'], name='ForwardBarcode',
                  index=pd.Index(['sample_a', 'sample_b'], name='id')))

    fwd_fp = self.get_data_path('mixed-orientation/forward.fastq.gz')
    rev_fp = self.get_data_path('mixed-orientation/reverse.fastq.gz')

    # Stage both read files in one directory so they can be imported
    # together as a single paired-end artifact.
    with tempfile.TemporaryDirectory() as temp:
        shutil.copy(fwd_fp, temp)
        shutil.copy(rev_fp, temp)
        muxed = Artifact.import_data(
            'MultiplexedPairedEndBarcodeInSequence', temp)

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_paired_fn(
            muxed, forward_barcodes=forward_barcodes, mixed_orientation=True)

    self.assert_demux_results(forward_barcodes.to_series(), demuxed)
    # Everything should match
    self.assert_untrimmed_results([b'', b''], untrimmed)
def test_batch_size_odd_number_of_samples(self):
    """Batching with a batch size that doesn't evenly divide the sample
    count still demuxes every sample."""
    barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC', 'GGGG'], name='Barcode',
                  index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                 name='id')))

    expected = [
        # sample a
        '@id1\nACGTACGT\n+\nzzzzzzzz\n'
        '@id3\nACGTACGT\n+\nzzzzzzzz\n',
        # sample b
        '@id2\nACGTACGT\n+\nzzzzzzzz\n'
        '@id4\nACGTACGT\n+\nzzzzzzzz\n'
        '@id5\nACGTACGT\n+\nzzzzzzzz\n',
        # sample c
        '@id6\nACGTACGT\n+\nzzzzzzzz\n',
    ]

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(self.muxed_sequences,
                                                  barcodes, batch_size=2)

    self.assert_demux_results(barcodes.to_series(), expected, demuxed)
    self.assert_untrimmed_results('', untrimmed)
def test_min_length(self):
    """A barcode that consumes an entire read leaves that sample empty."""
    barcodes = CategoricalMetadataColumn(
        # The third barcode is meant to completely remove the only GGGG
        # coded sequence
        pd.Series(['AAAA', 'CCCC', 'GGGGACGTACGT'], name='Barcode',
                  index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                 name='id')))

    expected = [
        # sample a
        '@id1\nACGTACGT\n+\nzzzzzzzz\n'
        '@id3\nACGTACGT\n+\nzzzzzzzz\n',
        # sample b
        '@id2\nACGTACGT\n+\nzzzzzzzz\n'
        '@id4\nACGTACGT\n+\nzzzzzzzz\n'
        '@id5\nACGTACGT\n+\nzzzzzzzz\n',
        # sample c is empty because the barcode matched the entire
        # read, which removed everything.
        '',
    ]

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(self.muxed_sequences,
                                                  barcodes)

    self.assert_demux_results(barcodes.to_series(), expected, demuxed)
    self.assert_untrimmed_results('', untrimmed)
def test_batch_size(self):
    """Demuxing in batches of one yields the same results as one pass."""
    barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC'], name='Barcode',
                  index=pd.Index(['sample_a', 'sample_b'], name='id')))

    expected = [
        # sample a
        '@id1\nACGTACGT\n+\nzzzzzzzz\n'
        '@id3\nACGTACGT\n+\nzzzzzzzz\n',
        # sample b
        '@id2\nACGTACGT\n+\nzzzzzzzz\n'
        '@id4\nACGTACGT\n+\nzzzzzzzz\n'
        '@id5\nACGTACGT\n+\nzzzzzzzz\n',
    ]

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(self.muxed_sequences,
                                                  barcodes, batch_size=1)

    # This test should yield the same results as test_typical, above,
    # the fact that we are batching shouldn't impact the final results
    self.assert_demux_results(barcodes.to_series(), expected, demuxed)
    self.assert_untrimmed_results('@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                  untrimmed)
def test_variable_length_barcodes(self):
    """Barcodes of different lengths are each trimmed correctly."""
    barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAAA', 'CCCCCC', 'GGGG'], name='Barcode',
                  index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                 name='id')))

    muxed_sequences = Artifact.import_data(
        'MultiplexedSingleEndBarcodeInSequence',
        self.get_data_path('variable_length.fastq.gz'))

    expected = [
        # sample a
        '@id1\nACGTACGT\n+\nzzzzzzzz\n'
        '@id3\nACGTACGT\n+\nzzzzzzzz\n',
        # sample b
        '@id2\nACGTACGT\n+\nzzzzzzzz\n'
        '@id4\nACGTACGT\n+\nzzzzzzzz\n'
        '@id5\nACGTACGT\n+\nzzzzzzzz\n',
        # sample c
        '@id6\nACGTACGT\n+\nzzzzzzzz\n',
    ]

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(muxed_sequences, barcodes)

    self.assert_demux_results(barcodes.to_series(), expected, demuxed)
    self.assert_untrimmed_results('', untrimmed)
def group(table: biom.Table, axis: str,
          metadata: qiime2.CategoricalMetadataColumn, mode: str) -> biom.Table:
    """Collapse ``table`` along ``axis`` by a metadata grouping column.

    Vectors that fall into the same group are combined with the aggregation
    function selected by ``mode``.
    """
    if table.is_empty():
        raise ValueError("Cannot group an empty table.")

    if axis == 'feature':
        # biom's name for the feature axis
        biom_axis = 'observation'
    else:
        biom_axis = axis

    metadata = _munge_metadata_column(metadata, table.ids(axis=biom_axis),
                                      axis)

    def _group_of(axis_id, _):
        # Map each id on the collapsed axis to its metadata group value.
        return metadata.get_value(axis_id)

    collapsed = table.collapse(_group_of,
                               collapse_f=_mode_lookup[mode],
                               axis=biom_axis,
                               norm=False,
                               include_collapsed_metadata=False)

    # Reorder axis by first unique appearance of each group value in metadata
    # (makes it stable for identity mappings and easier to test)
    # TODO use CategoricalMetadataColumn API for retrieving categories/groups,
    # when the API exists.
    order = metadata.to_series().unique()
    return collapsed.sort_order(order, axis=biom_axis)
def classify_samples(output_dir: str, table: pd.DataFrame,
                     metadata: qiime2.CategoricalMetadataColumn,
                     test_size: float = defaults['test_size'],
                     step: float = defaults['step'],
                     cv: int = defaults['cv'],
                     random_state: int = None,
                     n_jobs: int = defaults['n_jobs'],
                     n_estimators: int = defaults['n_estimators'],
                     estimator: str = defaults['estimator_r'],
                     optimize_feature_selection: bool = False,
                     parameter_tuning: bool = False,
                     palette: str = defaults['palette']) -> None:
    """Train a supervised classifier predicting the metadata column from the
    feature table, then visualize its performance in ``output_dir``."""
    # The target column name comes from the metadata column itself.
    target_column = metadata.to_series().name

    # Some estimators don't support recursive feature elimination; this also
    # tells us whether feature importances can be computed at all.
    optimize_feature_selection, calc_feature_importance = \
        _disable_feature_selection(estimator, optimize_feature_selection)

    # Resolve the estimator object and its hyperparameter search space.
    estimator, param_dist, parameter_tuning = _set_parameters_and_estimator(
        estimator, table, metadata, target_column, n_estimators, n_jobs, cv,
        random_state, parameter_tuning, classification=True)

    # Split train/test, optionally tune and select features, then fit.
    estimator, confusion, accuracy, feature_importances = \
        split_optimize_classify(
            table, metadata, target_column, estimator, output_dir,
            test_size=test_size, step=step, cv=cv, random_state=random_state,
            n_jobs=n_jobs,
            optimize_feature_selection=optimize_feature_selection,
            parameter_tuning=parameter_tuning, param_dist=param_dist,
            calc_feature_importance=calc_feature_importance, palette=palette)

    _visualize(output_dir, estimator, confusion, accuracy,
               feature_importances, optimize_feature_selection,
               title='classification predictions')
def test_mixed_orientation_success(self):
    """Mixed-orientation demux routes fwd- and rev-oriented reads per sample.

    sample_a and sample_b have reads in both fwd and rev directions;
    sample_c only has reads in the fwd direction; sample_d only has reads
    in the rev direction.
    """
    barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC', 'GGGG', 'TTTT'], name='ForwardBarcode',
                  index=pd.Index(
                      ['sample_a', 'sample_b', 'sample_c', 'sample_d'],
                      name='id')))

    fwd_fp = self.get_data_path('mixed-orientation/forward.fastq.gz')
    rev_fp = self.get_data_path('mixed-orientation/reverse.fastq.gz')

    # Import both read files from one directory as a paired-end artifact.
    with tempfile.TemporaryDirectory() as temp:
        shutil.copy(fwd_fp, temp)
        shutil.copy(rev_fp, temp)
        muxed = Artifact.import_data(
            'MultiplexedPairedEndBarcodeInSequence', temp)

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_paired_fn(
            muxed, forward_barcodes=barcodes, mixed_orientation=True)

    expected = [
        # sample_a fwd
        '@id1\nACGTACGT\n+\nyyyyyyyy\n'
        '@id3\nACGTACGT\n+\nyyyyyyyy\n',
        # sample_a rev
        '@id1\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
        '@id3\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
        # sample_b fwd
        '@id4\nACGTACGT\n+\nyyyyyyyy\n'
        '@id2\nACGTACGT\n+\nyyyyyyyy\n',
        # sample_b rev
        '@id4\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n'
        '@id2\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
        # sample_c fwd
        '@id5\nACGTACGT\n+\nyyyyyyyy\n',
        # sample_c rev
        '@id5\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
        # sample_d fwd
        '@id6\nACGTACGT\n+\nyyyyyyyy\n',
        # sample_d rev
        '@id6\nTGCATGCATGCA\n+\nzzzzzzzzzzzz\n',
    ]

    # We want to be sure that the validation is 100%, not just `min`,
    demuxed.validate(level='max')
    # checkpoint assertion for the above `validate` - nothing should fail
    self.assertTrue(True)

    self.assert_demux_results(barcodes.to_series(), expected, demuxed)
    # Everything should match, so untrimmed should be empty
    self.assert_untrimmed_results(['', ''], untrimmed)
def test_typical(self):
    """Single-end demux: matched reads are demuxed, the GGGG read is not."""
    barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC'], name='Barcode',
                  index=pd.Index(['sample_a', 'sample_b'], name='id')))

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(self.muxed_sequences,
                                                  barcodes)

    self.assert_demux_results(barcodes.to_series(), demuxed)
    self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                  untrimmed)
def paired_heatmap(output_dir: str, ranks: pd.DataFrame,
                   microbes_table: biom.Table,
                   metabolites_table: biom.Table,
                   features: str = None,
                   top_k_microbes: int = 2,
                   keep_top_samples: bool = True,
                   microbe_metadata: qiime2.CategoricalMetadataColumn = None,
                   normalize: str = 'log10',
                   color_palette: str = 'magma',
                   top_k_metabolites: int = 50,
                   level: int = -1,
                   row_center: bool = True) -> None:
    """Render paired microbe/metabolite abundance heatmaps and the selected
    feature tables as a QIIME 2 visualization in ``output_dir``."""
    if microbe_metadata is not None:
        microbe_metadata = microbe_metadata.to_series()

    # Transpose ranks, then optionally center each column on its mean.
    ranks = ranks.T
    if row_center:
        ranks = ranks - ranks.mean(axis=0)

    microbes_df, metabolites_df, fig = paired_heatmaps(
        ranks, microbes_table, metabolites_table, microbe_metadata, features,
        top_k_microbes, top_k_metabolites, keep_top_samples, level, normalize,
        color_palette)

    # Save the figure in both vector and raster form, plus the underlying
    # feature tables as TSV downloads.
    fig.savefig(join(output_dir, 'heatmap.pdf'), bbox_inches='tight')
    fig.savefig(join(output_dir, 'heatmap.png'), bbox_inches='tight')
    microbes_df.to_csv(join(output_dir, 'select_microbes.tsv'), sep='\t')
    metabolites_df.to_csv(join(output_dir, 'select_metabolites.tsv'),
                          sep='\t')

    q2templates.render(join(TEMPLATES, 'index.html'), output_dir, context={
        'title': 'Paired Feature Abundance Heatmaps',
        'pdf_fp': 'heatmap.pdf',
        'png_fp': 'heatmap.png',
        'table1_fp': 'select_microbes.tsv',
        'download1_text': 'Download microbe abundances as TSV',
        'table2_fp': 'select_metabolites.tsv',
        'download2_text': 'Download top k metabolite abundances as TSV'})
def aldex2(table: pd.DataFrame, metadata: qiime2.CategoricalMetadataColumn,
           mc_samples: int = 128, test: str = 't',
           denom: str = 'all') -> pd.DataFrame:
    """Run ALDEx2 differential abundance analysis via the bundled R script.

    Writes the table and metadata to a temporary directory, shells out to
    ``run_aldex2.R``, and returns the parsed summary table.
    """
    # The condition is just the only column in the passed metadata column.
    condition = metadata.name
    meta = metadata.to_series()
    # Restrict the metadata to the samples present in the table, in table
    # order — ALDEx2 pairs conditions with samples positionally, so the
    # ordering must match for conditions to be selected correctly.
    meta = meta.loc[list(table.index)]

    with tempfile.TemporaryDirectory() as tmp:
        biom_fp = os.path.join(tmp, 'input.tsv.biom')
        map_fp = os.path.join(tmp, 'input.map.txt')
        summary_fp = os.path.join(tmp, 'output.summary.txt')

        # header=True is already the default for DataFrames; spell it out for
        # the Series too so both files are written with headers.
        table.to_csv(biom_fp, sep='\t', header=True)
        meta.to_csv(map_fp, sep='\t', header=True)

        cmd = [str(arg) for arg in
               ['run_aldex2.R', biom_fp, map_fp, condition, mc_samples,
                test, denom, summary_fp]]

        try:
            run_commands([cmd])
        except subprocess.CalledProcessError as e:
            raise Exception("An error was encountered while running ALDEx2"
                            " in R (return code %d), please inspect stdout"
                            " and stderr to learn more." % e.returncode)

        summary = pd.read_csv(summary_fp, index_col=0)

    # The R script drops the feature-id column name (row.names = 1);
    # restore it and coerce the feature ids to strings.
    summary.index.name = "featureid"
    summary.rename(index=str, inplace=True)
    return summary
def test_all_matched(self):
    """When every read matches a barcode, nothing is left untrimmed."""
    barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC', 'GGGG'], name='Barcode',
                  index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                 name='id')))

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(self.muxed_sequences,
                                                  barcodes)

    self.assert_demux_results(barcodes.to_series(), demuxed)
    # obs_untrimmed should be empty, since everything matched
    self.assert_untrimmed_results(b'', untrimmed)
def estimate(counts: pd.DataFrame,
             replicates: qiime2.CategoricalMetadataColumn,
             batches: qiime2.CategoricalMetadataColumn,
             monte_carlo_samples: int = 100,
             cores: int = 1) -> az.InferenceData:
    """Estimate per-feature batch effects, returning merged posterior draws.

    Parameters
    ----------
    counts : pd.DataFrame
        Samples x features count table.
    replicates, batches : qiime2.CategoricalMetadataColumn
        Per-sample replicate and batch assignments.
    monte_carlo_samples : int
        Number of posterior draws per feature.
    cores : int
        If > 1, attempt to parallelize across features with dask.
    """
    # Match everything up: restrict all inputs to their shared sample ids.
    replicates = replicates.to_series()
    batches = batches.to_series()
    idx = list(set(counts.index) & set(replicates.index) & set(batches.index))
    counts, replicates, batches = [
        x.loc[idx] for x in (counts, replicates, batches)
    ]
    replicates, batches = replicates.values, batches.values
    depth = counts.sum(axis=1)

    def pfunc(x):
        # Per-feature inference; x is one column of counts (a Series).
        return _batch_func(np.array(x.values), replicates, batches,
                           depth, monte_carlo_samples)

    if cores > 1:
        try:
            import dask.dataframe as dd
            dcounts = dd.from_pandas(counts.T, npartitions=cores)
            res = dcounts.apply(pfunc, axis=1)
            resdf = res.compute(scheduler='processes')
            data_df = list(resdf.values)
        except Exception:
            # dask is optional / may fail at runtime; fall back to serial.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            data_df = list(counts.T.apply(pfunc, axis=1).values)
    else:
        data_df = list(counts.T.apply(pfunc, axis=1).values)

    # BUG FIX: previously `inf_list = list(resdf[0])`, which raised a
    # NameError on the serial path (`resdf` only exists in the dask branch);
    # the per-feature inference results are collected in `data_df`.
    inf_list = data_df
    coords = {
        'features': counts.columns,
        # BUG FIX: previously referenced undefined `args.monte_carlo_samples`.
        'monte_carlo_samples': np.arange(monte_carlo_samples)
    }
    samples = merge_inferences(inf_list, 'y_predict', 'log_lhood', coords)
    return samples
def test_error_tolerance_high_enough_to_prevent_filtering(self):
    """A high error rate lets a misspelled barcode still match its reads."""
    barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAG', 'CCCC'], name='Barcode',
                  index=pd.Index(['sample_a', 'sample_b'], name='id')))

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(self.muxed_sequences,
                                                  barcodes, error_rate=0.25)

    # This test should yield the same results as test_typical, above
    self.assert_demux_results(barcodes.to_series(), demuxed)
    self.assert_untrimmed_results(b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
                                  untrimmed)
def rename_ids(table: biom.Table,
               metadata: qiime2.CategoricalMetadataColumn,
               axis: str = 'sample', strict: bool = False) -> biom.Table:
    """Return a copy of ``table`` whose ids along ``axis`` are renamed
    according to the old-id -> new-id mapping in ``metadata``."""
    mapping = metadata.to_series()

    # biom's name for the feature axis is 'observation'.
    if axis == 'feature':
        axis = 'observation'

    new_ids = _generate_new_names(table.ids(axis=axis), mapping, strict,
                                  False)

    # inplace=False: leave the caller's table untouched.
    return table.update_ids(new_ids, axis=axis, inplace=False)
def test_variable_length_barcodes(self):
    """Barcodes of different lengths all demux correctly."""
    barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAAA', 'CCCCCC', 'GGGG'], name='Barcode',
                  index=pd.Index(['sample_a', 'sample_b', 'sample_c'],
                                 name='id')))

    muxed_sequences = Artifact.import_data(
        'MultiplexedSingleEndBarcodeInSequence',
        self.get_data_path('variable_length.fastq.gz'))

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(muxed_sequences, barcodes)

    # This test should yield the same results as test_typical, above, just
    # with variable length barcodes
    self.assert_demux_results(barcodes.to_series(), demuxed)
    self.assert_untrimmed_results(b'', untrimmed)
def test_none_matched(self):
    """A barcode matching nothing leaves every read untrimmed."""
    barcodes = CategoricalMetadataColumn(
        pd.Series(['TTTT'], name='Barcode',
                  index=pd.Index(['sample_d'], name='id')))

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(self.muxed_sequences,
                                                  barcodes)

    # sample_d gets an empty demux result; all reads fall through untrimmed.
    self.assert_demux_results(barcodes.to_series(), [''], demuxed)
    self.assert_untrimmed_results(
        '@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
        '@id2\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
        '@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
        '@id4\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
        '@id5\nCCCCACGTACGT\n+\nzzzzzzzzzzzz\n'
        '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n', untrimmed)
def test_di_typical(self):
    """Dual-index demux: forward/reverse barcode pairs route reads."""
    forward_barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAA', 'CCCC'], name='ForwardBarcode',
                  index=pd.Index(['sample_a', 'sample_b'], name='id')))
    reverse_barcodes = CategoricalMetadataColumn(
        pd.Series(['GGGG', 'TTTT'], name='ReverseBarcode',
                  index=pd.Index(['sample_a', 'sample_b'], name='id')))

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_paired_fn(
            self.muxed_sequences,
            forward_barcodes=forward_barcodes,
            reverse_barcodes=reverse_barcodes)

    self.assert_demux_results(forward_barcodes.to_series(), demuxed)

    # id6 matches neither barcode pair, so it remains in both untrimmed
    # files (forward and reverse).
    expected_untrimmed = [
        b'@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n',
        b'@id6\nTTTTTGCATGCA\n+\nzzzzzzzzzzzz\n',
    ]
    self.assert_untrimmed_results(expected_untrimmed, untrimmed)
def test_error_tolerance_filtering(self):
    """A misspelled barcode matches nothing at the default error rate."""
    barcodes = CategoricalMetadataColumn(
        pd.Series(['AAAG', 'CCCC'], name='Barcode',
                  index=pd.Index(['sample_a', 'sample_b'], name='id')))

    expected = [
        # sample a has no reads (bc we misspelled the barcode)
        '',
        # sample b
        '@id2\nACGTACGT\n+\nzzzzzzzz\n'
        '@id4\nACGTACGT\n+\nzzzzzzzz\n'
        '@id5\nACGTACGT\n+\nzzzzzzzz\n',
    ]

    with redirected_stdio(stderr=os.devnull):
        demuxed, untrimmed = self.demux_single_fn(self.muxed_sequences,
                                                  barcodes)

    self.assert_demux_results(barcodes.to_series(), expected, demuxed)
    self.assert_untrimmed_results(
        '@id1\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
        '@id3\nAAAAACGTACGT\n+\nzzzzzzzzzzzz\n'
        '@id6\nGGGGACGTACGT\n+\nzzzzzzzzzzzz\n', untrimmed)
def ancom(output_dir: str, table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    """Run ANCOM differential abundance and write an HTML report.

    Writes ancom.csv (and percent-abundances.csv when available), a volcano
    plot, and an index.html summary into ``output_dir``.
    """
    index_fp = os.path.join(output_dir, 'index.html')

    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)

    ancom_results = skbio_ancom(table, metadata.to_series(),
                                significance_test=f_oneway)
    # scikit-bio 0.4.2 returns a single tuple from ancom, and scikit-bio 0.5.0
    # returns two tuples. We want to support both scikit-bio versions, so we
    # tuplize ancom_result to support both. Similarly, the "reject" column
    # was renamed in scikit-bio 0.5.0, so we apply a rename here (which does
    # nothing if a column called "reject" isn't found).
    ancom_results = qiime2.core.util.tuplize(ancom_results)
    # BUG FIX: sort_values previously discarded its return value; sort in
    # place so the CSV and HTML tables are actually ordered by W.
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)

    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.csv'),
                            header=True, index=True)

    html = _volcanoplot(output_dir, table, metadata, ancom_results[0],
                        transform_function, difference_function)

    significant_features = ancom_results[0][
        ancom_results[0]['Reject null hypothesis']]
    significant_features_present = not significant_features.empty
    insignificant_div = ('<div>No significant features identified!</div>')

    with open(index_fp, 'w') as index_f:
        index_f.write('<html>\n')
        # The <head> carries bokeh's inline resources; only needed when a
        # volcano plot was actually generated.
        if html is not None:
            index_f.write('<head>\n')
            index_f.write(INLINE.render())
            index_f.write('</head>\n')
        index_f.write('<body>\n')
        index_f.write('<h1>ANCOM statistical results</h1>\n')
        index_f.write('<a href="ancom.csv">Download complete table as CSV</a>'
                      '<br>\n')
        if significant_features_present:
            index_f.write(
                q2templates.df_to_html(significant_features['W'].to_frame(),
                                       border=None, classes=None))
        else:
            index_f.write(insignificant_div)
        # Two-tuple results (scikit-bio >= 0.5.0) include percentile
        # abundances as the second element.
        if len(ancom_results) == 2:
            ancom_results[1].to_csv(os.path.join(output_dir,
                                                 'percent-abundances.csv'),
                                    header=True, index=True)
            index_f.write(('<h1>Percentile abundances of features '
                           'by group</h1>\n'))
            index_f.write(('<a href="percent-abundances.csv">'
                           'Download complete table as CSV</a><br>\n'))
            if significant_features_present:
                index_f.write(
                    q2templates.df_to_html(
                        ancom_results[1].loc[significant_features.index],
                        border=None, classes=None))
            else:
                index_f.write(insignificant_div)
        if html is not None:
            index_f.write(html[1])
            index_f.write(html[0])
        else:
            index_f.write('<p>Unable to generate volcano plot, please check '
                          'the ANCOM statistical results (above).</p>\n')
        index_f.write('</body></html>\n')
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    """Test whether between-group distances differ from within-group
    distances, and render a visualization with per-group distance boxplots
    and (optionally) pairwise group comparisons.

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``_beta_group_significance_fns``.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_beta_group_significance_fns)))

    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]

    metadata = metadata.to_series()

    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)

    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict([
        (id, list(series.index))
        for id, series in natsorted(metadata.groupby(metadata))
    ])

    pairs_summary = pd.DataFrame(
        columns=['SubjectID1', 'SubjectID2', 'Group1', 'Group2', 'Distance'])
    for group_id in groupings:
        # Within/between distances for this group, plus the per-pair rows
        # that feed the raw_data.tsv download.
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)

        group_pairs_summary = pd.DataFrame(group_pairs_summary, columns=[
            'SubjectID1', 'SubjectID2', 'Group1', 'Group2', 'Distance'
        ])

        pairs_summary = pd.concat([pairs_summary, group_pairs_summary])

        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black',
            'markeredgewidth': 0.5, 'alpha': 0.5
        })
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        # Group ids are URL-quoted so they are safe as filenames.
        fig.savefig(
            os.path.join(output_dir, '%s-boxplots.png' %
                         urllib.parse.quote(str(group_id))))
        fig.savefig(
            os.path.join(output_dir, '%s-boxplots.pdf' %
                         urllib.parse.quote(str(group_id))))
        # Reuse the same figure object for the next group's boxplot.
        fig.clear()

    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')

    result_html = q2templates.df_to_html(result.to_frame())

    if pairwise:
        # Run the significance test again for every pair of groups, then
        # apply a Benjamini-Hochberg FDR correction across all pairs.
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix, group1_id=group1_id,
                    group2_id=group2_id, groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([
                group1_id, group2_id, pairwise_result['sample size'],
                permutations, pairwise_result['test statistic'],
                pairwise_result['p-value']
            ])
        columns = [
            'Group 1', 'Group 2', 'Sample size', 'Permutations',
            result['test statistic name'], 'p-value'
        ]
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)
        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None

    # repartition groupings for rendering
    group_ids = [
        # We have to DOUBLE encode this, as the file/resource name is a literal
        # URI-encoded string, we do this to prevent issues with the filesystem
        # however, as a result, our links need to escape % so that the browser
        # asks for the right escaped name (instead of the original name, which
        # doesn't exist inside the visualization).
        urllib.parse.quote(urllib.parse.quote(k)) for k in groupings.keys()
    ]
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    while group_count % row_count != 0:
        row_count = row_count - 1

    group_rows = [
        group_ids[g:g + row_count] for g in range(0, group_count, row_count)
    ]

    index = os.path.join(TEMPLATES, 'beta_group_significance_assets',
                         'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        'bootstrap_group_col_size': int(12 / row_count),
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
def ancom(output_dir: str, table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    """Run ANCOM differential abundance and render a Vega-based report.

    Writes ancom.tsv, percent-abundances.tsv, data.tsv (volcano plot data)
    and the rendered template assets into ``output_dir``.
    """
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)

    ancom_results = skbio_ancom(table, metadata.to_series(),
                                significance_test=f_oneway)
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)

    significant_features = ancom_results[0][
        ancom_results[0]['Reject null hypothesis']]

    # Template context; keys are only populated when there is something to
    # show, and the template handles their absence.
    context = dict()
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            ancom_results[1].loc[significant_features.index])

    metadata = metadata.to_series()
    cats = list(set(metadata))
    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(transform_function, axis=1,
                                    result_type='broadcast')

    # Pick a default effect-size function based on how many groups there are.
    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:  # len(categories) > 2
            difference_function = 'f_statistic'
    _d_func = _difference_functions[difference_function]

    def diff_func(x):
        # Apply the difference function across this feature's per-group
        # values; some functions return a (statistic, ...) tuple, in which
        # case only the statistic is kept.
        args = _d_func(*[x[metadata == c] for c in cats])
        if isinstance(args, tuple):
            return args[0]
        else:
            return args

    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)
    if not pd.isnull(fold_change).all():
        # Drop features whose effect size is NaN or infinite before plotting.
        pre_filtered_ids = set(fold_change.index)
        with pd.option_context('mode.use_inf_as_na', True):
            fold_change = fold_change.dropna(axis=0)
        filtered_ids = pre_filtered_ids - set(fold_change.index)
        filtered_ancom_results = ancom_results[0].drop(labels=filtered_ids)

        volcano_results = pd.DataFrame({
            transform_function_name: fold_change,
            'W': filtered_ancom_results.W
        })
        volcano_results.index.name = 'id'
        volcano_results.to_csv(os.path.join(output_dir, 'data.tsv'),
                               header=True, index=True, sep='\t')
        volcano_results = volcano_results.reset_index(drop=False)

        # Vega specification for the interactive volcano plot (effect size
        # on x, ANCOM W statistic on y).
        spec = {
            '$schema': 'https://vega.github.io/schema/vega/v4.json',
            'width': 300,
            'height': 300,
            'data': [{
                'name': 'values',
                'values': volcano_results.to_dict(orient='records')
            }],
            'scales': [{
                'name': 'xScale',
                'domain': {
                    'data': 'values',
                    'field': transform_function_name
                },
                'range': 'width'
            }, {
                'name': 'yScale',
                'domain': {
                    'data': 'values',
                    'field': 'W'
                },
                'range': 'height'
            }],
            'axes': [{
                'scale': 'xScale',
                'orient': 'bottom',
                'title': transform_function_name
            }, {
                'scale': 'yScale',
                'orient': 'left',
                'title': 'W'
            }],
            'marks': [{
                'type': 'symbol',
                'from': {
                    'data': 'values'
                },
                'encode': {
                    'hover': {
                        'fill': {
                            'value': '#FF0000'
                        },
                        'opacity': {
                            'value': 1
                        }
                    },
                    'enter': {
                        'x': {
                            'scale': 'xScale',
                            'field': transform_function_name
                        },
                        'y': {
                            'scale': 'yScale',
                            'field': 'W'
                        }
                    },
                    'update': {
                        'fill': {
                            'value': 'black'
                        },
                        'opacity': {
                            'value': 0.3
                        },
                        'tooltip': {
                            'signal': "{{'title': datum['id'], '{0}': "
                                      "datum['{0}'], 'W': datum['W']}}".format(
                                          transform_function_name)
                        }
                    }
                }
            }]
        }
        context['vega_spec'] = json.dumps(spec)
        if filtered_ids:
            context['filtered_ids'] = ', '.join(sorted(filtered_ids))

    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.tsv'),
                            header=True, index=True, sep='\t')
    ancom_results[1].to_csv(os.path.join(output_dir,
                                         'percent-abundances.tsv'),
                            header=True, index=True, sep='\t')
    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)
def beta_group_significance(output_dir: str,
                            distance_matrix: skbio.DistanceMatrix,
                            metadata: qiime2.CategoricalMetadataColumn,
                            method: str = 'permanova',
                            pairwise: bool = False,
                            permutations: int = 999) -> None:
    """Test whether between-group distances differ and render the results.

    Runs the group-significance test selected by ``method`` (looked up in the
    module-level ``_beta_group_significance_fns`` registry) on
    ``distance_matrix`` grouped by ``metadata``, draws per-group distance
    boxplots, optionally computes all pairwise group comparisons with
    Benjamini-Hochberg FDR correction, and renders the HTML template into
    ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory the visualization files are written into.
    distance_matrix : skbio.DistanceMatrix
        Sample-by-sample distances.
    metadata : qiime2.CategoricalMetadataColumn
        Grouping column; samples with missing values are dropped.
    method : str
        Name of the significance test (e.g. 'permanova').
    pairwise : bool
        Whether to additionally run all pairwise group comparisons.
    permutations : int
        Number of permutations for the test(s).

    Raises
    ------
    ValueError
        If ``method`` is not a known group significance method.
    """
    try:
        beta_group_significance_fn = _beta_group_significance_fns[method]
    except KeyError:
        raise ValueError('Unknown group significance method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_beta_group_significance_fns)))
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)
    metadata = metadata.drop_missing_values()
    # filter the distance matrix to exclude samples that were dropped from
    # the metadata due to missing values, and keep track of how many samples
    # survived the filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(metadata.ids)
    filtered_dm_length = distance_matrix.shape[0]
    # From here on ``metadata`` is the plain pandas Series of group labels.
    metadata = metadata.to_series()
    # Run the significance test
    result = beta_group_significance_fn(distance_matrix, metadata,
                                        permutations=permutations)
    # Generate distance boxplots
    sns.set_style('white')
    # Identify the groups, then compute the within group distances and the
    # between group distances, and generate one boxplot per group.
    # groups will be an OrderedDict mapping group id to the sample ids in that
    # group. The order is used both on the x-axis, and in the layout of the
    # boxplots in the visualization.
    # TODO: update to use a grouping API and natsort API on
    # CategoricalMetadataColumn, if those become available.
    groupings = collections.OrderedDict(
        [(id, list(series.index))
         for id, series in natsorted(metadata.groupby(metadata))])
    pairs_summary = pd.DataFrame(columns=['SubjectID1', 'SubjectID2',
                                          'Group1', 'Group2', 'Distance'])
    for group_id in groupings:
        group_distances, x_ticklabels, group_pairs_summary = \
            _get_distance_boxplot_data(distance_matrix, group_id, groupings)
        group_pairs_summary = pd.DataFrame(
            group_pairs_summary, columns=['SubjectID1', 'SubjectID2',
                                          'Group1', 'Group2', 'Distance'])
        pairs_summary = pd.concat([pairs_summary, group_pairs_summary])
        ax = sns.boxplot(data=group_distances, flierprops={
            'marker': 'o', 'markeredgecolor': 'black',
            'markeredgewidth': 0.5, 'alpha': 0.5})
        ax.set_xticklabels(x_ticklabels, rotation=90)
        ax.set_xlabel('Group')
        ax.set_ylabel('Distance')
        ax.set_title('Distances to %s' % group_id)
        # change the color of the boxes to white
        for box in ax.artists:
            box.set_facecolor('white')
        sns.despine()
        plt.tight_layout()
        fig = ax.get_figure()
        # quote_plus makes arbitrary group ids safe as filenames/URLs.
        fig.savefig(os.path.join(output_dir, '%s-boxplots.png' %
                                 urllib.parse.quote_plus(str(group_id))))
        fig.savefig(os.path.join(output_dir, '%s-boxplots.pdf' %
                                 urllib.parse.quote_plus(str(group_id))))
        # Reuse the same figure for the next group's boxplot.
        fig.clear()
    pairs_summary.to_csv(os.path.join(output_dir, 'raw_data.tsv'), sep='\t')
    result_html = q2templates.df_to_html(result.to_frame())
    if pairwise:
        pairwise_results = []
        for group1_id, group2_id in itertools.combinations(groupings, 2):
            pairwise_result = \
                _get_pairwise_group_significance_stats(
                    distance_matrix=distance_matrix,
                    group1_id=group1_id,
                    group2_id=group2_id,
                    groupings=groupings,
                    metadata=metadata,
                    beta_group_significance_fn=beta_group_significance_fn,
                    permutations=permutations)
            pairwise_results.append([group1_id,
                                     group2_id,
                                     pairwise_result['sample size'],
                                     permutations,
                                     pairwise_result['test statistic'],
                                     pairwise_result['p-value']])
        columns = ['Group 1', 'Group 2', 'Sample size', 'Permutations',
                   result['test statistic name'], 'p-value']
        pairwise_results = pd.DataFrame(pairwise_results, columns=columns)
        pairwise_results.set_index(['Group 1', 'Group 2'], inplace=True)
        # Benjamini-Hochberg FDR correction across all pairwise p-values.
        pairwise_results['q-value'] = multipletests(
            pairwise_results['p-value'], method='fdr_bh')[1]
        pairwise_results.sort_index(inplace=True)
        pairwise_path = os.path.join(
            output_dir, '%s-pairwise.csv' % method)
        pairwise_results.to_csv(pairwise_path)
        pairwise_results_html = q2templates.df_to_html(pairwise_results)
    else:
        pairwise_results_html = None
    # repartition groupings for rendering
    group_ids = list(groupings.keys())
    row_count, group_count = 3, len(group_ids)  # Start at three plots per row
    # Shrink row width until the groups divide evenly across rows.
    while group_count % row_count != 0:
        row_count = row_count - 1
    group_rows = [group_ids[g:g+row_count] for g in
                  range(0, group_count, row_count)]
    index = os.path.join(
        TEMPLATES, 'beta_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'method': method,
        'group_rows': group_rows,
        # 12-column bootstrap grid split evenly across row_count plots.
        'bootstrap_group_col_size': int(12 / row_count),
        'result': result_html,
        'pairwise_results': pairwise_results_html
    })
def classify(
    features: pd.DataFrame,
    y: qiime2.CategoricalMetadataColumn,
    c: np.ndarray = None,
    weights: np.ndarray = None,
    # taxa: skbio.TreeNode = None,
    # PATH parameters :
    path: bool = True,
    path_numerical_method: str = "not specified",
    path_n_active: int = 0,
    path_nlam_log: int = 40,
    path_lamin_log: float = 1e-2,
    # CV parameters :
    cv: bool = True,
    cv_numerical_method: str = "not specified",
    cv_seed: int = 1,
    cv_one_se: bool = True,
    cv_subsets: int = 5,
    cv_nlam: int = 100,
    cv_lamin: float = 1e-3,
    cv_logscale: bool = True,
    # StabSel parameters :
    stabsel: bool = True,
    stabsel_numerical_method: str = "not specified",
    stabsel_seed: int = None,  # do something here ! for now it can be a bool !
    stabsel_lam: float = -1.0,  # if negative, then it means 'theoretical'
    stabsel_true_lam: bool = True,
    stabsel_method: str = "first",
    stabsel_b: int = 50,
    stabsel_q: int = 10,
    stabsel_percent_ns: float = 0.5,
    stabsel_lamin: float = 1e-2,
    stabsel_threshold: float = 0.7,
    stabsel_threshold_label: float = 0.4,
    # might unneeded here, but needed for visualisation
    # LAMfixed parameters :
    lamfixed: bool = True,
    lamfixed_numerical_method: str = "not specified",
    lamfixed_lam: float = -1.0,  # if negative, then it means 'theoretical'
    lamfixed_true_lam: bool = True,
    # Formulation parameters
    huber: bool = False,
    rho: float = 0.0,
    intercept: bool = True,
) -> classo_problem:
    """Set up, solve, and return a c-lasso binary classification problem.

    Aligns ``features`` with the labels in ``y`` (dropping samples whose
    label is missing), encodes the two classes as +1/-1 (the class of the
    first non-missing label becomes +1), configures the requested
    model-selection procedures (PATH, CV, StabSel, LAMfixed), and calls
    ``problem.solve()``.

    Parameters
    ----------
    features : pd.DataFrame
        Sample-by-feature matrix.
    y : qiime2.CategoricalMetadataColumn
        Binary class labels per sample; validated by ``verfify_binary``.
    c : np.ndarray, optional
        Constraint matrix passed to ``classo_problem``.
    weights : np.ndarray, optional
        Per-feature formulation weights; padded with ones or truncated to
        the number of features.
    (remaining keyword parameters configure the individual c-lasso
    model-selection procedures and the formulation; see the c-lasso
    documentation for their meaning)

    Returns
    -------
    classo_problem
        The solved problem, annotated with ``complete_y``,
        ``complete_labels`` and ``training_labels`` on ``problem.data``.
    """
    complete_y = y.to_series()
    complete_y = complete_y[~complete_y.isna()]
    # BUG FIX: the index holds sample IDs, so ``complete_y[0]`` relied on
    # pandas' deprecated positional fallback for label-based indexing (it
    # raises KeyError on modern pandas). Use explicit positional access.
    first_cell = complete_y.iloc[0]

    # Keep only samples present in both the feature table and the labels.
    features, pdY = features.align(y.to_series(), join="inner", axis=0)
    missing = pdY.isna()
    training_labels = list(pdY[~missing].index)
    label_missing = list(pdY.index[missing])
    if label_missing:
        print("{} are missing in y ".format(label_missing))
    Y = pdY[~missing].to_numpy()
    X = features.values[~missing, :]

    verfify_binary(Y)

    # Encode classes as +1 / -1; the first observed class maps to +1.
    Y = Y == first_cell
    Y = 2 * Y - 1

    problem = classo_problem(X, Y, C=c, label=list(features.columns))
    problem.formulation.classification = True
    problem.formulation.concomitant = False
    problem.formulation.huber = huber
    problem.formulation.rho_classification = rho
    problem.formulation.intercept = intercept

    d = X.shape[1]
    if weights is not None:
        # Pad with ones (or truncate) so the weight vector matches the
        # number of features.
        if len(weights) < d:
            problem.formulation.w = np.concatenate(
                [weights, np.ones(d - len(weights))], axis=0)
        else:
            problem.formulation.w = weights[:d]

    problem.model_selection.PATH = path
    if path:
        param = problem.model_selection.PATHparameters
        param.numerical_method = path_numerical_method
        param.n_active = path_n_active
        param.logscale = True
        param.Nlam = path_nlam_log
        param.lamin = path_lamin_log

    problem.model_selection.CV = cv
    if cv:
        param = problem.model_selection.CVparameters
        param.numerical_method = cv_numerical_method
        param.seed = cv_seed
        param.oneSE = cv_one_se
        param.Nsubsets = cv_subsets
        param.lamin = cv_lamin
        param.Nlam = cv_nlam
        param.logscale = cv_logscale

    problem.model_selection.StabSel = stabsel
    if stabsel:
        param = problem.model_selection.StabSelparameters
        param.numerical_method = stabsel_numerical_method
        param.seed = stabsel_seed
        param.true_lam = stabsel_true_lam
        param.method = stabsel_method
        param.B = stabsel_b
        param.q = stabsel_q
        param.percent_nS = stabsel_percent_ns
        param.lamin = stabsel_lamin
        param.threshold = stabsel_threshold
        param.threshold_label = stabsel_threshold_label
        # Negative lambda is the sentinel for the 'theoretical' choice.
        if stabsel_lam > 0.0:
            param.lam = stabsel_lam
        else:
            param.lam = "theoretical"

    problem.model_selection.LAMfixed = lamfixed
    if lamfixed:
        param = problem.model_selection.LAMfixedparameters
        param.numerical_method = lamfixed_numerical_method
        param.true_lam = lamfixed_true_lam
        # Negative lambda is the sentinel for the 'theoretical' choice.
        if lamfixed_lam > 0.0:
            param.lam = lamfixed_lam
        else:
            param.lam = "theoretical"

    problem.solve()

    # Attach the full (pre-alignment) encoded labels for visualisation.
    cy = complete_y.values
    problem.data.complete_y = 2 * (cy == cy[0]) - 1
    problem.data.complete_labels = list(complete_y.index)
    problem.data.training_labels = training_labels
    return problem
def dendrogram_heatmap(output_dir: str, table: pd.DataFrame, tree: TreeNode, metadata: qiime2.CategoricalMetadataColumn, pseudocount: float = 0.5, ndim: int = 10, method: str = 'clr', color_map: str = 'viridis'): table, tree = match_tips(add_pseudocount(table, pseudocount), tree) nodes = [n.name for n in tree.levelorder() if not n.is_tip()] nlen = min(ndim, len(nodes)) numerator_color, denominator_color = '#fb9a99', '#e31a1c' highlights = pd.DataFrame([[numerator_color, denominator_color]] * nlen, index=nodes[:nlen]) if method == 'clr': mat = pd.DataFrame(clr(centralize(table)), index=table.index, columns=table.columns) elif method == 'log': mat = pd.DataFrame(np.log(table), index=table.index, columns=table.columns) c = metadata.to_series() table, c = match(table, c) # TODO: There are a few hard-coded constants here # will need to have some adaptive defaults set in the future fig = heatmap(mat, tree, c, highlights, cmap=color_map, highlight_width=0.01, figsize=(12, 8)) fig.savefig(os.path.join(output_dir, 'heatmap.svg')) fig.savefig(os.path.join(output_dir, 'heatmap.pdf')) css = r""" .square { float: left; width: 100px; height: 20px; margin: 5px; border: 1px solid rgba(0, 0, 0, .2); } .numerator { background: %s; } .denominator { background: %s; } """ % (numerator_color, denominator_color) index_fp = os.path.join(output_dir, 'index.html') with open(index_fp, 'w') as index_f: index_f.write('<html><body>\n') index_f.write('<h1>Dendrogram heatmap</h1>\n') index_f.write('<img src="heatmap.svg" alt="heatmap">') index_f.write('<a href="heatmap.pdf">') index_f.write('Download as PDF</a><br>\n') index_f.write('<style>%s</style>' % css) index_f.write('<div class="square numerator">' 'Numerator<br/></div>') index_f.write('<div class="square denominator">' 'Denominator<br/></div>') index_f.write('</body></html>\n')
def ancom(output_dir: str, table: pd.DataFrame,
          metadata: qiime2.CategoricalMetadataColumn,
          transform_function: str = 'clr',
          difference_function: str = None) -> None:
    """Render an ANCOM differential-abundance visualization.

    Runs scikit-bio's ANCOM on ``table`` grouped by ``metadata``, writes the
    results plus a volcano-plot Vega spec into ``output_dir``, and renders
    the 'ancom' HTML template.

    NOTE(review): this file contains a second, near-duplicate definition of
    ``ancom``; unlike that variant, this one does not filter non-finite
    fold-change values before building the Vega spec and keys the tooltip
    on ``datum['index']`` — confirm which definition is intended to win.

    Parameters
    ----------
    output_dir : str
        Directory the visualization files are written into.
    table : pd.DataFrame
        Feature table (samples x features).
    metadata : qiime2.CategoricalMetadataColumn
        Per-sample grouping column; must have a value for every sample in
        ``table``.
    transform_function : str
        Key into the module-level ``_transform_functions`` registry.
    difference_function : str, optional
        Key into ``_difference_functions``; chosen automatically from the
        number of groups when None.

    Raises
    ------
    ValueError
        If the metadata column is missing values for any sample in ``table``.
    """
    # Restrict the metadata to samples actually present in the table.
    metadata = metadata.filter_ids(table.index)
    if metadata.has_missing_values():
        missing_data_sids = metadata.get_ids(where_values_missing=True)
        missing_data_sids = ', '.join(sorted(missing_data_sids))
        raise ValueError('Metadata column is missing values for the '
                         'following samples. Values need to be added for '
                         'these samples, or the samples need to be removed '
                         'from the table: %s' % missing_data_sids)
    # ancom_results[0]: per-feature stats (W, reject flag);
    # ancom_results[1]: percentile abundances per group.
    ancom_results = skbio_ancom(table,
                                metadata.to_series(),
                                significance_test=f_oneway)
    ancom_results[0].sort_values(by='W', ascending=False, inplace=True)
    ancom_results[0].rename(columns={'reject': 'Reject null hypothesis'},
                            inplace=True)
    significant_features = ancom_results[0][
        ancom_results[0]['Reject null hypothesis']]
    context = dict()
    if not significant_features.empty:
        context['significant_features'] = q2templates.df_to_html(
            significant_features['W'].to_frame())
        context['percent_abundances'] = q2templates.df_to_html(
            ancom_results[1].loc[significant_features.index])
    # From here on ``metadata`` is the plain pandas Series of group labels.
    metadata = metadata.to_series()
    cats = list(set(metadata))
    transform_function_name = transform_function
    transform_function = _transform_functions[transform_function]
    transformed_table = table.apply(
        transform_function, axis=1, result_type='broadcast')

    if difference_function is None:
        if len(cats) == 2:
            difference_function = 'mean_difference'
        else:  # len(categories) > 2
            difference_function = 'f_statistic'
    _d_func = _difference_functions[difference_function]

    def diff_func(x):
        # Some difference functions return a (statistic, pvalue) tuple;
        # keep only the statistic.
        args = _d_func(*[x[metadata == c] for c in cats])
        if isinstance(args, tuple):
            return args[0]
        else:
            return args
    # effectively doing a groupby operation wrt to the metadata
    fold_change = transformed_table.apply(diff_func, axis=0)
    if not pd.isnull(fold_change).all():
        volcano_results = pd.DataFrame({transform_function_name: fold_change,
                                        'W': ancom_results[0].W})
        volcano_results = volcano_results.reset_index(drop=False)
        # Vega v4 volcano-plot specification (x: transformed difference,
        # y: ANCOM W statistic), embedded into the rendered template.
        spec = {
            '$schema': 'https://vega.github.io/schema/vega/v4.json',
            'width': 300,
            'height': 300,
            'data': [
                {'name': 'values',
                 'values': volcano_results.to_dict(orient='records')}],
            'scales': [
                {'name': 'xScale',
                 'domain': {'data': 'values',
                            'field': transform_function_name},
                 'range': 'width'},
                {'name': 'yScale',
                 'domain': {'data': 'values', 'field': 'W'},
                 'range': 'height'}],
            'axes': [
                {'scale': 'xScale', 'orient': 'bottom',
                 'title': transform_function_name},
                {'scale': 'yScale', 'orient': 'left', 'title': 'W'}],
            'marks': [
                {'type': 'symbol',
                 'from': {'data': 'values'},
                 'encode': {
                     'hover': {
                         'fill': {'value': '#FF0000'},
                         'opacity': {'value': 1}},
                     'enter': {
                         'x': {'scale': 'xScale',
                               'field': transform_function_name},
                         'y': {'scale': 'yScale', 'field': 'W'}},
                     'update': {
                         'fill': {'value': 'black'},
                         'opacity': {'value': 0.3},
                         'tooltip': {
                             'signal': "{{'title': datum['index'], '{0}': "
                                       "datum['{0}'], 'W': datum['W']}}".format(
                                           transform_function_name)}}}}]}
        context['vega_spec'] = json.dumps(spec)
    copy_tree(os.path.join(TEMPLATES, 'ancom'), output_dir)
    ancom_results[0].to_csv(os.path.join(output_dir, 'ancom.tsv'),
                            header=True, index=True, sep='\t')
    ancom_results[1].to_csv(os.path.join(output_dir,
                                         'percent-abundances.tsv'),
                            header=True, index=True, sep='\t')
    index = os.path.join(TEMPLATES, 'ancom', 'index.html')
    q2templates.render(index, output_dir, context=context)