def subsample(table: biom.Table, subsampling_depth: int,
              axis: str) -> biom.Table:
    if axis == 'feature':
        # we are transposing the table due to biocore/biom-format#759
        table = table.transpose()

    if len(table.ids()) < subsampling_depth:
        raise ValueError('The subsampling depth exceeds the number of '
                         'elements on the desired axis. The maximum depth '
                         'is: %d.' % len(table.ids()))

    # the axis is always 'sample' due to the above transpose
    table = table.subsample(subsampling_depth, axis='sample', by_id=True)

    # the inverted axis is always 'observation' due to the above transpose
    invaxis = 'observation'
    table.filter(lambda v, i, m: v.sum() > 0, axis=invaxis)

    if axis == 'feature':
        # reverse the transpose necessary due to biocore/biom-format#759
        table = table.transpose()

    if table.is_empty():
        raise ValueError('The subsampled table contains no samples or features'
                         ' (samples/features that sum to zero after filtering'
                         ' are automatically removed). It may be a good idea'
                         ' to double check that your table is valid/nonempty.')

    return table
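# Example usage (an illustrative sketch, not part of the original module):
# randomly keep two of the four samples in a small table using subsample()
# defined above. The IDs and counts below are made up for demonstration.
import numpy as np
import biom

example = biom.Table(np.array([[0, 1, 3, 2],
                               [1, 1, 2, 0],
                               [4, 0, 0, 2]]),
                     observation_ids=['O1', 'O2', 'O3'],
                     sample_ids=['S1', 'S2', 'S3', 'S4'])
kept = subsample(example, subsampling_depth=2, axis='sample')
# 'kept' retains two randomly chosen samples; observations that sum to zero
# after subsampling are removed.
print(kept.ids(axis='sample'))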
def filter_features(table: biom.Table,
                    tree: NewickFormat) -> (biom.Table, biom.Table):
    # load the insertion tree
    tree = skbio.TreeNode.read(str(tree))

    # collect all tips = inserted fragments + reference taxa names
    fragments_tree = {
        str(tip.name)
        for tip in tree.tips()
        if tip.name is not None}

    # collect all fragments/features from table
    fragments_table = set(map(str, table.ids(axis='observation')))

    if len(fragments_table & fragments_tree) <= 0:
        raise ValueError('Not a single fragment of your table is part of your'
                         ' tree. The resulting table would be empty.')

    tbl_positive = table.filter(fragments_table & fragments_tree,
                                axis='observation', inplace=False)
    tbl_negative = table.filter(fragments_table - fragments_tree,
                                axis='observation', inplace=False)

    # print some information for quality control,
    # which the user can request via --verbose
    results = pd.DataFrame(
        data={'kept_reads': tbl_positive.sum(axis='sample'),
              'removed_reads': tbl_negative.sum(axis='sample')},
        index=tbl_positive.ids())
    results['removed_ratio'] = results['removed_reads'] / \
        (results['kept_reads'] + results['removed_reads'])

    return (tbl_positive, tbl_negative)
def filter_table(table: biom.Table, tree: skbio.TreeNode) -> biom.Table:
    """ Filter table to remove feature ids that are not tip ids in tree """
    tip_ids = set([t.name for t in tree.tips()])
    feature_ids = set(table.ids(axis='observation'))
    # ids_to_keep can only include ids that are in table
    ids_to_keep = tip_ids & feature_ids
    table.filter(ids_to_keep, axis='observation', inplace=True)
    return table
def filter_features_conditionally(table: biom.Table,
                                  abundance: float,
                                  prevalence: float,
                                  ) -> biom.Table:
    """
    Jointly filters features on relative abundance and prevalence thresholds.
    """
    num_observations, num_samples = table.shape
    prevalence = prevalence * num_samples

    # Calculates the filtering parameters on the original table
    def _filter_f(values, id_, metadata):
        return (values >= abundance).sum() >= prevalence

    # Normalize the table to get the prevalence.
    # Copy is because biom really wants to normalize the original table. By
    # copying and not using inplace, the original table is preserved.
    # Redundant, but better safe than sorry.
    table_norm = table.copy().norm(axis='sample', inplace=False)
    table_norm.filter(_filter_f, axis='observation', inplace=True)
    filter_ids = table_norm.ids(axis='observation')

    new_table = table.filter(filter_ids, axis='observation', inplace=False)

    return new_table
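# Example usage (illustrative only): with abundance=0.01 and prevalence=0.5,
# a feature is kept if it reaches at least 1% relative abundance in at least
# half of the samples. The small table below is made up for demonstration.
import numpy as np
import biom

demo = biom.Table(np.array([[90, 95, 80],
                            [10, 5, 20],
                            [0, 0, 1]]),
                  observation_ids=['F1', 'F2', 'F3'],
                  sample_ids=['S1', 'S2', 'S3'])
filtered = filter_features_conditionally(demo, abundance=0.01, prevalence=0.5)
# F3 is dropped: it exceeds 1% relative abundance in only one of three samples.
print(filtered.ids(axis='observation'))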
def rpca(
        table: biom.Table,
        rank: int = 3,
        min_sample_count: int = 500,
        min_feature_count: int = 10,
        iterations: int = 5) -> (skbio.OrdinationResults,
                                 skbio.DistanceMatrix):
    """Runs RPCA with an rclr preprocessing step."""

    # filter samples to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T.drop_duplicates()
    table = table.T[table.sum() > min_feature_count].T

    # rclr preprocessing and OptSpace (RPCA)
    opt = OptSpace(rank=rank, iteration=iterations).fit(
        rclr().fit_transform(table.copy()))
    rename_cols = {i - 1: 'PC' + str(i) for i in range(1, rank + 1)}

    # Feature Loadings
    feature_loading = pd.DataFrame(opt.feature_weights, index=table.columns)
    feature_loading = feature_loading.rename(columns=rename_cols)
    feature_loading.sort_values('PC1', inplace=True, ascending=True)

    # Sample Loadings
    sample_loading = pd.DataFrame(opt.sample_weights, index=table.index)
    sample_loading = sample_loading.rename(columns=rename_cols)

    # % var explained
    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=list(rename_cols.values()))
    # eigenvalues
    eigvals = pd.Series(opt.eigenvalues,
                        index=list(rename_cols.values()))

    # if the rank is two add PC3 of zeros
    if rank == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res
def plot(output_dir, table: biom.Table, metadata: q2.Metadata,
         case_where: str, control_where: str,
         feature_tree: skbio.TreeNode = None):
    # write out the provided tree, if any, to a scratch location
    if feature_tree is not None:
        with open('/tmp/tree.nwk', 'w') as fh:
            feature_tree.write(fh)

    copy_tree(os.path.join(PLOT, 'assets', 'dist'), output_dir)
    data_dir = os.path.join(output_dir, 'data')
    os.mkdir(data_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))

    if feature_tree is not None:
        feature_tree = shear_no_prune(feature_tree, features)
    else:
        feature_tree = TreeNode()

    tree_data = tree_to_array(feature_tree)
    idx, = np.where(np.asarray(tree_data['children']) == 0)
    tree_data['lookup'] = dict(zip(map(str, idx), range(len(idx))))

    tip_order = np.asarray(tree_data['names'])[idx]
    table = table.sort_order(tip_order, axis='observation')
    table = table.sort_order(case_samples + control_samples, axis='sample')

    with open(os.path.join(data_dir, 'packed_table.jsonp'), 'w') as fh:
        fh.write('LOAD_PACKED_TABLE(')
        fh.write(json.dumps(table_to_b64pa(table)))
        fh.write(');')

    with open(os.path.join(data_dir, 'tree.jsonp'), 'w') as fh:
        fh.write('LOAD_TREE(')
        fh.write(json.dumps(tree_data))
        fh.write(');')
def pad_features_in_test_data(train_table: biom.Table,
                              test_table: biom.Table) -> biom.Table:
    '''
    Align the features of the train and test tables by zero-padding features
    that exist only in the train table into the test table.

    Parameters
    ----------
    train_table: biom.Table
        A biom table with train data
    test_table: biom.Table
        A biom table with test data

    Returns
    -------
    new_test_table: biom.Table
        A biom table with the updated test data, with a set of features
        identical to that of the train table.
    '''
    train_feature_ids = train_table.ids(axis='observation')
    test_feature_ids = test_table.ids(axis='observation')

    # biom.Table.shape is (n_observations, n_samples)
    n_samples = test_table.shape[1]
    sample_ids = test_table.ids(axis='sample')

    train_uniq_f = list(set(train_feature_ids) - set(test_feature_ids))
    shared_f = set(train_feature_ids).intersection(set(test_feature_ids))

    # create a zero matrix for all features that exist only in the train table
    padding_table = biom.Table(np.zeros((len(train_uniq_f), n_samples)),
                               train_uniq_f, sample_ids)

    # drop test-table features that are absent from the train table
    test_table.filter(shared_f, axis='observation')
    n_filtered_features = test_table.shape[0]
    if n_filtered_features == 0:
        raise ValueError('No feature overlap between train and test table! '
                         'Check the feature-format consistency between '
                         'tables!')

    # merge the two tables
    new_test_table = test_table.merge(padding_table)

    return new_test_table
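# Example usage (a sketch with made-up IDs): pad a test table so it carries
# the exact feature set of the train table, with zeros for features the test
# data never observed.
import numpy as np
import biom

train = biom.Table(np.array([[5, 0], [2, 3], [0, 7]]),
                   observation_ids=['F1', 'F2', 'F3'],
                   sample_ids=['trainA', 'trainB'])
test = biom.Table(np.array([[1, 4], [9, 0]]),
                  observation_ids=['F1', 'F4'],
                  sample_ids=['testA', 'testB'])
padded = pad_features_in_test_data(train, test)
# F2 and F3 are added to the test table as all-zero rows; F4 (absent from
# the train table) is dropped.
print(sorted(padded.ids(axis='observation')))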
def collapse_biom(table: biom.Table, mapping: dict, normalize=False):
    """Collapse a BIOM table in many-to-many mode.

    Parameters
    ----------
    table : biom.Table
        Table to collapse.
    mapping : dict of list of str
        Source-to-target(s) mapping.
    normalize : bool, optional
        Whether to normalize per-target counts by the number of targets per
        source.

    Returns
    -------
    biom.Table
        Collapsed BIOM table.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    # filter table features
    table = table.filter(lambda data, id_, md: id_ in mapping,
                         axis='observation', inplace=False)

    # stop if no feature left
    if table.is_empty():
        return table

    # add mapping to table metadata
    table.add_metadata({k: dict(part=v) for k, v in mapping.items()},
                       axis='observation')

    # determine collapsing method
    kwargs = dict(norm=False, one_to_many=True, axis='observation',
                  one_to_many_mode=('divide' if normalize else 'add'))

    # collapse table in many-to-many mode
    table = table.collapse(lambda id_, md: zip(md['part'], md['part']),
                           **kwargs)

    # round to integers
    if normalize:
        round_biom(table)

    # clean up
    table.del_metadata(keys=['Path'])
    return table
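# Example usage (a sketch; the mapping and counts are invented): collapse two
# source features onto overlapping targets in many-to-many mode using
# collapse_biom() defined above. With normalize=False, a source shared by
# several targets contributes its full count to each of them.
import numpy as np
import biom

src = biom.Table(np.array([[4, 2], [6, 0]]),
                 observation_ids=['geneA', 'geneB'],
                 sample_ids=['S1', 'S2'])
mapping = {'geneA': ['pathway1'], 'geneB': ['pathway1', 'pathway2']}
collapsed = collapse_biom(src, mapping)
# pathway1 receives counts from both genes; pathway2 only from geneB.
print(collapsed.to_tsv())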
def generate_per_sample_biom(biom_file, limit):
    """Generate per-sample BIOM files

    Parameters
    ----------
    biom_file : str
        A filepath to a BIOM table
    limit : int or None
        Limit the number of tables to load

    Yields
    ------
    str
        The sample ID
    str
        The table in BIOM Format v1.0
    str
        The table in the classic OTU table format
    """
    table = load_table(biom_file)
    obs_ids = table.ids(axis='observation')
    obs_md = table.metadata(axis='observation')

    if limit is None:
        limit = np.inf

    count = 0
    for v, sample, _ in table.iter():
        if count >= limit:
            break

        single_sample = Table(v[:, np.newaxis], obs_ids, [sample], obs_md)
        single_sample.filter(lambda v_, i, md: v_ > 0, axis='observation')
        biomv1 = single_sample.to_json('AG')
        biomtxt = single_sample.to_tsv(
            header_key='taxonomy',
            header_value='taxonomy',
            metadata_formatter=lambda x: '; '.join(x))
        yield (sample, biomv1, biomtxt)
        count += 1
def match_table(tree: TreeNode, feature_table: biom.Table) -> biom.Table:
    '''
    Filters the feature table to retain the features present in the tree.

    Parameters
    ----------
    tree : TreeNode
        skbio TreeNode object representing the tree of relatedness between
        molecules
    feature_table : biom.Table
        feature table with features as observations and samples as columns

    Raises
    ------
    ValueError
        If ``feature_table`` has no features
        If ``filtered_feature_table`` is empty

    Warns
    -----
    UserWarning
        If ``tree`` tips are not a subset of feature names in
        ``feature_table``

    Returns
    -------
    biom.Table
        filtered feature table that contains only the features present in
        the tree
    '''
    if feature_table.shape[0] == 0:
        raise ValueError("There are no features in the feature table!")

    all_features = set(feature_table.ids(axis='observation'))
    tip_names = {node.name for node in tree.tips()}

    if not tip_names.issubset(all_features):
        extra_tips = tip_names - tip_names.intersection(all_features)
        warnings.warn(
            UserWarning('The following tips were not '
                        'found in the feature table:\n' +
                        ', '.join([str(i) for i in extra_tips])))

    common_features = list(all_features.intersection(tip_names))
    filtered_feature_table = feature_table.filter(common_features,
                                                  axis='observation',
                                                  inplace=False)
    return filtered_feature_table
def cluster_features_closed_reference(
        sequences: DNAFASTAFormat, table: biom.Table,
        reference_sequences: DNAFASTAFormat, perc_identity: float,
        strand: str = 'plus', threads: int = 1
        ) -> (biom.Table, DNAFASTAFormat, DNAFASTAFormat):

    table_ids = set(table.ids(axis='observation'))
    sequence_ids = {
        e.metadata['id']
        for e in skbio.io.read(str(sequences), constructor=skbio.DNA,
                               format='fasta')
    }
    _error_on_nonoverlapping_ids(table_ids, sequence_ids)
    matched_seqs, unmatched_seqs = DNAFASTAFormat(), DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as fasta_with_sizes, \
            tempfile.NamedTemporaryFile() as out_uc, \
            tempfile.NamedTemporaryFile() as tmp_unmatched_seqs:
        _fasta_with_sizes(str(sequences), fasta_with_sizes.name, table)
        cmd = [
            'vsearch',
            '--usearch_global', fasta_with_sizes.name,
            '--id', str(perc_identity),
            '--db', str(reference_sequences),
            '--uc', out_uc.name,
            '--strand', str(strand),
            '--qmask', 'none',  # ensures no lowercase DNA chars
            '--notmatched', tmp_unmatched_seqs.name,
            '--threads', str(threads),
        ]
        run_command(cmd)
        out_uc.seek(0)

        # It is possible for there to be no unmatched sequences --- if that
        # is the case, skip the following clean-up.
        if os.path.getsize(tmp_unmatched_seqs.name) > 0:
            # We don't really need to sort the matched sequences, this
            # is just to let us use --xsize, which strips the counts from
            # the Feature ID. It would be more ideal if --usearch_global,
            # above, let us pass in --xsize, but unfortunately it isn't
            # supported.
            cmd = [
                'vsearch',
                '--sortbysize', tmp_unmatched_seqs.name,
                '--xsize',
                '--output', str(unmatched_seqs),
            ]
            run_command(cmd)

        try:
            conn = _uc_to_sqlite(out_uc)
            collapse_f = _collapse_f_from_sqlite(conn)
            _fasta_from_sqlite(conn, str(sequences), str(matched_seqs))
        except ValueError:
            raise VSearchError('No matches were identified to '
                               'reference_sequences. This can happen if '
                               'sequences are not homologous to '
                               'reference_sequences, or if sequences are '
                               'not in the same orientation as reference_'
                               'sequences (i.e., if sequences are reverse '
                               'complemented with respect to reference '
                               'sequences). Sequence orientation can be '
                               'adjusted with the strand parameter.')

    unmatched_ids = [
        e.metadata['id']
        for e in skbio.io.read(open(str(unmatched_seqs)),
                               constructor=skbio.DNA, format='fasta')
    ]
    table.filter(ids_to_keep=unmatched_ids, invert=True, axis='observation',
                 inplace=True)
    table = table.collapse(collapse_f, norm=False, min_group_size=1,
                           axis='observation',
                           include_collapsed_metadata=False)

    return table, matched_seqs, unmatched_seqs
def percentile_normalize(table: biom.Table,
                         metadata: qiime2.MetadataColumn,
                         batch: qiime2.MetadataColumn = None,
                         n_control_thresh: int = 10,
                         otu_thresh: float = 0.3) -> biom.Table:
    """
    Converts an input table with cases and controls into percentiles
    of control samples.

    Parameters
    ----------
    table : biom.Table
        Feature table with relative abundances. Samples are in columns,
        features (i.e. OTUs) are in rows.
    metadata : qiime2.CategoricalMetadataColumn
        metadata column with samples labeled as "case" or "control".
        All samples with either label are returned, normalized to the
        equivalent percentile in "control" samples.
    batch : qiime2.CategoricalMetadataColumn
        metadata column with the different batches labeled. Percentile
        normalization will be performed within each batch, and the output
        tables will be concatenated together. You can use this to normalize
        multiple studies at once by first merging the original feature
        table, adding a study ID column in the merged metadata, and then
        calling percentile normalization with this option.
    n_control_thresh : int [default=10]
        Minimum number of controls accepted to perform percentile
        normalization. Because the transformation converts abundances in
        controls to a uniform distribution, we *highly* discourage
        performing percentile normalization on datasets with fewer than 30
        controls, and certainly not fewer than 10 (the default value).
        If you have fewer controls than `n_control_thresh`, the
        normalization will return an error.
    otu_thresh : float [default=0.3]
        The OTU filtering threshold: OTUs must be present in at least
        otu_thresh fraction of cases OR controls, otherwise they are
        discarded and not percentile normalized. This method does not
        perform well with very sparse OTUs, so we do not recommend lowering
        this threshold below 0.3. otu_thresh should be in [0, 1].

    Returns
    -------
    norm_biom : biom.Table
        A biom table with the normalized data, only including the samples
        that were labeled as either "case" or "control", and the OTUs
        which passed the otu_thresh threshold.
    """
    # Filter metadata to only include IDs present in the table.
    # Also ensures every distance table ID is present in the metadata.
    metadata = metadata.filter_ids(table.ids(axis='sample'))
    metadata = metadata.drop_missing_values()

    # filter the table to exclude samples that were dropped from
    # the metadata due to missing values
    table = table.filter(metadata.ids)

    metadata = metadata.to_series()

    # Convert the biom Table into a dense pandas dataframe.
    # Transpose so samples are in rows and OTUs/features in columns.
    df = table.to_dataframe().to_dense().T

    # Set up a list of metadata series, one per batch
    batches_to_norm = []
    if batch is not None:
        batch = batch.filter_ids(table.ids(axis='sample'))
        batch = batch.drop_missing_values()
        batch = batch.to_series()
        for g, one_batch in batch.groupby(batch):
            batches_to_norm.append(metadata.loc[one_batch.index])
    else:
        batches_to_norm.append(metadata)

    norm_dfs = []
    for meta in batches_to_norm:
        # Get case and control samples from metadata
        control_samples = meta[meta == "control"].index.tolist()
        case_samples = meta[meta == "case"].index.tolist()

        # Check that there are cases and controls
        if len(control_samples) == 0:
            if len(case_samples) == 0:
                # neither cases nor controls are present
                raise ValueError(
                    'There are no case or control samples in your data. '
                    'Check the metadata column for "case" and "control" '
                    'labels.')
            # no controls are present
            raise ValueError(
                'There are no control samples in your data. Check the '
                'metadata column for "control" labels.')
        # no cases are present
        elif len(case_samples) == 0:
            raise ValueError(
                'There are no case samples in your data. Check the '
                'metadata column for "case" labels.')

        # Make sure there are enough controls to perform normalization
        if len(control_samples) < n_control_thresh:
            if batch is not None:
                batch_err = (' in batch ' +
                             str(batch.loc[meta.index].unique()[0]))
            else:
                batch_err = ''
            raise ValueError(
                "There aren't enough controls in your data." + batch_err +
                " (n_control_thresh = {})".format(n_control_thresh))

        # Filter OTUs, replace zeros with random values, and
        # percentile normalize
        norm_df = _percentile_normalize_one_df(df, control_samples,
                                               case_samples, otu_thresh)
        norm_dfs.append(norm_df)

    # Merge all normalized data; keep all samples and all OTUs.
    # OTUs not present in one batch will be NaNs.
    norm_df = pd.concat(norm_dfs, axis=1)

    # Put this dataframe into biom format
    norm_biom = biom.Table(data=norm_df.values,
                           observation_ids=norm_df.index,
                           sample_ids=norm_df.columns)

    return norm_biom
def qarcoal(
    table: biom.Table,
    taxonomy: pd.DataFrame,
    num_string: str,
    denom_string: str,
    samples_to_use: Metadata = None,
    allow_shared_features: bool = False,
) -> pd.DataFrame:
    """Calculate sample-wise log-ratios of features based on taxonomy.

    Parameters
    ----------
    table: biom table with which to calculate log ratios
    taxonomy: pd.DataFrame with taxonomy information (should have a Taxon
        column in which features will be searched)
    num_string: numerator string to search for in taxonomy
    denom_string: denominator string to search for in taxonomy
    samples_to_use: Q2 Metadata file with samples to use. If provided,
        the feature table will be filtered to only consider samples
        present in this file. (optional)
    allow_shared_features: bool denoting handling of shared features
        between numerator and denominator. If False, an error is raised
        if features are shared between numerator and denominator. If True,
        shared features are allowed without throwing an error.

    Returns
    -------
    comparison_df: pd.DataFrame in the form:

        Sample-ID    Num_Sum    Denom_Sum    log_ratio
        S1           7          15           -0.762140
    """
    # biom table is features x samples
    if samples_to_use is not None:
        filt_samples = set(samples_to_use.to_dataframe().index)
        feat_table = table.filter(filt_samples, axis="sample", inplace=False)
        feat_table = feat_table.to_dataframe()
    else:
        feat_table = table.to_dataframe()

    # raise error if there are any negative counts in the feature table
    if feat_table.lt(0).any().any():
        raise ValueError("Feature table has negative counts!")

    tax_num_df, tax_denom_df = filter_and_join_taxonomy(
        feat_table,
        taxonomy,
        num_string,
        denom_string,
    )

    # if shared features are disallowed, check to make sure they don't occur
    # if allowed, this step can be skipped at the user's risk
    if not allow_shared_features:
        shared_features = set(tax_num_df.index) & set(tax_denom_df.index)
        if shared_features:
            raise ValueError("Shared features between num and denom!")

    tax_num_sample_sum = tax_num_df.sum(axis=0)
    tax_denom_sample_sum = tax_denom_df.sum(axis=0)

    comparison_df = pd.DataFrame.from_records(
        [tax_num_sample_sum, tax_denom_sample_sum],
        index=["Num_Sum", "Denom_Sum"],
    ).T
    comparison_df["log_ratio"] = comparison_df.apply(
        lambda x: np.log(x.Num_Sum / x.Denom_Sum), axis=1)
    comparison_df.index.name = "Sample-ID"

    return comparison_df
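# Worked check of the log_ratio column (illustration only): for the example
# row in the docstring above, a numerator sum of 7 and a denominator sum of
# 15 give log(7/15).
import numpy as np

num_sum, denom_sum = 7, 15
print(np.log(num_sum / denom_sum))  # approximately -0.762140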
def simple_plot(output_dir, table: biom.Table, feature_tree: skbio.TreeNode,
                metadata: q2.Metadata, case_where: str, control_where: str,
                n_transects: int = 10, stratify_by: str = None,
                mode: str = 'max'):
    print("Data extracted")
    layer_dir = os.path.join(output_dir, 'layers')
    rank_dir = os.path.join(output_dir, 'ranks')
    os.mkdir(layer_dir)
    os.mkdir(rank_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))
    get_pairs = comparisons(metadata, control_samples, case_samples,
                            stratify_by)

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))
    feature_tree = shear_no_prune(feature_tree, features)
    print("Extraneous features removed")

    for n in feature_tree.traverse():
        if not n.length:
            n.length = 0
    tree = tree_to_array(feature_tree, mode)
    print("Tree index created")

    possible_transects = len(np.unique(np.asarray(tree['distances'])))
    tree_length = tree['distances'][0]  # root of tree
    if n_transects > possible_transects:
        n_transects = possible_transects
        print("Only %d transects exist, using that instead" % n_transects)

    transects = list(np.linspace(0, tree_length, num=n_transects))
    print("Will transect at: %s" % ", ".join(map(str, transects)))

    figure_gen = prepare_plot(tree_length)
    figure_gen.send(None)  # initialize co-routine

    colors = []
    points, _ = pairwise_components(table, get_pairs())
    color_fig, highlight_fig, color = figure_gen.send((points, None))
    color_fig.savefig(os.path.join(layer_dir, 'original.png'),
                      transparent=True)
    plt.close(color_fig)
    highlight_fig.savefig(os.path.join(layer_dir, 'original.h.png'),
                          transparent=True)
    plt.close(highlight_fig)
    colors.append(color)

    rank_files = []
    collapsed_groups = pd.DataFrame()
    for distance in transects:
        collapsed_table, collapsed_counts, groups = group_by_transect(
            table, tree, distance)
        collapsed_groups[groups.name] = groups
        print("Table collapsed at transect %s" % distance)

        points, ranks = pairwise_components(collapsed_table, get_pairs())
        filename = write_ranks(rank_dir, collapsed_counts, ranks, distance)
        rank_files.append(filename)

        color_fig, highlight_fig, color = figure_gen.send((points, distance))
        colors.append(color)
        color_fig.savefig(os.path.join(layer_dir, 'T_%s.png' % distance),
                          transparent=True)
        plt.close(color_fig)
        highlight_fig.savefig(os.path.join(layer_dir, 'T_%s.h.png' % distance),
                              transparent=True)
        plt.close(highlight_fig)

    print("Finalizing visualization")
    figure = figure_gen.send((None, None))
    figure.savefig(os.path.join(layer_dir, 'trajectory.png'),
                   transparent=True)
    plt.close(figure)

    background = next(figure_gen)
    background.savefig(os.path.join(layer_dir, 'bg.png'), transparent=True)
    plt.close(background)

    with open(os.path.join(output_dir, 'collapsed_groups.tsv'), 'w') as fh:
        collapsed_groups.to_csv(fh, sep='\t')
    with open(os.path.join(output_dir, 'index.html'), 'w') as fh:
        template = Environment(loader=BaseLoader).from_string(TEMPLATE)
        fh.write(template.render({
            'legend': list(zip(
                ['original'] + ['T_%s' % d for d in transects] +
                ['trajectory'],
                list(map(to_hex, colors)) + ['red'])),
            'filenames': rank_files
        }))
def simulate_samples(taxonomy_samples, fold, taxon_defaults, ref_taxa,
                     ref_seqs):
    with open(join(fold, 'sample_test.json')) as fp:
        test_samples = json.load(fp)
    test_samples = extract_sample(test_samples, taxonomy_samples)
    ref_taxa, _ = load_references(ref_taxa, ref_seqs)
    with open(join(fold, 'seq_test.json')) as fp:
        test_seqs = json.load(fp)
    test_taxa = {ref_taxa[sid] for sid in test_seqs}

    hits = [0]
    direct_remaps = [0]
    indirect_remaps = [0]

    def collapse(taxon, _):
        if taxon in test_taxa:
            hits[0] += 1
            return taxon
        if taxon_defaults[taxon][0] in test_taxa:
            direct_remaps[0] += 1
            return taxon_defaults[taxon][0]
        for try_taxon in taxon_defaults[taxon][1:]:
            if try_taxon in test_taxa:
                indirect_remaps[0] += 1
                return try_taxon

    test_samples = test_samples.collapse(
        collapse, norm=False, axis='observation')
    logging.info('Test taxon remaps')
    logging.info(str(hits[0]) + ' hits')
    logging.info(str(direct_remaps[0]) + ' direct remaps')
    logging.info(str(indirect_remaps[0]) + ' indirect remaps')

    samples = []
    obs_ids = []
    expected = []
    taxa_ref = defaultdict(list)
    for sid, taxon in ref_taxa.items():
        if sid in test_seqs:
            taxa_ref[taxon].append(sid)
    for abundances, taxon, _ in test_samples.iter(axis='observation'):
        taxa = taxa_ref[taxon]
        n_taxa = len(taxa)
        obs_ids.extend(taxa)
        expected.extend(ref_taxa[sid] for sid in taxa)
        # split each taxon's abundances evenly across its reference sequences
        taxa_samples = numpy.vstack([abundances // n_taxa] * n_taxa)
        # then distribute the remainders one count at a time, cycling over
        # the sequences
        taxa = cycle(range(n_taxa))
        for i, r in enumerate(abundances % n_taxa):
            for t, _ in zip(taxa, range(int(r))):
                taxa_samples[t, i] += 1
        assert (taxa_samples.sum(axis=0) == abundances).all()
        samples.append(taxa_samples)
    test_samples = Table(numpy.vstack(samples), obs_ids, test_samples.ids())
    test_samples.filter(lambda v, _, __: v.sum() > 1e-9,
                        axis='observation', inplace=True)
    return (test_samples, dict(zip(obs_ids, expected)))
def trim_dada2_posthoc(table: biom.Table,
                       representative_sequences: pd.Series,
                       trim_length: int = 0,
                       hashed_feature_ids: bool = True
                       ) -> (biom.Table, pd.Series):
    """
    Trims ASVs generated by DADA2 to a standard length

    Parameters
    ----------
    table : biom.Table
        The feature table
    representative_sequences : pd.Series
        The sequences which correspond to the ASV table
    trim_length : int
        The length to trim the ASVs. If the length is 0, the minimum
        sequence length will be used.
    hashed_feature_ids : bool
        Whether feature and sequence IDs should be hashed.
    """
    # Trims the sequences
    seq_length = representative_sequences.apply(lambda x: len(x))

    if trim_length == 0:
        trim_length = seq_length.min()

    if (seq_length < trim_length).any():
        warnings.warn("There are ASVs shorter than the trim length. "
                      "These sequences will be discarded.", UserWarning)

    rep_seqs = representative_sequences.astype(str)
    rep_seqs = rep_seqs.loc[seq_length >= trim_length].copy()
    rep_seqs = pd.DataFrame(data=[rep_seqs.apply(lambda x: x[:trim_length])],
                            index=['sequence']).T

    # Collapses the table based on the trimmed sequences
    table.filter(lambda v, id_, md: id_ in rep_seqs.index,
                 axis='observation', inplace=True)
    table.add_metadata(
        rep_seqs.loc[table.ids(axis='observation')].to_dict(orient='index'),
        axis='observation')
    table2 = table.collapse(lambda id_, md: md['sequence'], norm=False,
                            axis='observation')

    seqs2 = rep_seqs.drop_duplicates()['sequence'].copy()

    if hashed_feature_ids:
        table2.update_ids({seq_: _hash_seq(seq_)
                           for seq_ in table2.ids(axis='observation')},
                          axis='observation', inplace=True)
        seqs2.rename({id_: _hash_seq(seq_) for id_, seq_ in seqs2.items()},
                     inplace=True)
    else:
        seqs2.rename({id_: seq_ for id_, seq_ in seqs2.items()},
                     inplace=True)

    return table2, seqs2
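# Example usage (a sketch with toy sequences): trim two ASVs that share a
# 4 nt prefix and collapse them into a single feature. hashed_feature_ids is
# set to False here so the example does not depend on the _hash_seq helper.
import numpy as np
import pandas as pd
import biom

asv_table = biom.Table(np.array([[3, 1], [2, 5]]),
                       observation_ids=['asv1', 'asv2'],
                       sample_ids=['S1', 'S2'])
asv_seqs = pd.Series({'asv1': 'ACGTT', 'asv2': 'ACGTA'})
trimmed_table, trimmed_seqs = trim_dada2_posthoc(
    asv_table, asv_seqs, trim_length=4, hashed_feature_ids=False)
# Both ASVs trim to 'ACGT', so the collapsed table has one feature whose
# counts are the column sums of the originals.
print(trimmed_table.to_tsv())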
def maturity_index(output_dir: str, table: biom.Table,
                   metadata: qiime2.Metadata, column: str, group_by: str,
                   control: str, estimator: str = defaults['estimator_r'],
                   n_estimators: int = defaults['n_estimators'],
                   test_size: float = defaults['test_size'],
                   step: float = defaults['step'], cv: int = defaults['cv'],
                   random_state: int = None,
                   n_jobs: int = defaults['n_jobs'],
                   parameter_tuning: bool = True,
                   optimize_feature_selection: bool = True,
                   stratify: str = False, maz_stats: bool = True,
                   missing_samples: str = defaults['missing_samples']) -> None:

    # select estimator
    param_dist, estimator = _select_estimator(estimator, n_jobs, n_estimators)
    estimator = Pipeline([('dv', DictVectorizer()), ('est', estimator)])
    param_dist = _map_params_to_pipeline(param_dist)

    # split input data into control and treatment groups
    table, metadata = _load_data(table, metadata,
                                 missing_samples=missing_samples,
                                 extract=False)
    fancy_index = metadata[group_by] == control
    md_control = metadata[fancy_index]
    table_control = table.filter(md_control.index, inplace=False)

    # train model on control data
    estimator, cm, accuracy, importances = split_optimize_classify(
        table_control, md_control, column, estimator, output_dir,
        random_state=random_state, n_jobs=n_jobs, test_size=test_size,
        step=step, cv=cv, parameter_tuning=parameter_tuning,
        optimize_feature_selection=optimize_feature_selection,
        param_dist=param_dist, calc_feature_importance=True, load_data=False,
        scoring=mean_squared_error, stratify=stratify, classification=False,
        missing_samples='ignore')

    # predict treatment data
    index = importances.index
    table = _extract_features(table)
    table = [{k: r[k] for k in r.keys() & index} for r in table]
    y_pred = estimator.predict(table)
    predicted_column = 'predicted {0}'.format(column)
    metadata[predicted_column] = y_pred

    # calculate MAZ score
    metadata = _maz_score(metadata, predicted_column, column, group_by,
                          control)

    # visualize
    table = estimator.named_steps.dv.transform(table).todense()
    table = pd.DataFrame(table, index=metadata.index,
                         columns=estimator.named_steps.dv.get_feature_names())
    _visualize_maturity_index(table, metadata, group_by, column,
                              predicted_column, importances, estimator,
                              accuracy, output_dir, maz_stats=maz_stats)
def ctf_helper(table: biom.Table,
               sample_metadata: DataFrame,
               individual_id_column: str,
               state_columns: list,
               n_components: int = DEFAULT_COMP,
               min_sample_count: int = DEFAULT_MSC,
               min_feature_count: int = DEFAULT_MFC,
               max_iterations_als: int = DEFAULT_MAXITER,
               max_iterations_rptm: int = DEFAULT_MAXITER,
               n_initializations: int = DEFAULT_MAXITER,
               feature_metadata: DataFrame = DEFFM
               ) -> (dict, OrdinationResults, dict, tuple):
    """ Runs Compositional Tensor Factorization (CTF). """

    # validate the sample metadata using q2 as a wrapper
    if sample_metadata is not None and not isinstance(sample_metadata,
                                                      DataFrame):
        sample_metadata = sample_metadata.to_dataframe()
    keep_cols = state_columns + [individual_id_column]
    all_sample_metadata = sample_metadata.drop(keep_cols, axis=1)
    sample_metadata = sample_metadata[keep_cols]

    # validate the feature metadata using q2 as a wrapper
    if feature_metadata is not None and not isinstance(feature_metadata,
                                                       DataFrame):
        feature_metadata = feature_metadata.to_dataframe()

    # match the data (borrowed in part from gneiss.util.match)
    subtablefids = table.ids('observation')
    subtablesids = table.ids('sample')
    if len(subtablesids) != len(set(subtablesids)):
        raise ValueError('Data-table contains duplicate sample IDs')
    if len(subtablefids) != len(set(subtablefids)):
        raise ValueError('Data-table contains duplicate feature IDs')
    submetadataids = set(sample_metadata.index)
    subtablesids = set(subtablesids)
    subtablefids = set(subtablefids)
    if feature_metadata is not None:
        submetadatafeat = set(feature_metadata.index)
        fidx = subtablefids & submetadatafeat
        if len(fidx) == 0:
            raise ValueError("No more features left. Check to make sure "
                             "that the feature names between "
                             "`feature-metadata` and `table` are consistent")
        feature_metadata = feature_metadata.reindex(fidx)
    sidx = subtablesids & submetadataids
    if len(sidx) == 0:
        raise ValueError("No more samples left. Check to make sure that "
                         "the sample names between `sample-metadata` and "
                         "`table` are consistent")
    if feature_metadata is not None:
        table.filter(list(fidx), axis='observation', inplace=True)
    table.filter(list(sidx), axis='sample', inplace=True)
    sample_metadata = sample_metadata.reindex(sidx)

    # filter and import table
    for axis, min_sum in zip(['sample', 'observation'],
                             [min_sample_count, min_feature_count]):
        table = table.filter(table.ids(axis)[table.sum(axis) >= min_sum],
                             axis=axis, inplace=True)

    # table to dataframe
    table = DataFrame(table.matrix_data.toarray(),
                      table.ids('observation'),
                      table.ids('sample'))

    # tensor building
    tensor = build()
    tensor.construct(table, sample_metadata,
                     individual_id_column, state_columns)

    # factorize
    TF = TensorFactorization(
        n_components=n_components,
        max_als_iterations=max_iterations_als,
        max_rtpm_iterations=max_iterations_rptm,
        n_initializations=n_initializations).fit(rclr(tensor.counts))

    # label tensor loadings
    TF.label(tensor, taxonomy=feature_metadata)

    # if n_components is two, add a PC3 of zeros
    # this is referenced as an issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    if n_components == 2:
        TF.subjects.loc[:, 'PC3'] = [0] * len(TF.subjects.index)
        TF.features.loc[:, 'PC3'] = [0] * len(TF.features.index)
        TF.proportion_explained['PC3'] = 0
        TF.eigvals['PC3'] = 0

    # save ordination results
    short_method_name = 'CTF_Biplot'
    long_method_name = 'Compositional Tensor Factorization Biplot'
    # only keep PC -- other tools merge metadata
    keep_PC = [col for col in TF.features.columns if 'PC' in col]
    subj_ordin = OrdinationResults(
        short_method_name,
        long_method_name,
        TF.eigvals,
        samples=TF.subjects[keep_PC].dropna(axis=0),
        features=TF.features[keep_PC].dropna(axis=0),
        proportion_explained=TF.proportion_explained)

    # save distance matrix for each condition
    distances = {}
    state_ordn = {}
    subject_trajectories = {}
    feature_trajectories = {}
    for condition, cond, dist, straj, ftraj in zip(tensor.conditions,
                                                   TF.conditions,
                                                   TF.subject_distances,
                                                   TF.subject_trajectory,
                                                   TF.feature_trajectory):
        # match distances to metadata
        ids = straj.index
        ind_dict = dict((ind, ind_i) for ind_i, ind in enumerate(ids))
        inter = set(ind_dict).intersection(sample_metadata.index)
        indices = sorted([ind_dict[ind] for ind in inter])
        dist = dist[indices, :][:, indices]
        distances[condition] = skbio.stats.distance.DistanceMatrix(
            dist, ids=ids[indices])
        # fix conditions
        if n_components == 2:
            cond['PC3'] = [0] * len(cond.index)
        cond = OrdinationResults(
            short_method_name,
            long_method_name,
            TF.eigvals,
            samples=cond[keep_PC].dropna(axis=0),
            features=TF.features[keep_PC].dropna(axis=0),
            proportion_explained=TF.proportion_explained)
        state_ordn[condition] = cond
        # add the sample metadata before returning output;
        # additionally, only keep metadata with trajectory output available
        pre_merge_cols = list(straj.columns)
        straj = concat([straj.reindex(all_sample_metadata.index),
                        all_sample_metadata],
                       axis=1, sort=True)
        straj = straj.dropna(subset=pre_merge_cols)
        # ensure index name for q2
        straj.index.name = "#SampleID"
        # save trajectories
        keep_PC_traj = [col for col in straj.columns if 'PC' in col]
        straj[keep_PC_traj] -= straj[keep_PC_traj].mean()
        ftraj[keep_PC_traj] -= ftraj[keep_PC_traj].mean()
        subject_trajectories[condition] = straj
        ftraj.index = ftraj.index.astype(str)
        feature_trajectories[condition] = ftraj

    return (state_ordn, subj_ordin, distances,
            subject_trajectories, feature_trajectories)
def rpca(
    table: biom.Table,
    n_components: Union[int, str] = DEFAULT_COMP,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    min_feature_frequency: float = DEFAULT_MFF,
    max_iterations: int = DEFAULT_OPTSPACE_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Runs RPCA with a matrix_rclr preprocessing step.

    This code will be run by both the standalone and QIIME 2 versions of
    gemelli.
    """
    # get shape of table
    n_features, n_samples = table.shape

    # filter samples to min seq. depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # filter features to min total counts
    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # filter features by N samples presence
    def frequency_filter(val, id_, md):
        return (np.sum(val > 0) / n_samples) > (min_feature_frequency / 100)

    # filter and import table for each filter above
    table = table.filter(observation_filter, axis='observation')
    table = table.filter(frequency_filter, axis='observation')
    table = table.filter(sample_filter, axis='sample')
    # table to dataframe
    table = pd.DataFrame(table.matrix_data.toarray(),
                         table.ids('observation'),
                         table.ids('sample')).T
    # check the table after filtering
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')

    # Robust-clr (matrix_rclr) preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(
        matrix_rclr(table))
    # get new n-comp when applicable
    n_components = opt.s.shape[0]
    # get PC column labels for the skbio OrdinationResults
    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
    # get completed matrix for centering
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    # center again around zero after completion
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    # re-factor the data
    u, s, v = svd(X)
    # only take n-components
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    # calc. the new variance using projection
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]
    # save the loadings
    feature_loading = pd.DataFrame(v, index=table.columns,
                                   columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index,
                                  columns=rename_cols)
    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if n_components is two, add a PC3 of zeros
    # this is referenced as an issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in gemelli -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res
def rpca(
    table: biom.Table,
    n_components: int = DEFAULT_RANK,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    max_iterations: int = DEFAULT_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Runs RPCA with an rclr preprocessing step.

    This code will be run by both the standalone and QIIME 2 versions of
    DEICODE.
    """
    # filter samples to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # filter and import table
    table = table.filter(observation_filter, axis='observation')
    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')

    # rclr preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(rclr(table))
    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    u, s, v = svd(X)
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]

    feature_loading = pd.DataFrame(v, index=table.columns,
                                   columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index,
                                  columns=rename_cols)
    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if n_components is two, add a PC3 of zeros
    # this is referenced as an issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in DEICODE -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res
def collapse_biom(table: biom.Table, mapping: dict, divide=False, field=None):
    """Collapse a BIOM table in many-to-many mode.

    Parameters
    ----------
    table : biom.Table
        Table to collapse.
    mapping : dict of list of str
        Source-to-target(s) mapping.
    divide : bool, optional
        Whether to divide per-target counts by the number of targets per
        source.
    field : int, optional
        Index of field to be collapsed in a stratified table.

    Returns
    -------
    biom.Table
        Collapsed BIOM table.

    Raises
    ------
    ValueError
        Field index is not present in a feature ID.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    # generate metadata
    metadata = {}
    for id_ in table.ids('observation'):
        feature = id_
        if field:
            fields = feature.split('|')
            try:
                feature = fields[field]
            except IndexError:
                raise ValueError(
                    f'Feature "{feature}" has fewer than {field + 1} fields.')
        if feature not in mapping:
            continue
        targets = []
        for target in mapping[feature]:
            if field:
                fields[field] = target
                target = '|'.join(fields)
            targets.append(target)
        metadata[id_] = dict(part=targets)

    # filter table features
    table = table.filter(lambda data, id_, md: id_ in metadata,
                         axis='observation', inplace=False)

    # stop if no feature left
    if table.is_empty():
        return table

    # add mapping to table metadata
    table.add_metadata(metadata, axis='observation')

    # determine collapsing method
    kwargs = dict(norm=False, one_to_many=True, axis='observation',
                  one_to_many_mode=('divide' if divide else 'add'))

    # collapse table in many-to-many mode
    table = table.collapse(lambda _, md: zip(md['part'], md['part']),
                           **kwargs)

    # round to integers
    if divide:
        round_biom(table)

    # clean up
    table.del_metadata(keys=['Path'])
    return table
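# Example usage (a sketch; IDs and the mapping are invented): collapse the
# second "|"-delimited field of stratified feature IDs with the collapse_biom()
# variant defined above, leaving the first field untouched.
import numpy as np
import biom

strat = biom.Table(np.array([[3, 0], [1, 2]]),
                   observation_ids=['taxA|K00001', 'taxB|K00002'],
                   sample_ids=['S1', 'S2'])
mapping = {'K00001': ['pathway1'], 'K00002': ['pathway1']}
collapsed = collapse_biom(strat, mapping, field=1)
# yields features 'taxA|pathway1' and 'taxB|pathway1'
print(collapsed.ids(axis='observation'))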