def subsample(table: biom.Table, subsampling_depth: int,
              axis: str) -> biom.Table:
    if axis == 'feature':
        # we are transposing the table due to biocore/biom-format#759
        table = table.transpose()

    if len(table.ids()) < subsampling_depth:
        raise ValueError('The subsampling depth exceeds the number of '
                         'elements on the desired axis. The maximum depth '
                         'is: %d.' % len(table.ids()))

    # the axis is always 'sample' due to the above transpose
    table = table.subsample(subsampling_depth, axis='sample', by_id=True)

    # the inverted axis is always observation due to the above transpose
    invaxis = 'observation'
    table.filter(lambda v, i, m: v.sum() > 0, axis=invaxis)

    if axis == 'feature':
        # reverse the transpose necessary due to biocore/biom-format#759
        table = table.transpose()

    if table.is_empty():
        raise ValueError('The subsampled table contains no samples or features'
                         ' (samples/features that sum to zero after filtering'
                         ' are automatically removed). It may be a good idea'
                         ' to double check that your table is valid/nonempty.')

    return table
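
A minimal usage sketch, assuming the subsample() function above is in scope:

import numpy as np
import biom

data = np.array([[0, 1, 3],
                 [1, 1, 2],
                 [4, 0, 0],
                 [0, 2, 1]])
tbl = biom.Table(data, ['F1', 'F2', 'F3', 'F4'], ['S1', 'S2', 'S3'])

# by_id subsampling keeps 2 randomly chosen feature IDs; samples that become
# empty afterwards are dropped by the filtering step inside subsample()
smaller = subsample(tbl, subsampling_depth=2, axis='feature')
print(smaller.shape)  # (2, <=3)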
def filter_features(table: biom.Table,
                    tree: NewickFormat) -> (biom.Table, biom.Table):

    # load the insertion tree
    tree = skbio.TreeNode.read(str(tree))
    # collect all tips, i.e. inserted fragments + reference taxa names
    fragments_tree = {
        str(tip.name)
        for tip in tree.tips()
        if tip.name is not None}

    # collect all fragments/features from table
    fragments_table = set(map(str, table.ids(axis='observation')))

    if len(fragments_table & fragments_tree) <= 0:
        raise ValueError(('Not a single fragment of your table is part of your'
                          ' tree. The resulting table would be empty.'))

    tbl_positive = table.filter(fragments_table & fragments_tree,
                                axis='observation', inplace=False)
    tbl_negative = table.filter(fragments_table - fragments_tree,
                                axis='observation', inplace=False)

    # print some information for quality control,
    # which the user can request via --verbose
    results = pd.DataFrame(
        data={'kept_reads': tbl_positive.sum(axis='sample'),
              'removed_reads': tbl_negative.sum(axis='sample')},
        index=tbl_positive.ids())
    results['removed_ratio'] = results['removed_reads'] / \
        (results['kept_reads'] + results['removed_reads'])

    return (tbl_positive, tbl_negative)
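
A minimal usage sketch, assuming filter_features() above (with its pandas/skbio
imports) is in scope. Because the function only calls str(tree), a plain path to
a newick file stands in for the QIIME 2 NewickFormat here:

import numpy as np
import biom

with open('/tmp/insertion_tree.nwk', 'w') as fh:
    fh.write('((F1:0.1,F2:0.2):0.1,REF1:0.3);')

tbl = biom.Table(np.array([[5, 0], [2, 1], [0, 7]]),
                 ['F1', 'F2', 'F3'], ['S1', 'S2'])
kept, removed = filter_features(tbl, '/tmp/insertion_tree.nwk')
print(kept.ids(axis='observation'))     # F1 and F2 are tips of the tree
print(removed.ids(axis='observation'))  # F3 is not in the tree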
Example #3
def filter_table(table: biom.Table, tree: skbio.TreeNode) -> biom.Table:
    """ Filter table to remove feature ids that are not tip ids in tree
    """
    tip_ids = set([t.name for t in tree.tips()])
    feature_ids = set(table.ids(axis='observation'))
    # ids_to_keep can only include ids that are in table
    ids_to_keep = tip_ids & feature_ids
    table.filter(ids_to_keep, axis='observation', inplace=True)
    return table
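
A minimal usage sketch, assuming the filter_table() function above is in scope.
Note that the table is filtered in place and also returned:

import numpy as np
import biom
import skbio
from io import StringIO

tree = skbio.TreeNode.read(StringIO('(F1:0.1,F2:0.2);'))
tbl = biom.Table(np.array([[1, 0], [2, 2], [3, 1]]),
                 ['F1', 'F2', 'F3'], ['S1', 'S2'])
print(filter_table(tbl, tree).ids(axis='observation'))  # keeps F1 and F2 only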
Example #5
def filter_features_conditionally(table: biom.Table,
                                  abundance: float,
                                  prevalence: float,
                                  ) -> biom.Table:
    """
    Filter features jointly by relative abundance and prevalence.
    """
    num_observations, num_samples = table.shape
    prevalence = prevalence * num_samples

    # Calculates the filtering parameters on the original table
    def _filter_f(values, id_, metadata):
        return (values >= abundance).sum() >= prevalence

    # Normalize the table to get the prevalence.
    # Copy is because biom really wants to normalize the original table. By
    # copying and not using inplace, the original table is preserved.
    # Redundant, but better safe than sorry.
    table_norm = table.copy().norm(axis='sample', inplace=False)
    table_norm.filter(_filter_f, axis='observation', inplace=True)
    filter_ids = table_norm.ids(axis='observation')

    new_table = table.filter(filter_ids, axis='observation', inplace=False)

    return new_table
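
A minimal usage sketch, assuming filter_features_conditionally() above is in
scope. Here features must reach 5% relative abundance in every sample
(prevalence=1.0) to be kept:

import numpy as np
import biom

tbl = biom.Table(np.array([[90, 90],
                           [9, 0],
                           [1, 10]]),
                 ['F1', 'F2', 'F3'], ['S1', 'S2'])
filtered = filter_features_conditionally(tbl, abundance=0.05, prevalence=1.0)
print(filtered.ids(axis='observation'))  # only F1 passes in both samples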
Example #6
def rpca(
        table: biom.Table,
        rank: int = 3,
        min_sample_count: int = 500,
        min_feature_count: int = 10,
        iterations: int = 5
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """ Runs RPCA with an rclr preprocessing step"""

    # filter sample to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T.drop_duplicates()
    table = table.T[table.sum() > min_feature_count].T

    # rclr preprocessing and OptSpace (RPCA)
    opt = OptSpace(rank=rank, iteration=iterations).fit(rclr().fit_transform(
        table.copy()))
    rename_cols = {i - 1: 'PC' + str(i) for i in range(1, rank + 1)}

    # Feature Loadings
    feature_loading = pd.DataFrame(opt.feature_weights, index=table.columns)
    feature_loading = feature_loading.rename(columns=rename_cols)
    feature_loading.sort_values('PC1', inplace=True, ascending=True)

    # Sample Loadings
    sample_loading = pd.DataFrame(opt.sample_weights, index=table.index)
    sample_loading = sample_loading.rename(columns=rename_cols)

    # % var explained
    proportion_explained = pd.Series(opt.explained_variance_ratio,
                                     index=list(rename_cols.values()))
    # eigenvalues
    eigvals = pd.Series(opt.eigenvalues, index=list(rename_cols.values()))

    # if the rank is two add PC3 of zeros
    if rank == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res
Example #7
def plot(output_dir,
         table: biom.Table,
         metadata: q2.Metadata,
         case_where: str,
         control_where: str,
         feature_tree: skbio.TreeNode = None):

    if feature_tree is not None:
        with open('/tmp/tree.nwk', 'w') as fh:
            feature_tree.write(fh)

    copy_tree(os.path.join(PLOT, 'assets', 'dist'), output_dir)
    data_dir = os.path.join(output_dir, 'data')
    os.mkdir(data_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))

    if feature_tree is not None:
        feature_tree = shear_no_prune(feature_tree, features)
    else:
        feature_tree = TreeNode()

    tree_data = tree_to_array(feature_tree)
    idx, = np.where(np.asarray(tree_data['children']) == 0)
    tree_data['lookup'] = dict(zip(map(str, idx), range(len(idx))))

    tip_order = np.asarray(tree_data['names'])[idx]
    table = table.sort_order(tip_order, axis='observation')
    table = table.sort_order(case_samples + control_samples, axis='sample')

    with open(os.path.join(data_dir, 'packed_table.jsonp'), 'w') as fh:
        fh.write('LOAD_PACKED_TABLE(')
        fh.write(json.dumps(table_to_b64pa(table)))
        fh.write(');')

    with open(os.path.join(data_dir, 'tree.jsonp'), 'w') as fh:
        fh.write('LOAD_TREE(')
        fh.write(json.dumps(tree_data))
        fh.write(');')
Example #8
def pad_features_in_test_data(train_table: biom.Table,
                              test_table: biom.Table) -> biom.Table:
    '''
    Align the features of the test table to those of the train table by
    zero-padding features that exist only in the train table into the test
    table.

    Parameters
    ----------
    train_table: biom.Table
        A biom table with the train data
    test_table: biom.Table
        A biom table with the test data

    Returns
    -------
    new_test_biom: biom.Table
        A biom table with the test data updated to contain the identical set
        of features as the train table.
    '''

    train_feature_ids = train_table.ids(axis='observation')
    test_feature_ids = test_table.ids(axis='observation')

    # biom tables are observations (features) x samples, so shape[1] is the
    # number of samples
    n_samples = test_table.shape[1]
    sample_ids = test_table.ids(axis='sample')
    train_uniq_f = list(set(train_feature_ids) - set(test_feature_ids))
    shared_f = set(train_feature_ids).intersection(set(test_feature_ids))
    # create a zero matrix for all features that exist only in the train table
    padding_table = biom.Table(np.zeros((len(train_uniq_f), n_samples)),
                               train_uniq_f, sample_ids)
    # drop features from the test table that are absent from the train table
    test_table.filter(shared_f, axis='observation')

    n_filtered_features = test_table.shape[0]
    if n_filtered_features == 0:
        raise ValueError('No feature overlap between train and test table! '
                         'Check the feature-format consistency between tables!')
    # merge the two tables
    new_test_table = test_table.merge(padding_table)

    return new_test_table
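
A minimal usage sketch, assuming pad_features_in_test_data() above is in scope:

import numpy as np
import biom

train = biom.Table(np.array([[3, 0], [1, 2], [5, 5]]),
                   ['F1', 'F2', 'F3'], ['A1', 'A2'])
test = biom.Table(np.array([[4, 4], [0, 9]]),
                  ['F2', 'F4'], ['B1', 'B2'])
padded = pad_features_in_test_data(train, test)
# F4 (test-only) is dropped, F1 and F3 (train-only) are added as all-zero rows
print(sorted(padded.ids(axis='observation')))  # ['F1', 'F2', 'F3']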
Example #9
File: biom.py  Project: mortonjt/woltka
def collapse_biom(table: biom.Table, mapping: dict, normalize=False):
    """Collapse a BIOM table in many-to-many mode.

    Parameters
    ----------
    table : biom.Table
        Table to collapse.
    mapping : dict of list of str
        Source-to-target(s) mapping.
    normalize : bool, optional
        Whether to normalize per-target counts by the number of targets
        per source.

    Returns
    -------
    biom.Table
        Collapsed BIOM table.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    # filter table features
    table = table.filter(lambda data, id_, md: id_ in mapping,
                         axis='observation',
                         inplace=False)

    # stop if no feature left
    if table.is_empty():
        return table

    # add mapping to table metadata
    table.add_metadata({k: dict(part=v)
                        for k, v in mapping.items()},
                       axis='observation')

    # determine collapsing method
    kwargs = dict(norm=False,
                  one_to_many=True,
                  axis='observation',
                  one_to_many_mode=('divide' if normalize else 'add'))

    # collapse table in many-to-many mode
    table = table.collapse(lambda id_, md: zip(md['part'], md['part']),
                           **kwargs)

    # round to integers
    if normalize:
        round_biom(table)

    # clean up
    table.del_metadata(keys=['Path'])
    return table
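
A minimal usage sketch, assuming collapse_biom() above (and its round_biom
helper, only needed when normalize=True) is in scope:

import numpy as np
import biom

tbl = biom.Table(np.array([[4, 2], [6, 0], [3, 3]]),
                 ['G1', 'G2', 'G3'], ['S1', 'S2'])
# many-to-many mapping: G2 maps to two targets, G3 is unmapped and dropped
mapping = {'G1': ['PathwayA'], 'G2': ['PathwayA', 'PathwayB']}
out = collapse_biom(tbl, mapping)
print(sorted(out.ids(axis='observation')))  # ['PathwayA', 'PathwayB']
print(out.sum(axis='sample'))  # G2's counts contribute to both targets ('add' mode)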
Example #10
def generate_per_sample_biom(biom_file, limit):
    """Generate per-sample BIOM files

    Parameters
    ----------
    biom_file : str
        A filepath to a BIOM table
    limit : int or None
        Limit the number of tables to load

    Returns
    -------
    str
        The sample ID
    str
        The table in BIOM Format v1.0
    str
        The table in the classic OTU table format
    """
    table = load_table(biom_file)
    obs_ids = table.ids(axis='observation')
    obs_md = table.metadata(axis='observation')

    if limit is None:
        limit = np.inf

    count = 0
    for v, sample, _ in table.iter():
        if count >= limit:
            break

        single_sample = Table(v[:, np.newaxis], obs_ids, [sample], obs_md)
        single_sample.filter(lambda v_, i, md: v_ > 0, axis='observation')
        biomv1 = single_sample.to_json('AG')
        biomtxt = single_sample.to_tsv(
            header_key='taxonomy',
            header_value='taxonomy',
            metadata_formatter=lambda x: '; '.join(x))
        yield (sample, biomv1, biomtxt)
        count += 1
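
A minimal usage sketch, assuming generate_per_sample_biom() above is in scope
and 'merged.biom' is a hypothetical BIOM file on disk:

for sample_id, biom_json, classic_txt in generate_per_sample_biom('merged.biom',
                                                                   limit=5):
    with open('%s.biom' % sample_id, 'w') as fh:
        fh.write(biom_json)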
Example #11
def match_table(tree: TreeNode, feature_table: biom.Table) -> biom.Table:
    '''
    Filters the feature table to retain the features present in the tree.

    Parameters
    ----------
    tree : TreeNode
        skbio TreeNode object representing tree of relatedness
        between molecules
    feature_table : biom.Table
        feature table with features as observations (rows) and samples as
        columns

    Raises
    ------
    ValueError
        If ``feature_table`` has no features
        If ``tree`` tips are not a subset of feature names in ``feature_table``
        If ``filtered_feature_table`` is empty

    Returns
    -------
    biom.Table
        filtered feature table that contains only the features present in
        the tree
    '''
    if feature_table.shape[0] == 0:
        raise ValueError("There are no features in the feature table!")
    allfeatrs = set(feature_table.ids(axis='observation'))
    tip_names = {node.name for node in tree.tips()}
    if not tip_names.issubset(allfeatrs):
        extra_tips = tip_names - tip_names.intersection(allfeatrs)
        warnings.warn(
            UserWarning('The following tips were not '
                        'found in the feature table:\n' +
                        ', '.join([str(i) for i in extra_tips])))
    common_features = list(allfeatrs.intersection(tip_names))
    filtered_feature_table = feature_table.filter(common_features,
                                                  axis='observation',
                                                  inplace=False)
    return filtered_feature_table
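
A minimal usage sketch, assuming match_table() above is in scope. Tips missing
from the table only trigger a warning:

import numpy as np
import biom
from io import StringIO
from skbio import TreeNode

tree = TreeNode.read(StringIO('((F1:0.1,F2:0.2):0.1,F9:0.3);'))
tbl = biom.Table(np.array([[1, 2], [3, 0], [0, 5]]),
                 ['F1', 'F2', 'F3'], ['S1', 'S2'])
print(match_table(tree, tbl).ids(axis='observation'))  # keeps F1 and F2; F9 warns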
Example #12
def cluster_features_closed_reference(
        sequences: DNAFASTAFormat,
        table: biom.Table,
        reference_sequences: DNAFASTAFormat,
        perc_identity: float,
        strand: str = 'plus',
        threads: int = 1) -> (biom.Table, DNAFASTAFormat, DNAFASTAFormat):

    table_ids = set(table.ids(axis='observation'))
    sequence_ids = {
        e.metadata['id']
        for e in skbio.io.read(
            str(sequences), constructor=skbio.DNA, format='fasta')
    }
    _error_on_nonoverlapping_ids(table_ids, sequence_ids)
    matched_seqs, unmatched_seqs = DNAFASTAFormat(), DNAFASTAFormat()

    with tempfile.NamedTemporaryFile() as fasta_with_sizes, \
            tempfile.NamedTemporaryFile() as out_uc, \
            tempfile.NamedTemporaryFile() as tmp_unmatched_seqs:
        _fasta_with_sizes(str(sequences), fasta_with_sizes.name, table)
        cmd = [
            'vsearch',
            '--usearch_global',
            fasta_with_sizes.name,
            '--id',
            str(perc_identity),
            '--db',
            str(reference_sequences),
            '--uc',
            out_uc.name,
            '--strand',
            str(strand),
            '--qmask',
            'none',  # ensures no lowercase DNA chars
            '--notmatched',
            tmp_unmatched_seqs.name,
            '--threads',
            str(threads)
        ]
        run_command(cmd)
        out_uc.seek(0)

        # It is possible for there to be no unmatched sequences; if that
        # is the case, skip the following clean-up.
        if os.path.getsize(tmp_unmatched_seqs.name) > 0:
            # We don't really need to sort the unmatched sequences, this
            # is just to let us use --xsize, which strips the counts from
            # the Feature ID. It would be more ideal if --usearch_global,
            # above, let us pass in --xsize, but unfortunately it isn't
            # supported.
            cmd = [
                'vsearch', '--sortbysize', tmp_unmatched_seqs.name, '--xsize',
                '--output',
                str(unmatched_seqs)
            ]
            run_command(cmd)

        try:
            conn = _uc_to_sqlite(out_uc)
            collapse_f = _collapse_f_from_sqlite(conn)
            _fasta_from_sqlite(conn, str(sequences), str(matched_seqs))
        except ValueError:
            raise VSearchError('No matches were identified to '
                               'reference_sequences. This can happen if '
                               'sequences are not homologous to '
                               'reference_sequences, or if sequences are '
                               'not in the same orientation as reference_'
                               'sequences (i.e., if sequences are reverse '
                               'complemented with respect to reference '
                               'sequences). Sequence orientation can be '
                               'adjusted with the strand parameter.')

        unmatched_ids = [
            e.metadata['id'] for e in skbio.io.read(open(str(unmatched_seqs)),
                                                    constructor=skbio.DNA,
                                                    format='fasta')
        ]
    table.filter(ids_to_keep=unmatched_ids,
                 invert=True,
                 axis='observation',
                 inplace=True)
    table = table.collapse(collapse_f,
                           norm=False,
                           min_group_size=1,
                           axis='observation',
                           include_collapsed_metadata=False)

    return table, matched_seqs, unmatched_seqs
def percentile_normalize(table: biom.Table,
                         metadata: qiime2.MetadataColumn,
                         batch: qiime2.MetadataColumn = None,
                         n_control_thresh: int = 10,
                         otu_thresh: float = 0.3) -> biom.Table:
    """
    Converts an input table with cases and controls into percentiles
    of control samples.

    Parameters
    ----------
    table : biom.Table
        Feature table with relative abundances. Samples are in columns,
        features (i.e. OTUs) are in rows.
    metadata : qiime2.CategoricalMetadataColumn
        metadata column with samples labeled as "case" or "control".
        All samples with either label are returned, normalized to the
        equivalent percentile in "control" samples.
    batch : qiime2.CategoricalMetadataColumn
        metadata column with the different batches labeled. Percentile
        normalization will be performed within each batch, and the output
        tables will be concatenated together. You can use this to normalize
        multiple studies at once by first merging the original feature table,
        adding a study ID column in the merged metadata, and then calling
        percentile normalization with this option.
    n_control_thresh : int [default=10]
        Minimum number of controls accepted to perform percentile
        normalization. Because the transformation converts abundances
        in controls to a uniform distribution, we *highly* discourage
        performing percentile normalization on datasets with fewer than
        30 controls, and certainly not fewer than 10 (the default value).
        If you have fewer controls than `n_control_thresh`, the
        normalization will return an error.
    otu_thresh : float [default=0.3]
        The OTU filtering threshold: OTUs must be present in at least
        otu_thresh fraction of cases OR controls, otherwise it gets thrown
        out and not percentile normalized. This method does not perform
        well with very sparse OTUs, so we do not recommend lowering
        this threshold below 0.3. otu_thresh should be in [0, 1].

    Returns
    -------
    norm_biom : biom.Table
        A biom table with the normalized data, only including the samples
        that were labeled as either "case" or "control", and the OTUs
        which passed the otu_thresh threshold.
    """
    # Filter metadata to only include IDs present in the table.
    # Also ensures every sample ID in the table is present in the metadata.
    metadata = metadata.filter_ids(table.ids(axis='sample'))
    metadata = metadata.drop_missing_values()

    # filter the table to exclude samples that were dropped from
    # the metadata due to missing values
    table = table.filter(metadata.ids)

    metadata = metadata.to_series()

    ## Convert biom Table into dense pandas dataframe
    # Transpose so samples are in rows and OTUs/features in columns
    df = table.to_dataframe().to_dense().T

    # Set up a list of metadata series, one per batch
    batches_to_norm = []
    if batch is not None:
        batch = batch.filter_ids(table.ids(axis='sample'))
        batch = batch.drop_missing_values()
        batch = batch.to_series()
        for g, one_batch in batch.groupby(batch):
            batches_to_norm.append(metadata.loc[one_batch.index])
    else:
        batches_to_norm.append(metadata)

    norm_dfs = []
    for meta in batches_to_norm:
        # Get case and control samples from metadata
        control_samples = meta[meta == "control"].index.tolist()
        case_samples = meta[meta == "case"].index.tolist()

        # Check that there are cases and controls
        if len(control_samples) == 0:
            if len(case_samples) == 0:
                # Both cases and controls are zero
                raise ValueError(
                    'There are no case or control samples in your data. Check the metadata column for "case" and "control" labels.'
                )
            # Just controls as zero
            raise ValueError(
                'There are no control samples in your data. Check the metadata column for "control" labels.'
            )
        # Just cases are zero
        elif len(case_samples) == 0:
            raise ValueError(
                'There are no case samples in your data. Check the metadata column for "case" labels.'
            )

        # Make sure there are enough controls to perform normalization
        if len(control_samples) < n_control_thresh:
            if batch is not None:
                batch_err = (' in batch ' +
                             str(batch.loc[meta.index].unique()[0]) + '')
            else:
                batch_err = ''
            raise ValueError(
                "There aren't enough controls in your data. " + batch_err +
                "(n_control_thresh = {})".format(n_control_thresh))

        # Filter OTUs, replace zeros with random value, and
        # percentile normalize
        norm_df = _percentile_normalize_one_df(df, control_samples,
                                               case_samples, otu_thresh)
        norm_dfs.append(norm_df)

    # Merge all normalized data
    # Keep all samples and all OTUs - OTUs not present in one batch will be NaNs
    norm_df = pd.concat(norm_dfs, axis=1)

    # Put this dataframe into biom format
    norm_biom = biom.Table(data=norm_df.values,
                           observation_ids=norm_df.index,
                           sample_ids=norm_df.columns)

    return norm_biom
Example #14
def qarcoal(
    table: biom.Table,
    taxonomy: pd.DataFrame,
    num_string: str,
    denom_string: str,
    samples_to_use: Metadata = None,
    allow_shared_features: bool = False,
) -> pd.DataFrame:
    """Calculate sample-wise log-ratios of features based on taxonomy.

    Parameters:
    -----------
        table: biom file with which to calculate log ratios
        taxonomy: pd.DataFrame with taxonomy information (should have Taxon
            column in which features will be searched)
        num_string: numerator string to search for in taxonomy
        denom_string: denominator string to search for in taxonomy
        samples_to_use: Q2 Metadata file with samples to use.
            If provided, feature table will be filtered to only consider
            samples present in this file. (optional)
        allow_shared_features: bool denoting handling of shared features
            between numerator and denominator. If False, an error is raised
            if features are shared between numerator and denominator. If True,
            will allow shared features without throwing an error.
    Returns:
    --------
        comparison_df: pd DataFrame in the form:

            Sample-ID    Num_Sum    Denom_Sum   log_ratio
                   S1          7           15   -0.762140
    """

    # biom table is features x samples
    if samples_to_use is not None:
        filt_samples = set(samples_to_use.to_dataframe().index)
        feat_table = table.filter(filt_samples, axis="sample", inplace=False)
        feat_table = feat_table.to_dataframe()
    else:
        feat_table = table.to_dataframe()

    # raise error if there are any negative counts in the feature table
    if feat_table.lt(0).any().any():
        raise ValueError("Feature table has negative counts!")

    tax_num_df, tax_denom_df = filter_and_join_taxonomy(
        feat_table,
        taxonomy,
        num_string,
        denom_string,
    )

    # if shared features are disallowed, check to make sure they don't occur
    # if allowed, can skip this step at user's risk
    if not allow_shared_features:
        shared_features = set(tax_num_df.index) & set(tax_denom_df.index)
        if shared_features:
            raise ValueError("Shared features between num and denom!")

    tax_num_sample_sum = tax_num_df.sum(axis=0)
    tax_denom_sample_sum = tax_denom_df.sum(axis=0)

    comparison_df = pd.DataFrame.from_records(
        [tax_num_sample_sum, tax_denom_sample_sum],
        index=["Num_Sum", "Denom_Sum"],
    ).T
    comparison_df["log_ratio"] = comparison_df.apply(
        lambda x: np.log(x.Num_Sum / x.Denom_Sum), axis=1)
    comparison_df.index.name = "Sample-ID"

    return comparison_df
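
A hypothetical usage sketch, assuming qarcoal() above and its
filter_and_join_taxonomy helper are in scope, and that the taxonomy DataFrame
is indexed by feature ID with a Taxon column:

import numpy as np
import pandas as pd
import biom

tbl = biom.Table(np.array([[10, 20], [5, 5], [1, 4]]),
                 ['F1', 'F2', 'F3'], ['S1', 'S2'])
tax = pd.DataFrame({'Taxon': ['d__Bacteria; p__Firmicutes',
                              'd__Bacteria; p__Firmicutes',
                              'd__Bacteria; p__Bacteroidetes']},
                   index=['F1', 'F2', 'F3'])
# per-sample log-ratio of Firmicutes counts over Bacteroidetes counts
print(qarcoal(tbl, tax, num_string='Firmicutes', denom_string='Bacteroidetes'))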
Example #15
def simple_plot(output_dir,
                table: biom.Table,
                feature_tree: skbio.TreeNode,
                metadata: q2.Metadata,
                case_where: str,
                control_where: str,
                n_transects: int = 10,
                stratify_by: str = None,
                mode: str = 'max'):
    print("Data extracted")
    layer_dir = os.path.join(output_dir, 'layers')
    rank_dir = os.path.join(output_dir, 'ranks')
    os.mkdir(layer_dir)
    os.mkdir(rank_dir)

    metadata = metadata.filter_ids(table.ids(axis='sample'))
    case_samples = sorted(list(metadata.get_ids(case_where)))
    control_samples = sorted(list(metadata.get_ids(control_where)))
    get_pairs = comparisons(metadata, control_samples, case_samples,
                            stratify_by)

    table.filter(case_samples + control_samples)
    table.remove_empty('observation')
    features = list(table.ids(axis='observation'))
    feature_tree = shear_no_prune(feature_tree, features)
    print("Extraneous features removed")

    for n in feature_tree.traverse():
        if not n.length:
            n.length = 0
    tree = tree_to_array(feature_tree, mode)
    print("Tree index created")

    possible_transects = len(np.unique(np.asarray(tree['distances'])))
    tree_length = tree['distances'][0]  # root of tree
    if n_transects > possible_transects:
        n_transects = possible_transects
        print("Only %d transects exist, using that instead" % n_transects)

    transects = list(np.linspace(0, tree_length, num=n_transects))
    print("Will transect at: %s" % ", ".join(map(str, transects)))

    figure_gen = prepare_plot(tree_length)
    figure_gen.send(None)  # initialize co-routine
    colors = []

    points, _ = pairwise_components(table, get_pairs())
    color_fig, highlight_fig, color = figure_gen.send((points, None))

    color_fig.savefig(os.path.join(layer_dir, 'original.png'),
                      transparent=True)
    plt.close(color_fig)
    highlight_fig.savefig(os.path.join(layer_dir, 'original.h.png'),
                          transparent=True)
    plt.close(highlight_fig)
    colors.append(color)

    rank_files = []
    collapsed_groups = pd.DataFrame()
    for distance in transects:
        collapsed_table, collapsed_counts, groups = group_by_transect(
            table, tree, distance)
        collapsed_groups[groups.name] = groups
        print("Table collapsed at transect %s" % distance)

        points, ranks = pairwise_components(collapsed_table, get_pairs())

        filename = write_ranks(rank_dir, collapsed_counts, ranks, distance)
        rank_files.append(filename)

        color_fig, highlight_fig, color = figure_gen.send((points, distance))
        colors.append(color)

        color_fig.savefig(os.path.join(layer_dir, 'T_%s.png' % distance),
                          transparent=True)
        plt.close(color_fig)
        highlight_fig.savefig(os.path.join(layer_dir, 'T_%s.h.png' % distance),
                              transparent=True)
        plt.close(highlight_fig)

    print("Finalizing visualization")
    figure = figure_gen.send((None, None))
    figure.savefig(os.path.join(layer_dir, 'trajectory.png'), transparent=True)
    plt.close(figure)

    background = next(figure_gen)
    background.savefig(os.path.join(layer_dir, 'bg.png'), transparent=True)
    plt.close(background)

    with open(os.path.join(output_dir, 'collapsed_groups.tsv'), 'w') as fh:
        collapsed_groups.to_csv(fh, sep='\t')

    with open(os.path.join(output_dir, 'index.html'), 'w') as fh:
        template = Environment(loader=BaseLoader).from_string(TEMPLATE)
        fh.write(
            template.render({
                'legend':
                list(
                    zip(['original'] + ['T_%s' % d
                                        for d in transects] + ['trajectory'],
                        list(map(to_hex, colors)) + ['red'])),
                'filenames':
                rank_files
            }))
Example #16
def simulate_samples(taxonomy_samples, fold, taxon_defaults, ref_taxa,
                     ref_seqs):
    with open(join(fold, 'sample_test.json')) as fp:
        test_samples = json.load(fp)
    test_samples = extract_sample(test_samples, taxonomy_samples)
    ref_taxa, _ = load_references(ref_taxa, ref_seqs)

    with open(join(fold, 'seq_test.json')) as fp:
        test_seqs = json.load(fp)
    test_taxa = {ref_taxa[sid] for sid in test_seqs}

    hits = [0]
    direct_remaps = [0]
    indirect_remaps = [0]

    def collapse(taxon, _):
        if taxon in test_taxa:
            hits[0] += 1
            return taxon
        if taxon_defaults[taxon][0] in test_taxa:
            direct_remaps[0] += 1
            return taxon_defaults[taxon][0]
        for try_taxon in taxon_defaults[taxon][1:]:
            if try_taxon in test_taxa:
                indirect_remaps[0] += 1
                return try_taxon

    test_samples = test_samples.collapse(collapse,
                                         norm=False,
                                         axis='observation')
    logging.info('Test taxon remaps')
    logging.info(str(hits[0]) + ' hits')
    logging.info(str(direct_remaps[0]) + ' direct remaps')
    logging.info(str(indirect_remaps[0]) + ' indirect remaps')

    samples = []
    obs_ids = []
    expected = []
    taxa_ref = defaultdict(list)
    for sid, taxon in ref_taxa.items():
        if sid in test_seqs:
            taxa_ref[taxon].append(sid)
    for abundances, taxon, _ in test_samples.iter(axis='observation'):
        taxa = taxa_ref[taxon]
        n_taxa = len(taxa)
        obs_ids.extend(taxa)
        expected.extend(ref_taxa[sid] for sid in taxa)
        taxa_samples = numpy.vstack([abundances // n_taxa] * n_taxa)
        # magic
        taxa = cycle(range(n_taxa))
        for i, r in enumerate(abundances % n_taxa):
            for t, _ in zip(taxa, range(int(r))):
                taxa_samples[t, i] += 1
        assert (taxa_samples.sum(axis=0) == abundances).all()
        samples.append(taxa_samples)
    test_samples = Table(numpy.vstack(samples), obs_ids, test_samples.ids())
    test_samples.filter(lambda v, _, __: v.sum() > 1e-9,
                        axis='observation',
                        inplace=True)

    return (test_samples, dict(zip(obs_ids, expected)))
Example #17
def trim_dada2_posthoc(
        table: biom.Table,
        representative_sequences: pd.Series,
        trim_length: int = 0,
        hashed_feature_ids: bool = True) -> (biom.Table, pd.Series):
    """
    Trims ASVs generated by DADA2 to a standard length

    Parameters
    ----------
    table : biom.Table
        The feature table
    representative_sequences: DNAFASTAFormat
        The sequences which correspond to the ASV table
    trim_length : int
        The length to trim the ASVs. If the length is 0, the minimum sequence
        length will be used.
    hashed_feature_ids : bool
        Whether feature and sequence IDs should be hashed.
    """

    # Trims the sequences
    seq_length = representative_sequences.apply(lambda x: len(x))

    if trim_length == 0:
        trim_length = seq_length.min()

    if (seq_length < trim_length).any():
        warnings.warn(
            "There are ASVs shorter than the trim length. "
            "These sequences will be discarded.", UserWarning)
    rep_seqs = representative_sequences.astype(str)
    rep_seqs = rep_seqs.loc[seq_length >= trim_length].copy()
    rep_seqs = pd.DataFrame(data=[rep_seqs.apply(lambda x: x[:trim_length])],
                            index=['sequence']).T

    # Collapses the table based on the trimmed sequences
    table.filter(lambda v, id_, md: id_ in rep_seqs.index,
                 axis='observation',
                 inplace=True)
    table.add_metadata(
        rep_seqs.loc[table.ids(axis='observation')].to_dict(orient='index'),
        axis='observation')

    table2 = table.collapse(lambda id_, md: md['sequence'],
                            norm=False,
                            axis='observation')

    seqs2 = rep_seqs.drop_duplicates()['sequence'].copy()

    if hashed_feature_ids:
        table2.update_ids(
            {seq_: _hash_seq(seq_)
             for seq_ in table2.ids(axis='observation')},
            axis='observation',
            inplace=True)
        seqs2.rename({id_: _hash_seq(seq_)
                      for id_, seq_ in seqs2.items()},
                     inplace=True)
    else:
        seqs2.rename({id_: seq_ for id_, seq_ in seqs2.items()}, inplace=True)

    return table2, seqs2
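
A minimal usage sketch, assuming trim_dada2_posthoc() above is in scope
(hashed_feature_ids=False avoids the _hash_seq helper):

import numpy as np
import pandas as pd
import biom

tbl = biom.Table(np.array([[3, 1], [2, 2], [0, 4]]),
                 ['asv1', 'asv2', 'asv3'], ['S1', 'S2'])
seqs = pd.Series({'asv1': 'ACGTACGT', 'asv2': 'ACGTACGA', 'asv3': 'ACGTA'})
trimmed_table, trimmed_seqs = trim_dada2_posthoc(tbl, seqs, trim_length=5,
                                                 hashed_feature_ids=False)
# all three ASVs share the same first 5 bases, so they collapse to one feature
print(trimmed_table.ids(axis='observation'))  # ['ACGTA']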
Example #18
def maturity_index(output_dir: str,
                   table: biom.Table,
                   metadata: qiime2.Metadata,
                   column: str,
                   group_by: str,
                   control: str,
                   estimator: str = defaults['estimator_r'],
                   n_estimators: int = defaults['n_estimators'],
                   test_size: float = defaults['test_size'],
                   step: float = defaults['step'],
                   cv: int = defaults['cv'],
                   random_state: int = None,
                   n_jobs: int = defaults['n_jobs'],
                   parameter_tuning: bool = True,
                   optimize_feature_selection: bool = True,
                   stratify: str = False,
                   maz_stats: bool = True,
                   missing_samples: str = defaults['missing_samples']) -> None:

    # select estimator
    param_dist, estimator = _select_estimator(estimator, n_jobs, n_estimators)
    estimator = Pipeline([('dv', DictVectorizer()), ('est', estimator)])
    param_dist = _map_params_to_pipeline(param_dist)

    # split input data into control and treatment groups
    table, metadata = _load_data(table,
                                 metadata,
                                 missing_samples=missing_samples,
                                 extract=False)
    fancy_index = metadata[group_by] == control
    md_control = metadata[fancy_index]
    table_control = table.filter(md_control.index, inplace=False)

    # train model on control data
    estimator, cm, accuracy, importances = split_optimize_classify(
        table_control,
        md_control,
        column,
        estimator,
        output_dir,
        random_state=random_state,
        n_jobs=n_jobs,
        test_size=test_size,
        step=step,
        cv=cv,
        parameter_tuning=parameter_tuning,
        optimize_feature_selection=optimize_feature_selection,
        param_dist=param_dist,
        calc_feature_importance=True,
        load_data=False,
        scoring=mean_squared_error,
        stratify=stratify,
        classification=False,
        missing_samples='ignore')

    # predict treatment data
    index = importances.index
    table = _extract_features(table)
    table = [{k: r[k] for k in r.keys() & index} for r in table]
    y_pred = estimator.predict(table)
    predicted_column = 'predicted {0}'.format(column)
    metadata[predicted_column] = y_pred

    # calculate MAZ score
    metadata = _maz_score(metadata, predicted_column, column, group_by,
                          control)

    # visualize
    table = estimator.named_steps.dv.transform(table).todense()
    table = pd.DataFrame(table,
                         index=metadata.index,
                         columns=estimator.named_steps.dv.get_feature_names())
    _visualize_maturity_index(table,
                              metadata,
                              group_by,
                              column,
                              predicted_column,
                              importances,
                              estimator,
                              accuracy,
                              output_dir,
                              maz_stats=maz_stats)
Example #19
def ctf_helper(
    table: biom.Table,
    sample_metadata: DataFrame,
    individual_id_column: str,
    state_columns: list,
    n_components: int = DEFAULT_COMP,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    max_iterations_als: int = DEFAULT_MAXITER,
    max_iterations_rptm: int = DEFAULT_MAXITER,
    n_initializations: int = DEFAULT_MAXITER,
    feature_metadata: DataFrame = DEFFM
) -> (dict, OrdinationResults, dict, tuple):
    """ Runs  Compositional Tensor Factorization CTF.
    """

    # validate the metadata using q2 as a wrapper
    if sample_metadata is not None and not isinstance(sample_metadata,
                                                      DataFrame):
        sample_metadata = sample_metadata.to_dataframe()
    keep_cols = state_columns + [individual_id_column]
    all_sample_metadata = sample_metadata.drop(keep_cols, axis=1)
    sample_metadata = sample_metadata[keep_cols]
    # validate the metadata using q2 as a wrapper
    if feature_metadata is not None and not isinstance(feature_metadata,
                                                       DataFrame):
        feature_metadata = feature_metadata.to_dataframe()
    # match the data (borrowed in part from gneiss.util.match)
    subtablefids = table.ids('observation')
    subtablesids = table.ids('sample')
    if len(subtablesids) != len(set(subtablesids)):
        raise ValueError('Data-table contains duplicate sample IDs')
    if len(subtablefids) != len(set(subtablefids)):
        raise ValueError('Data-table contains duplicate feature IDs')
    submetadataids = set(sample_metadata.index)
    subtablesids = set(subtablesids)
    subtablefids = set(subtablefids)
    if feature_metadata is not None:
        submetadatafeat = set(feature_metadata.index)
        fidx = subtablefids & submetadatafeat
        if len(fidx) == 0:
            raise ValueError(("No more features left.  Check to make "
                              "sure that the sample names between "
                              "`feature-metadata` and `table` are "
                              "consistent"))
        feature_metadata = feature_metadata.reindex(fidx)
    sidx = subtablesids & submetadataids
    if len(sidx) == 0:
        raise ValueError(("No more features left.  Check to make sure that "
                          "the sample names between `sample-metadata` and"
                          " `table` are consistent"))
    if feature_metadata is not None:
        table.filter(list(fidx), axis='observation', inplace=True)
    table.filter(list(sidx), axis='sample', inplace=True)
    sample_metadata = sample_metadata.reindex(sidx)

    # filter and import table
    for axis, min_sum in zip(['sample', 'observation'],
                             [min_sample_count, min_feature_count]):
        table = table.filter(table.ids(axis)[table.sum(axis) >= min_sum],
                             axis=axis,
                             inplace=True)

    # table to dataframe
    table = DataFrame(table.matrix_data.toarray(), table.ids('observation'),
                      table.ids('sample'))

    # tensor building
    tensor = build()
    tensor.construct(table, sample_metadata, individual_id_column,
                     state_columns)

    # factorize
    TF = TensorFactorization(n_components=n_components,
                             max_als_iterations=max_iterations_als,
                             max_rtpm_iterations=max_iterations_rptm,
                             n_initializations=n_initializations).fit(
                                 rclr(tensor.counts))
    # label tensor loadings
    TF.label(tensor, taxonomy=feature_metadata)

    # if the n_components is two add PC3 of zeros
    # this is referenced as in issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    if n_components == 2:
        TF.subjects.loc[:, 'PC3'] = [0] * len(TF.subjects.index)
        TF.features.loc[:, 'PC3'] = [0] * len(TF.features.index)
        TF.proportion_explained['PC3'] = 0
        TF.eigvals['PC3'] = 0

    # save ordination results
    short_method_name = 'CTF_Biplot'
    long_method_name = 'Compositional Tensor Factorization Biplot'
    # only keep PC -- other tools merge metadata
    keep_PC = [col for col in TF.features.columns if 'PC' in col]
    subj_ordin = OrdinationResults(
        short_method_name,
        long_method_name,
        TF.eigvals,
        samples=TF.subjects[keep_PC].dropna(axis=0),
        features=TF.features[keep_PC].dropna(axis=0),
        proportion_explained=TF.proportion_explained)
    # save distance matrix for each condition
    distances = {}
    state_ordn = {}
    subject_trajectories = {}
    feature_trajectories = {}
    for condition, cond, dist, straj, ftraj in zip(tensor.conditions,
                                                   TF.conditions,
                                                   TF.subject_distances,
                                                   TF.subject_trajectory,
                                                   TF.feature_trajectory):
        # match distances to metadata
        ids = straj.index
        ind_dict = dict((ind, ind_i) for ind_i, ind in enumerate(ids))
        inter = set(ind_dict).intersection(sample_metadata.index)
        indices = sorted([ind_dict[ind] for ind in inter])
        dist = dist[indices, :][:, indices]
        distances[condition] = skbio.stats.distance.DistanceMatrix(
            dist, ids=ids[indices])
        # fix conditions
        if n_components == 2:
            cond['PC3'] = [0] * len(cond.index)
        cond = OrdinationResults(short_method_name,
                                 long_method_name,
                                 TF.eigvals,
                                 samples=cond[keep_PC].dropna(axis=0),
                                 features=TF.features[keep_PC].dropna(axis=0),
                                 proportion_explained=TF.proportion_explained)
        state_ordn[condition] = cond
        # add the sample metadata before returning output
        # additionally only keep metadata with trajectory
        # output available.
        pre_merge_cols = list(straj.columns)
        straj = concat(
            [straj.reindex(all_sample_metadata.index), all_sample_metadata],
            axis=1,
            sort=True)
        straj = straj.dropna(subset=pre_merge_cols)
        # ensure index name for q2
        straj.index.name = "#SampleID"
        # save traj.
        keep_PC_traj = [col for col in straj.columns if 'PC' in col]
        straj[keep_PC_traj] -= straj[keep_PC_traj].mean()
        ftraj[keep_PC_traj] -= ftraj[keep_PC_traj].mean()
        subject_trajectories[condition] = straj
        ftraj.index = ftraj.index.astype(str)
        feature_trajectories[condition] = ftraj
    return (state_ordn, subj_ordin, distances, subject_trajectories,
            feature_trajectories)
Example #20
File: rpca.py  Project: mortonjt/gemelli
def rpca(
    table: biom.Table,
    n_components: Union[int, str] = DEFAULT_COMP,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    min_feature_frequency: float = DEFAULT_MFF,
    max_iterations: int = DEFAULT_OPTSPACE_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Runs RPCA with an matrix_rclr preprocessing step.

       This code will be run by both the standalone and QIIME 2 versions of
       gemelli.
    """
    # get shape of table
    n_features, n_samples = table.shape

    # filter sample to min seq. depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    # filter features to min total counts
    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # filter features by N samples presence
    def frequency_filter(val, id_, md):
        return (np.sum(val > 0) / n_samples) > (min_feature_frequency / 100)

    # filter and import table for each filter above
    table = table.filter(observation_filter, axis='observation')
    table = table.filter(frequency_filter, axis='observation')
    table = table.filter(sample_filter, axis='sample')
    # table to dataframe
    table = pd.DataFrame(table.matrix_data.toarray(), table.ids('observation'),
                         table.ids('sample')).T
    # check the table after filtering
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')
    # Robust-clt (matrix_rclr) preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(
                               matrix_rclr(table))
    # get new n-comp when applicable
    n_components = opt.s.shape[0]
    # get PC column labels for the skbio OrdinationResults
    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
    # get completed matrix for centering
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    # center again around zero after completion
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    # re-factor the data
    u, s, v = svd(X)
    # only take n-components
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    # calc. the new variance using projection
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]
    # save the loadings
    feature_loading = pd.DataFrame(v, index=table.columns, columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index, columns=rename_cols)
    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if the n_components is two add PC3 of zeros
    # this is referenced as in issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in gemelli -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res
Example #21
def rpca(
    table: biom.Table,
    n_components: int = DEFAULT_RANK,
    min_sample_count: int = DEFAULT_MSC,
    min_feature_count: int = DEFAULT_MFC,
    max_iterations: int = DEFAULT_ITERATIONS
) -> (skbio.OrdinationResults, skbio.DistanceMatrix):
    """Runs RPCA with an rclr preprocessing step.

       This code will be run by both the standalone and QIIME 2 versions of
       DEICODE.
    """

    # filter sample to min depth
    def sample_filter(val, id_, md):
        return sum(val) > min_sample_count

    def observation_filter(val, id_, md):
        return sum(val) > min_feature_count

    # filter and import table
    table = table.filter(observation_filter, axis='observation')
    table = table.filter(sample_filter, axis='sample')
    table = table.to_dataframe().T
    if len(table.index) != len(set(table.index)):
        raise ValueError('Data-table contains duplicate indices')
    if len(table.columns) != len(set(table.columns)):
        raise ValueError('Data-table contains duplicate columns')

    # rclr preprocessing and OptSpace (RPCA)
    opt = MatrixCompletion(n_components=n_components,
                           max_iterations=max_iterations).fit(rclr(table))

    rename_cols = ['PC' + str(i + 1) for i in range(n_components)]
    X = opt.sample_weights @ opt.s @ opt.feature_weights.T
    X = X - X.mean(axis=0)
    X = X - X.mean(axis=1).reshape(-1, 1)
    u, s, v = svd(X)
    u = u[:, :n_components]
    v = v.T[:, :n_components]
    p = s**2 / np.sum(s**2)
    p = p[:n_components]
    s = s[:n_components]
    feature_loading = pd.DataFrame(v, index=table.columns, columns=rename_cols)
    sample_loading = pd.DataFrame(u, index=table.index, columns=rename_cols)

    # % var explained
    proportion_explained = pd.Series(p, index=rename_cols)
    # get eigenvalues
    eigvals = pd.Series(s, index=rename_cols)

    # if the n_components is two add PC3 of zeros
    # this is referenced as in issue in
    # <https://github.com/biocore/emperor/commit
    # /a93f029548c421cb0ba365b4294f7a5a6b0209ce>
    # discussed in DEICODE -- PR#29
    if n_components == 2:
        feature_loading['PC3'] = [0] * len(feature_loading.index)
        sample_loading['PC3'] = [0] * len(sample_loading.index)
        eigvals.loc['PC3'] = 0
        proportion_explained.loc['PC3'] = 0

    # save ordination results
    short_method_name = 'rpca_biplot'
    long_method_name = '(Robust Aitchison) RPCA Biplot'
    ord_res = skbio.OrdinationResults(
        short_method_name,
        long_method_name,
        eigvals.copy(),
        samples=sample_loading.copy(),
        features=feature_loading.copy(),
        proportion_explained=proportion_explained.copy())
    # save distance matrix
    dist_res = skbio.stats.distance.DistanceMatrix(opt.distance,
                                                   ids=sample_loading.index)

    return ord_res, dist_res
Example #22
File: biom.py  Project: qiyunzhu/woltka
def collapse_biom(table: biom.Table, mapping: dict, divide=False, field=None):
    """Collapse a BIOM table in many-to-many mode.

    Parameters
    ----------
    table : biom.Table
        Table to collapse.
    mapping : dict of list of str
        Source-to-target(s) mapping.
    divide : bool, optional
        Whether to divide per-target counts by the number of targets
        per source.
    field : int, optional
        Index of field to be collapsed in a stratified table.

    Returns
    -------
    biom.Table
        Collapsed BIOM table.

    Raises
    ------
    ValueError
        Field index is not present in a feature ID.

    Notes
    -----
    Metadata will not be retained in the collapsed table.

    See Also
    --------
    .table.collapse_table
    """
    # generate metadata
    metadata = {}
    for id_ in table.ids('observation'):
        feature = id_
        if field:
            fields = feature.split('|')
            try:
                feature = fields[field]
            except IndexError:
                raise ValueError(
                    f'Feature "{feature}" has less than {field + 1} fields.')
        if feature not in mapping:
            continue
        targets = []
        for target in mapping[feature]:
            if field:
                fields[field] = target
                target = '|'.join(fields)
            targets.append(target)
        metadata[id_] = dict(part=targets)

    # filter table features
    table = table.filter(lambda data, id_, md: id_ in metadata,
                         axis='observation',
                         inplace=False)

    # stop if no feature left
    if table.is_empty():
        return table

    # add mapping to table metadata
    table.add_metadata(metadata, axis='observation')

    # determine collapsing method
    kwargs = dict(norm=False,
                  one_to_many=True,
                  axis='observation',
                  one_to_many_mode=('divide' if divide else 'add'))

    # collapse table in many-to-many mode
    table = table.collapse(lambda _, md: zip(md['part'], md['part']), **kwargs)

    # round to integers
    if divide:
        round_biom(table)

    # clean up
    table.del_metadata(keys=['Path'])
    return table
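
A minimal usage sketch, assuming collapse_biom() above is in scope. With
field=1 the second "|"-delimited field of each stratified feature ID is
collapsed while the first field is kept:

import numpy as np
import biom

tbl = biom.Table(np.array([[2, 0], [1, 3]]),
                 ['H1|G1', 'H1|G2'], ['S1', 'S2'])
mapping = {'G1': ['PwyX'], 'G2': ['PwyX', 'PwyY']}
out = collapse_biom(tbl, mapping, field=1)
print(sorted(out.ids(axis='observation')))  # ['H1|PwyX', 'H1|PwyY']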