Example #1
def aldex2(table: pd.DataFrame,
           metadata: qiime2.Metadata,
           condition: str,
           mc_samples: int = 128,
           test: str = 't',
           denom: str = 'all') -> pd.DataFrame:

    with tempfile.TemporaryDirectory() as temp_dir_name:
        biom_fp = os.path.join(temp_dir_name, 'input.tsv.biom')
        map_fp = os.path.join(temp_dir_name, 'input.map.txt')
        summary_fp = os.path.join(temp_dir_name, 'output.summary.txt')

        table.to_csv(biom_fp, sep='\t')
        metadata.to_dataframe().to_csv(map_fp, sep='\t')

        cmd = [
            'run_aldex2.R', biom_fp, map_fp, condition, mc_samples, test,
            denom, summary_fp
        ]
        cmd = list(map(str, cmd))

        try:
            run_commands([cmd])
        except subprocess.CalledProcessError as e:
            raise Exception("An error was encountered while running ALDEx2"
                            " in R (return code %d), please inspect stdout"
                            " and stderr to learn more." % e.returncode)

        summary = pd.read_csv(summary_fp, index_col=0)
        differentials = summary[['effect']]
        # don't return summary for now (TODO!)
        return differentials
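The `run_commands` helper is assumed from elsewhere in the plugin and not shown here; a minimal sketch consistent with how it is called above (the real implementation may differ):

import subprocess

def run_commands(cmds, verbose=True):
    # Run each external command; check=True raises CalledProcessError on a
    # non-zero exit so the caller can report the return code.
    for cmd in cmds:
        if verbose:
            print(' '.join(cmd))
        subprocess.run(cmd, check=True)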
Example #2
def _generic_plot(output_dir: str, master: skbio.OrdinationResults,
                  metadata: qiime2.Metadata,
                  other_pcoa: skbio.OrdinationResults, plot_name,
                  custom_axes: str = None,
                  feature_metadata: qiime2.Metadata = None):

    mf = metadata.to_dataframe()
    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    if other_pcoa is None:
        procrustes = None
    else:
        procrustes = [other_pcoa]

    viz = Emperor(master, mf, feature_mapping_file=feature_metadata,
                  procrustes=procrustes, remote='.')

    if custom_axes is not None:
        viz.custom_axes = custom_axes

    if other_pcoa:
        viz.procrustes_names = ['reference', 'other']

    html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={'plot_name': plot_name})
Example #3
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if more filtering is supported in the future.
    df = metadata.to_dataframe()
    df = df.dropna()
    metadata = qiime2.Metadata(df)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    df = metadata.to_dataframe()

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'initial_dm_length':
                           initial_dm_length,
                           'filtered_dm_length':
                           filtered_dm_length,
                           'non_numeric_cols':
                           ', '.join(sorted(non_numeric_cols)),
                           'zero_variance_cols':
                           ', '.join(sorted(zero_variance_cols)),
                           'result':
                           result
                       })
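A toy illustration of the column bookkeeping above, using invented metadata (assumes qiime2 is installed):

import pandas as pd
import qiime2

md = qiime2.Metadata(pd.DataFrame(
    {'ph': [6.8, 7.1, 6.9], 'site': ['A', 'B', 'A']},
    index=pd.Index(['s1', 's2', 's3'], name='id')))
pre_filtered_cols = set(md.columns)
md = md.filter_columns(column_type='numeric')
print(pre_filtered_cols - set(md.columns))  # {'site'}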
Example #4
def barplot(output_dir: str, table: pd.DataFrame, taxonomy: pd.Series,
            metadata: Metadata) -> None:
    metadata = metadata.to_dataframe()
    filenames = []
    collapsed_tables = _extract_to_level(taxonomy, table)

    for level, df in enumerate(collapsed_tables, 1):
        # Join collapsed table with metadata
        taxa_cols = df.columns.values.tolist()
        df = df.join(metadata, how='left')
        df = df.reset_index(drop=False)  # Move SampleID index into columns
        df = df.fillna('')  # JS sort works best with empty strings vs null
        all_cols = df.columns.values.tolist()

        filename = 'lvl-%d.jsonp' % level
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('Level %d'," % level)
            json.dump(taxa_cols, fh)
            fh.write(",")
            json.dump(all_cols, fh)
            fh.write(",")
            df.to_json(fh, orient='records')
            fh.write(");")

    # Now that the tables have been collapsed, write out the index template
    index = os.path.join(TEMPLATES, 'barplot', 'index.html')
    q2templates.render(index, output_dir, context={'filenames': filenames})

    # Copy assets for rendering figure
    shutil.copytree(os.path.join(TEMPLATES, 'barplot', 'dst'),
                    os.path.join(output_dir, 'dist'))
Example #5
def _generic_plot(output_dir: str, master: skbio.OrdinationResults,
                  metadata: qiime2.Metadata,
                  other_pcoa: skbio.OrdinationResults, plot_name,
                  custom_axes: str = None):

    mf = metadata.to_dataframe()

    if other_pcoa is None:
        procrustes = None
    else:
        procrustes = [other_pcoa]

    viz = Emperor(master, mf, procrustes=procrustes, remote='.')

    if custom_axes is not None:
        viz.custom_axes = custom_axes

    if other_pcoa:
        viz.procrustes_names = ['reference', 'other']

    html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={'plot_name': plot_name})
Example #6
def filter_results(
    results: GrowthResults,
    metadata: Metadata,
    query: str = None,
    exclude: bool = False,
) -> GrowthResults:
    """Filter samples from the simulation results."""
    sids = results.growth_rates.sample_id
    exchanges = results.exchanges
    rates = results.growth_rates
    metadata = metadata.to_dataframe()
    if query is not None:
        metadata = metadata.query(query)
    if exclude:
        filtered_sids = sids[~sids.isin(metadata.index)]
    else:
        filtered_sids = sids[sids.isin(metadata.index)]
    if len(filtered_sids) == 0:
        raise ValueError("There are no samples left after filtering :O")
    filtered_results = GrowthResults(
        growth_rates=rates[rates.sample_id.isin(filtered_sids)],
        exchanges=exchanges[exchanges.sample_id.isin(filtered_sids)],
        annotations=results.annotations,
    )
    return filtered_results
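The query/exclude logic reduces to plain pandas operations; a toy run with invented sample IDs:

import pandas as pd

metadata = pd.DataFrame({'disease_state': ['healthy', 'ill']},
                        index=['s1', 's2'])
sids = pd.Series(['s1', 's2', 's3'])
selected = metadata.query("disease_state == 'healthy'")
print(sids[sids.isin(selected.index)].tolist())   # ['s1'] (keep)
print(sids[~sids.isin(selected.index)].tolist())  # ['s2', 's3'] (exclude)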
Example #7
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # convert metadata to numeric values where applicable, drop the non-numeric
    # values, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = result.to_html(classes='table table-striped table-hover').replace(
        'border="1"', 'border="0"')

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(filtered_categorical_cols),
        'filtered_zero_variance_cols': ', '.join(filtered_zero_variance_cols),
        'result': result})
Example #8
def tabulate(output_dir: str,
             input: qiime2.Metadata,
             page_size: int = 100) -> None:
    if page_size < 1:
        raise ValueError('Cannot render less than one record per page.')

    df = input.to_dataframe()
    df.reset_index(inplace=True)
    table = df.to_json(orient='split')
    # JSON spec doesn't allow single quotes in string values, at all. It does
    # however allow unicode values.
    table = table.replace("'", r'\u0027')

    index = os.path.join(TEMPLATES, 'tabulate', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'table': table,
                           'page_size': page_size
                       })

    js = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.js')
    os.mkdir(os.path.join(output_dir, 'js'))
    shutil.copy(js, os.path.join(output_dir, 'js', 'datatables.min.js'))

    css = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.css')
    os.mkdir(os.path.join(output_dir, 'css'))
    shutil.copy(css, os.path.join(output_dir, 'css', 'datatables.min.css'))
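A quick check of the single-quote escaping (illustrative only; JSON decoders turn \u0027 back into an apostrophe):

table = '{"note": "it\'s fine"}'
print(table.replace("'", r'\u0027'))  # {"note": "it\u0027s fine"}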
Example #9
def first_differences(metadata: qiime2.Metadata,
                      state_column: str,
                      individual_id_column: str,
                      metric: str,
                      replicate_handling: str = 'error',
                      baseline: float = None,
                      table: pd.DataFrame = None) -> pd.Series:

    # find metric in metadata or derive from table and merge into metadata
    if table is not None:
        _validate_metadata_is_superset(metadata.to_dataframe(), table)
        metadata = _add_metric_to_metadata(table, metadata, metric)
    else:
        metadata = _load_metadata(metadata)
        _validate_is_numeric_column(metadata, metric)

    # validate columns
    _validate_input_columns(metadata, individual_id_column, None, state_column,
                            metric)

    return _first_differences(metadata,
                              state_column,
                              individual_id_column,
                              metric,
                              replicate_handling,
                              baseline=baseline,
                              distance_matrix=None)
Example #10
def tabulate(output_dir: str,
             input: qiime2.Metadata,
             page_size: int = 100) -> None:
    if page_size < 1:
        raise ValueError('Cannot render less than one record per page.')

    df = input.to_dataframe()
    df_columns = pd.MultiIndex.from_tuples([(n, t.type)
                                            for n, t in input.columns.items()],
                                           names=['column header', 'type'])
    df.columns = df_columns
    df.reset_index(inplace=True)
    table = df.to_json(orient='split')
    index = os.path.join(TEMPLATES, 'tabulate', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'table': table,
                           'page_size': page_size
                       })

    input.save(os.path.join(output_dir, 'metadata.tsv'))

    js = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.js')
    os.mkdir(os.path.join(output_dir, 'js'))
    shutil.copy(js, os.path.join(output_dir, 'js', 'datatables.min.js'))

    css = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.css')
    os.mkdir(os.path.join(output_dir, 'css'))
    shutil.copy(css, os.path.join(output_dir, 'css', 'datatables.min.css'))
Example #11
def quadtree(metadata: qiime2.Metadata, y_coord: str, x_coord: str,
             threshold: int) -> (skbio.TreeNode, pd.DataFrame):
    metadata = metadata.to_dataframe()
    index = metadata.index.name
    cleaned_df = clean(metadata, y_coord, x_coord)
    tree, samples = get_results(cleaned_df, threshold, index)
    return tree, samples
Example #12
def normalize(metadata: q2.Metadata, rules_dir: q2.plugin.Str) -> q2.Metadata:
    """
    Parameters
    ----------
    metadata : q2.Metadata
        The sample metadata.
    rules_dir : q2.plugin.Str
        The path to the yaml rules folder.

    Returns
    -------
    metadata_curated : q2.Metadata
        Curated metadata table.
    """

    # TEMPORARY FUNCTION TO PASS THE DEFAULT FOLDER CONTAINING OUR 8 RULES
    # (A REAL USER SHOULD PASS ANOTHER FOLDER LOCATION TO '--p-rules-dir')
    variables_rules_dir = get_variables_rules_dir(rules_dir, RULES)

    # Collect rules from yaml files folder by instantiating a class
    rules = RulesCollection(variables_rules_dir)

    # Get metadata as pandas data frame
    md = metadata.to_dataframe()

    # get metadata variables that have rules
    focus = get_intersection(rules.get_variables_names(), md.columns.tolist())

    # apply rules one variable at a time
    # for variable in focus:
    #     md[variable] = rules.normalize(variable, md[variable])

    # only during dev so that the function returns something :)
    md_out = pd.DataFrame()
    return q2.Metadata(md_out)
Example #13
def filter_models(
    models: CommunityModelDirectory,
    metadata: Metadata,
    query: str = None,
    exclude: bool = False,
) -> CommunityModelDirectory:
    """Filter samples from a set of community models."""
    manifest = models.manifest.view(pd.DataFrame)
    metadata = metadata.to_dataframe()
    if query is not None:
        metadata = metadata.query(query)
    if exclude:
        filtered_manifest = manifest[~manifest.sample_id.isin(metadata.index)]
    else:
        filtered_manifest = manifest[manifest.sample_id.isin(metadata.index)]
    if filtered_manifest.shape[0] == 0:
        raise ValueError("There are no samples left after filtering :O")
    out = CommunityModelDirectory()
    filtered_manifest.to_csv(out.manifest.path_maker())
    filtered_manifest.sample_id.apply(
        lambda sid: shutil.copy(
            models.model_files.path_maker(model_id=sid),
            out.model_files.path_maker(model_id=sid),
        )
    )
    return out
Example #14
def community_plot(output_dir: str,
                   tree: NewickFormat,
                   feature_table: biom.Table,
                   sample_metadata: qiime2.Metadata,
                   pcoa: OrdinationResults = None,
                   feature_metadata: qiime2.Metadata = None,
                   ignore_missing_samples: bool = False,
                   filter_extra_samples: bool = False,
                   filter_missing_features: bool = False,
                   number_of_features: int = 5,
                   shear_tree: bool = True) -> None:
    """Visualizes a tree alongside community-level data.

       The functionality available in this visualization is a superset of the
       functionality in tree_plot() -- including sample metadata coloring /
       barplots, animations, and Emperor integration support.
    """
    if pcoa is not None and pcoa.features is not None:
        # select the top N most important features based on the vector's
        # magnitude (copied from q2-emperor)
        feats = pcoa.features.copy()
        # in cases where the axes are all zero there might be all-NA
        # columns
        feats.fillna(0, inplace=True)
        origin = np.zeros_like(feats.columns)
        feats['importance'] = feats.apply(euclidean, axis=1, args=(origin, ))
        feats.sort_values('importance', inplace=True, ascending=False)
        feats.drop(['importance'], inplace=True, axis=1)
        pcoa.features = feats[:number_of_features].copy()

    sample_metadata = sample_metadata.to_dataframe()

    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    t = get_bp(tree)
    viz = Empress(tree=t,
                  table=feature_table,
                  sample_metadata=sample_metadata,
                  feature_metadata=feature_metadata,
                  ordination=pcoa,
                  ignore_missing_samples=ignore_missing_samples,
                  filter_extra_samples=filter_extra_samples,
                  filter_missing_features=filter_missing_features,
                  shear_tree=shear_tree)
    save_viz(viz, output_dir)
Example #15
def plot(output_dir: str,
         tree: NewickFormat,
         feature_table: pd.DataFrame,
         sample_metadata: qiime2.Metadata,
         pcoa: OrdinationResults = None,
         feature_metadata: qiime2.Metadata = None,
         ignore_missing_samples: bool = False,
         filter_missing_features: bool = False,
         number_of_features: int = 5,
         filter_unobserved_features_from_phylogeny: bool = True) -> None:

    if pcoa is not None and pcoa.features is not None:
        # select the top N most important features based on the vector's
        # magnitude (copied from q2-emperor)
        feats = pcoa.features.copy()
        origin = np.zeros_like(feats.columns)
        feats['importance'] = feats.apply(euclidean, axis=1, args=(origin, ))
        feats.sort_values('importance', inplace=True, ascending=False)
        feats.drop(['importance'], inplace=True, axis=1)
        pcoa.features = feats[:number_of_features].copy()

    sample_metadata = sample_metadata.to_dataframe()

    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    # path to the actual newick file
    with open(str(tree)) as file:
        t = parse_newick(file.readline())
    trim_tree = filter_unobserved_features_from_phylogeny
    viz = Empress(tree=t,
                  table=feature_table,
                  sample_metadata=sample_metadata,
                  feature_metadata=feature_metadata,
                  ordination=pcoa,
                  ignore_missing_samples=ignore_missing_samples,
                  filter_missing_features=filter_missing_features,
                  filter_unobserved_features_from_phylogeny=trim_tree)

    with open(os.path.join(output_dir, 'empress.html'), 'w') as file:
        file.write(str(viz))

    viz.copy_support_files(output_dir)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir)
Example #16
def parse_meta(
        query_results: MetaStormsSearchResultsDirFmt,
        metadata: Metadata,
        number_predicted: str = _default_params['number_predicted'],
        base_of_similarity: str = _default_params['base_of_similarity'],
        max_number_matches: str = _default_params['max_number_matches'],
        number_of_skipped: str = _default_params['number_of_skipped']) -> str:
    tmpdir = tempfile.mkdtemp()
    qr_path = os.path.join(str(query_results), 'query.out')
    result_fname = os.path.join(tmpdir, 'query.out.meta')
    md_fname = os.path.join(tmpdir, 'metadata.tsv')
    metadata.to_dataframe().to_csv(md_fname, sep='\t', index=True, header=True)
    run_command(
        _build_parse_meta_command(qr_path, md_fname, result_fname,
                                  number_predicted, base_of_similarity,
                                  max_number_matches, number_of_skipped))
    return result_fname
Example #17
def label_seqs(seqs: pd.Series, delimiter: str,
               metadata: qiime2.Metadata = None, columns: str = None,
               missing_value: str = 'missing') -> pd.Series:
    if (columns is not None and metadata is None) \
            or (metadata is not None and columns is None):
        raise ValueError('Columns and metadata must be passed or not passed '
                         'together.')

    if delimiter in missing_value:
        raise ValueError(f'The provided delimiter ({repr(delimiter)}) cannot '
                         'be contained in the missing value placeholder '
                         f'({repr(missing_value)}).')

    # This is necessary because QIIME 2 will not accept an empty list as an
    # argument of type List[str]
    if columns is None:
        columns = []

    # Make sure we have strings at this point not skbio DNA objects because we
    # experienced a bizarre segmentation fault while using DNA objects
    seqs = seqs.apply(str)
    seqs.index = seqs.index.map(lambda x: x.split(delimiter)[0])

    if metadata is not None:
        md_df = metadata.to_dataframe()

        for column in columns:
            if column not in md_df.columns:
                raise ValueError(f'The column {repr(column)} is not present '
                                 'in the metadata')

        missing_ids = seqs.index.difference(md_df.index)
        if len(missing_ids):
            difference = \
                ' '.join(repr(value) for value in missing_ids.values[0:10])
            additional_missing = len(missing_ids.values[10:])

            error_message = ('The following ids are present in the sequences '
                             f'but not the metadata {difference}')

            if additional_missing > 0:
                error_message += (f' ({additional_missing} additional ids are'
                                  ' missing from metadata but omitted from'
                                  ' this list)')

            raise ValueError(error_message)
    else:
        md_df = pd.DataFrame({}, index=seqs.index)

    selected = md_df[columns]
    selected = selected.fillna(missing_value)
    rename = pd.Series([delimiter.join(row) for row in selected.itertuples()],
                       index=selected.index)
    seqs.index = seqs.index.map(rename)

    return seqs
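The index relabeling at the end maps each old ID through a rename series; a toy run with delimiter '|' and invented metadata:

import pandas as pd

seqs = pd.Series({'seq1': 'ACGT', 'seq2': 'GGCC'})
md_df = pd.DataFrame({'species': ['E. coli', None]},
                     index=['seq1', 'seq2'])
selected = md_df[['species']].fillna('missing')
rename = pd.Series(['|'.join(row) for row in selected.itertuples()],
                   index=selected.index)
seqs.index = seqs.index.map(rename)
print(seqs.index.tolist())  # ['seq1|E. coli', 'seq2|missing']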
Example #18
def db(meta: Metadata, rank: str = "genus", threads: int = 1) -> JSONDirectory:
    """Create a model database from a set of SBML files."""
    meta = meta.to_dataframe()
    json_dir = JSONDirectory()
    path = str(json_dir.json_files.path_maker(model_id="dummy"))
    path = os.path.dirname(path)
    meta = build_database(meta, path, rank, threads, compress=False)
    os.rename(os.path.join(path, "manifest.csv"),
              json_dir.manifest.path_maker())
    return json_dir
Example #19
def generic_plot(output_dir: str,
                 master: skbio.OrdinationResults,
                 metadata: qiime2.Metadata,
                 other_pcoa: skbio.OrdinationResults,
                 plot_name: str,
                 info: str = None,
                 custom_axes: str = None,
                 settings: dict = None,
                 ignore_missing_samples: bool = False,
                 feature_metadata: qiime2.Metadata = None):

    mf = metadata.to_dataframe()
    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    if other_pcoa is None:
        procrustes = None
    else:
        procrustes = [other_pcoa]

    viz = Emperor(master,
                  mf,
                  feature_mapping_file=feature_metadata,
                  ignore_missing_samples=ignore_missing_samples,
                  procrustes=procrustes,
                  remote='.')

    if custom_axes is not None:
        viz.custom_axes = custom_axes

    if other_pcoa:
        viz.procrustes_names = ['reference', 'other']

    viz.info = info
    viz.settings = settings

    html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir, context={'plot_name': plot_name})
Example #20
def pibble(table: pd.DataFrame,
           metadata: qiime2.Metadata,
           formula: str,
           learning_rate: float = 1e-3,
           beta1: float = 0.9,
           beta2: float = 0.99) -> (pd.DataFrame, pd.DataFrame):

    with tempfile.TemporaryDirectory() as temp_dir_name:
        biom_fp = os.path.join(temp_dir_name, 'input.tsv.biom')
        map_fp = os.path.join(temp_dir_name, 'input.map.txt')
        differential_fp = os.path.join(temp_dir_name,
                                       'output.differential.csv')
        posterior_fp = os.path.join(temp_dir_name, 'output.posterior.csv')

        table.to_csv(biom_fp, sep='\t')
        metadata.to_dataframe().to_csv(map_fp, sep='\t')

        cmd = [
            'run_pibble.R', biom_fp, map_fp, formula, learning_rate, beta1,
            beta2, differential_fp, posterior_fp
        ]
        cmd = list(map(str, cmd))

        try:
            run_commands([cmd])
        except subprocess.CalledProcessError as e:
            raise Exception("An error was encountered while running stray"
                            " in R (return code %d), please inspect stdout"
                            " and stderr to learn more." % e.returncode)

        lam_summary = pd.read_csv(differential_fp, index_col=0)

        alr_diffs = lam_summary[['covariate', 'coord', 'mean']]
        alr_diffs = alr_diffs.pivot('coord', 'covariate', 'mean')
        diffs = np.vstack((alr_diffs.values, np.zeros(alr_diffs.shape[1])))
        # convert from alr to clr coordinates
        diffs = diffs - diffs.mean(axis=0)
        differential = pd.DataFrame(diffs,
                                    columns=alr_diffs.columns,
                                    index=table.index)

        posterior = pd.read_csv(posterior_fp, index_col=0)
        return differential, posterior
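The zero row stands in for the ALR reference feature, and centering the stacked matrix converts the coordinates to CLR; a numeric sanity check:

import numpy as np

alr = np.array([[0.5], [-0.2]])  # two ALR coordinates, one covariate
full = np.vstack((alr, np.zeros(alr.shape[1])))
clr = full - full.mean(axis=0)
print(clr.sum(axis=0))  # [0.], CLR coordinates sum to zero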
Example #21
def barplot(output_dir: str,
            proportions: pd.DataFrame,
            sample_metadata: Metadata,
            category_column: str = DEFAULT_CAT) -> None:

    # scriptable metadata
    sample_metadata = sample_metadata.to_dataframe()

    # make the sample metadata
    # check if proportion index in metadata index
    if any(i in sample_metadata.index for i in proportions.columns):
        # then subset sample metadata by index
        mf_samples = sample_metadata.loc[proportions.columns, :]
        mf_samples.index.name = 'sampleid'
    else:
        # else subset sample metadata by category (in loo case)
        keep_ = sample_metadata[category_column].isin(proportions.columns)
        mf_samples = sample_metadata[keep_]
        mf_samples = mf_samples.set_index(category_column)
        mf_samples = mf_samples.loc[~mf_samples.index.duplicated(keep='first')]
        mf_samples[category_column] = list(mf_samples.index)
        mf_samples = mf_samples[mf_samples.columns[::-1]]
        mf_samples.index.name = 'sampleid'

    # make the feature metadata (mock taxonomy)
    keep_ = sample_metadata[category_column].isin(proportions.index)
    mf_feature = sample_metadata[keep_]
    mf_feature = mf_feature.set_index(category_column)
    mf_feature = mf_feature.loc[~mf_feature.index.duplicated(keep='first')]
    mf_feature.loc['Unknown', :] = 'Unknown'
    mf_feature[category_column] = list(mf_feature.index)
    mf_feature = mf_feature[mf_feature.columns[::-1]]
    mf_feature = mf_feature.astype(str).apply(lambda x: '; '.join(x), axis=1)
    mf_feature = pd.DataFrame(mf_feature, columns=['Taxon'])
    mf_feature.index.name = 'Feature ID'

    # make barplot
    _barplot(output_dir, proportions.T, pd.Series(mf_feature.Taxon),
             Metadata(mf_samples))

    # grab bundle location to fix
    bundle = os.path.join(output_dir, 'dist', 'bundle.js')
    # bundle terms to fix for our purpose
    bundle_rplc = {
        'Relative Frequency': 'Source Contribution',
        'Taxonomic Level': 'Source Grouping',
        'Sample': 'Sink'
    }
    # make small text change to bundle
    with open(bundle) as f:
        newText = f.read()
        for prev, repl in bundle_rplc.items():
            newText = newText.replace(prev, repl)
    with open(bundle, "w") as f:
        f.write(newText)
Example #22
def tree_plot(output_dir: str,
              tree: NewickFormat,
              feature_metadata: qiime2.Metadata = None) -> None:
    """Visualizes a tree (optionally with feature metadata)."""

    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    t = get_bp(tree)
    viz = Empress(tree=t, feature_metadata=feature_metadata)
    save_viz(viz, output_dir)
Example #23
def sample_random(ids: qiime2.Metadata, n: int, seed: int = None) \
        -> IDSelection:
    if n > ids.id_count:
        raise ValueError("Value for n is larger than the number of IDs"
                         " present")

    df = ids.to_dataframe()
    samples = df.sample(n, replace=False, random_state=seed)
    inclusion = pd.Series(False, index=df.index)
    inclusion[samples.index] = True

    return IDSelection(inclusion, ids, "sample_random")
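The boolean inclusion vector is reproducible for a fixed seed; a minimal demonstration:

import pandas as pd

df = pd.DataFrame(index=['a', 'b', 'c', 'd'])
samples = df.sample(2, replace=False, random_state=42)
inclusion = pd.Series(False, index=df.index)
inclusion[samples.index] = True
print(inclusion.sum())  # 2, identical across runs with the same seed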
Example #24
def augment(table: biom.Table, sampling_depth: int, augment_times: int,
            output_path_metadata: str, raw_metadata: qiime2.Metadata,
            with_replacement: bool = False,
            rarefy_start: bool = True) -> biom.Table:

    metadata = raw_metadata.to_dataframe()
    metadata = metadata.sort_index()

    all_df = table.to_dataframe().sort_index().sort_index(axis=1)
    # change the sorted table back to biom
    table = biom.Table(all_df.values, all_df.index.to_list(),
                       all_df.columns.to_list())

    zero_df = all_df[all_df == 0].fillna(0)
    zero_table = biom.Table(zero_df.values, zero_df.index.to_list(),
                            zero_df.columns.to_list())

    sub_table = table.subsample(sampling_depth, axis='sample', by_id=False,
            with_replacement=with_replacement)

    if rarefy_start:
        output_table = zero_table.merge(sub_table)
    else:
        output_table = table

    output_metadata = metadata

    for i in range(augment_times):
        num = i+1
        sub_table = table.subsample(sampling_depth, axis='sample', by_id=False,
              with_replacement=with_replacement)
        sub_df = sub_table.to_dataframe().sort_index().sort_index(axis=1)

        # rename the augmented samples so their IDs stay unique
        sub_df_names = sub_df.columns.to_list()
        sub_df_names_added = [x + '_' + str(num) for x in sub_df_names]

        sub_df.columns = sub_df_names_added
        sub_table = biom.Table(sub_df.values, sub_df.index.to_list(),
                               sub_df.columns.to_list())
        output_table = output_table.merge(sub_table)

        metadata_names = metadata.index.to_list()
        metadata_names_added = [x + '_' + str(num) for x in metadata_names]

        tmp_metadata = metadata.copy()
        tmp_metadata.index = metadata_names_added
        output_metadata = pd.concat((output_metadata, tmp_metadata))

    output_metadata.index.name = 'sample-id'
    output_metadata = qiime2.metadata.Metadata(output_metadata)
    output_metadata.save(output_path_metadata)

    if output_table.is_empty():
        raise ValueError('The output table contains no features.')

    return output_table
Example #25
def report(output_dir: str, pcoa: skbio.OrdinationResults, metadata: Metadata,
           alpha: pd.Series, table: biom.Table, taxonomy: pd.Series,
           samples: list) -> None:
    metadata = metadata.to_dataframe()

    _insanity_checker(samples, metadata, table, alpha, pcoa)

    index = os.path.join(TEMPLATES, 'report', 'index.html')
    q2templates.render(index, output_dir, context={'name': 'foo'})

    # Copy assets for rendering figure
    shutil.copytree(os.path.join(TEMPLATES, 'report', 'resources'),
                    os.path.join(output_dir, 'resources'))
Example #26
def anova(output_dir: str,
          metadata: qiime2.Metadata,
          formula: str,
          sstype: str = 'II') -> None:

    # Grab metric and covariate names from formula
    metric, group_columns = _parse_formula(formula)
    columns = [metric] + list(group_columns)

    # Validate formula (columns are in metadata, etc)
    for col in columns:
        metadata.get_column(col)
    # store categorical column names for later use
    cats = metadata.filter_columns(column_type='categorical').columns.keys()
    metadata = metadata.to_dataframe()[columns].dropna()

    # Run anova
    lm = ols(formula, metadata).fit()
    results = pd.DataFrame(sm.stats.anova_lm(lm, typ=sstype)).fillna('')
    results.to_csv(os.path.join(output_dir, 'anova.tsv'), sep='\t')

    # Run pairwise t-tests with multiple test correction
    pairwise_tests = pd.DataFrame()
    for group in group_columns:
        # only run on categorical columns; numeric columns raise an error
        if group in cats:
            ttests = lm.t_test_pairwise(group, method='fdr_bh').result_frame
            pairwise_tests = pd.concat([pairwise_tests, pd.DataFrame(ttests)])
    if pairwise_tests.empty:
        pairwise_tests = False

    # Plot fit vs. residuals
    metadata['residual'] = lm.resid
    metadata['fitted_values'] = lm.fittedvalues
    res = _regplot_subplots_from_dataframe('fitted_values',
                                           'residual',
                                           metadata,
                                           group_columns,
                                           lowess=False,
                                           ci=95,
                                           palette='Set1',
                                           fit_reg=False)

    # Visualize results
    _visualize_anova(output_dir,
                     pairwise_tests=pairwise_tests,
                     model_results=results,
                     residuals=res,
                     pairwise_test_name='Pairwise t-tests')
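`_parse_formula` is defined elsewhere; a plausible sketch of its contract, splitting a patsy-style formula into the response name and predictor names (the real helper may handle more syntax):

import re

def _parse_formula(formula):
    # 'metric ~ groupA + groupB' -> ('metric', {'groupA', 'groupB'})
    metric, _, rhs = formula.partition('~')
    group_columns = {term.strip() for term in re.split(r'[+*:]', rhs)}
    return metric.strip(), group_columns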
Example #27
def barplot(output_dir: str, table: pd.DataFrame, taxonomy: pd.Series,
            metadata: Metadata = None) -> None:

    if metadata is None:
        metadata = Metadata(pd.DataFrame({'id': table.index}).set_index('id'))

    ids_not_in_metadata = set(table.index) - set(metadata.ids)
    if ids_not_in_metadata:
        raise ValueError('Sample IDs found in the table are missing in the '
                         f'metadata: {ids_not_in_metadata!r}.')

    metadata = metadata.to_dataframe()
    jsonp_files, csv_files = [], []
    collapsed_tables = _extract_to_level(taxonomy, table)

    for level, df in enumerate(collapsed_tables, 1):
        # Stash column labels before manipulating dataframe
        taxa_cols = df.columns.values.tolist()
        # Join collapsed table with metadata
        df = df.join(metadata, how='left')
        df = df.reset_index(drop=False)  # Move index into columns
        # Our JS sort works best with empty strings vs nulls
        df = df.fillna('')
        all_cols = df.columns.values.tolist()

        jsonp_file = 'level-%d.jsonp' % level
        csv_file = 'level-%d.csv' % level

        jsonp_files.append(jsonp_file)
        csv_files.append(csv_file)

        df.to_csv(os.path.join(output_dir, csv_file), index=False)

        with open(os.path.join(output_dir, jsonp_file), 'w') as fh:
            fh.write('load_data(%d,' % level)
            json.dump(taxa_cols, fh)
            fh.write(',')
            json.dump(all_cols, fh)
            fh.write(',')
            df.to_json(fh, orient='records')
            fh.write(');')

    # Now that the tables have been collapsed, write out the index template
    index = os.path.join(TEMPLATES, 'barplot', 'index.html')
    q2templates.render(index, output_dir, context={'jsonp_files': jsonp_files})

    # Copy assets for rendering figure
    shutil.copytree(os.path.join(TEMPLATES, 'barplot', 'dist'),
                    os.path.join(output_dir, 'dist'))
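What one generated JSONP payload looks like, reproduced on a toy frame:

import io
import json
import pandas as pd

df = pd.DataFrame({'k__Bacteria': [10], 'body-site': ['gut']})
fh = io.StringIO()
fh.write('load_data(%d,' % 1)
json.dump(['k__Bacteria'], fh)
fh.write(',')
json.dump(df.columns.tolist(), fh)
fh.write(',')
df.to_json(fh, orient='records')
fh.write(');')
print(fh.getvalue())
# load_data(1,["k__Bacteria"],["k__Bacteria","body-site"],
#           [{"k__Bacteria":10,"body-site":"gut"}]);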
Example #28
def plot(output_dir: str, model: STLDirFmt,
         metadata: qiime2.Metadata) -> None:
    mf = metadata.to_dataframe()

    ili_path = os.path.join(ASSETS, 'ili')

    # copy the ili contents into the output folder
    copy_tree(ili_path, output_dir)

    stl = os.path.join(str(model.path), 'model.stl')

    # we save the data to the workers folder since that's where the files are
    # loaded from, and to avoid requests to external sites, etc.
    mf.to_csv(os.path.join(output_dir, 'js/workers', 'features.csv'))
    copyfile(stl, os.path.join(output_dir, 'js/workers', 'model.stl'))
Example #29
File: _plot.py  Project: wxhyihuan/empress
def community_plot(output_dir: str,
                   tree: NewickFormat,
                   feature_table: biom.Table,
                   sample_metadata: qiime2.Metadata,
                   pcoa: OrdinationResults = None,
                   feature_metadata: qiime2.Metadata = None,
                   ignore_missing_samples: bool = False,
                   filter_extra_samples: bool = False,
                   filter_missing_features: bool = False,
                   number_of_features: int = 5,
                   shear_to_table: bool = True) -> None:
    """Visualizes a tree alongside community-level data.

       The functionality available in this visualization is a superset of the
       functionality in tree_plot() -- including sample metadata coloring /
       barplots, animations, and Emperor integration support.
    """
    if pcoa is not None and pcoa.features is not None:
        pcoa = prepare_pcoa(pcoa, number_of_features)

    sample_metadata = sample_metadata.to_dataframe()

    if feature_metadata is not None:
        feature_metadata = feature_metadata.to_dataframe()

    t = get_bp(tree)
    viz = Empress(tree=t,
                  table=feature_table,
                  sample_metadata=sample_metadata,
                  feature_metadata=feature_metadata,
                  ordination=pcoa,
                  ignore_missing_samples=ignore_missing_samples,
                  filter_extra_samples=filter_extra_samples,
                  filter_missing_features=filter_missing_features,
                  shear_to_table=shear_to_table)
    save_viz(viz, output_dir)
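`prepare_pcoa` appears to factor out the inline biplot-feature selection from the earlier community_plot example; a sketch based on that code:

import numpy as np
from scipy.spatial.distance import euclidean

def prepare_pcoa(pcoa, number_of_features):
    # Rank biplot features by the Euclidean magnitude of their loading
    # vectors and keep only the top N.
    feats = pcoa.features.copy()
    feats.fillna(0, inplace=True)
    origin = np.zeros_like(feats.columns)
    feats['importance'] = feats.apply(euclidean, axis=1, args=(origin,))
    feats.sort_values('importance', inplace=True, ascending=False)
    feats.drop(['importance'], inplace=True, axis=1)
    pcoa.features = feats[:number_of_features].copy()
    return pcoa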
Example #30
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    # Validate sample metadata is superset et cetera
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))
    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            column = metadata.get_column(i.name())
            if column.has_missing_values():
                raise ValueError(
                    'adonis requires metadata columns with no '
                    'NaN values (missing values in column `%s`.)' %
                    (column.name, ))

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = [
            'run_adonis.R', dm_fp, md_fp, formula,
            str(permutations),
            str(n_jobs), results_fp
        ]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
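`_validate_metadata_is_superset` is assumed from elsewhere in the plugin; a minimal sketch consistent with how it is called on ID sets here:

def _validate_metadata_is_superset(metadata_ids, table_ids):
    # Hypothetical sketch: fail when the distance matrix or table contains
    # IDs that the metadata does not cover.
    missing = table_ids - metadata_ids
    if missing:
        raise ValueError('Missing samples in metadata: %s'
                         % ', '.join(sorted(missing)))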
Example #31
def plot(output_dir: str, pcoa: skbio.OrdinationResults,
         metadata: qiime2.Metadata, custom_axis: str = None) -> None:

    mf = metadata.to_dataframe()
    viz = Emperor(pcoa, mf, remote='.')

    if custom_axis is not None:
        # put custom_axis inside a list to workaround the type system not
        # supporting lists of types
        html = viz.make_emperor(standalone=True, custom_axes=[custom_axis])
    else:
        html = viz.make_emperor(standalone=True)
    viz.copy_support_files(output_dir)
    with open(os.path.join(output_dir, 'emperor.html'), 'w') as fh:
        fh.write(html)

    index = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index, output_dir)
Example #32
def tabulate(output_dir: str, input: qiime2.Metadata,
             page_size: int = 100) -> None:
    if page_size < 1:
        raise ValueError('Cannot render less than one record per page.')

    df = input.to_dataframe()
    df.reset_index(inplace=True)
    table = df.to_json(orient='split')

    index = os.path.join(TEMPLATES, 'tabulate', 'index.html')
    q2templates.render(index, output_dir,
                       context={'table': table, 'page_size': page_size})

    js = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.js')
    os.mkdir(os.path.join(output_dir, 'js'))
    shutil.copy(js, os.path.join(output_dir, 'js', 'datatables.min.js'))

    css = os.path.join(TEMPLATES, 'tabulate', 'datatables.min.css')
    os.mkdir(os.path.join(output_dir, 'css'))
    shutil.copy(css, os.path.join(output_dir, 'css', 'datatables.min.css'))
Example #33
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the distance matrix.
    # Also ensures every distance matrix ID is present in the metadata.
    metadata = metadata.filter_ids(distance_matrix.ids)

    # drop non-numeric columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric')
    non_numeric_cols = pre_filtered_cols - set(metadata.columns)

    # filter 0 variance numerical columns and empty columns
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_zero_variance=True,
                                       drop_all_missing=True)
    zero_variance_cols = pre_filtered_cols - set(metadata.columns)

    # Drop samples that have any missing values.
    # TODO use Metadata API if this type of filtering is supported in the
    # future.
    df = metadata.to_dataframe()
    df = df.dropna(axis='index', how='any')

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the filtering
    # so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = q2templates.df_to_html(result)

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'non_numeric_cols': ', '.join(sorted(non_numeric_cols)),
        'zero_variance_cols': ', '.join(sorted(zero_variance_cols)),
        'result': result})
Example #34
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    # Validate sample metadata is superset et cetera
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))
    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            metadata.get_column(i.name())

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
Example #35
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode = None, metrics: set = None,
                      metadata: qiime2.Metadata = None, min_depth: int = 1,
                      steps: int = 10, iterations: int = 10) -> None:

    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))

    if metadata is None:
        columns, filtered_columns = set(), set()
    else:
        # Filter metadata to only include sample IDs present in the feature
        # table. Also ensures every feature table sample ID is present in the
        # metadata.
        metadata = metadata.filter_ids(table.ids(axis='sample'))

        # Drop metadata columns that aren't categorical, or consist solely of
        # missing values.
        pre_filtered_cols = set(metadata.columns)
        metadata = metadata.filter_columns(column_type='categorical',
                                           drop_all_missing=True)
        filtered_columns = pre_filtered_cols - set(metadata.columns)

        metadata_df = metadata.to_dataframe()
        if metadata_df.empty or len(metadata.columns) == 0:
            raise ValueError("All metadata filtered after dropping columns "
                             "that contained non-categorical data.")
        metadata_df.columns = pd.MultiIndex.from_tuples(
            [(c, '') for c in metadata_df.columns])
        columns = metadata_df.columns.get_level_values(0)

    data = _compute_rarefaction_data(table, min_depth, max_depth,
                                     steps, iterations, phylogeny, metrics)

    filenames = []
    for m, data in data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            merged = data.join(metadata_df, how='left')
            for column in columns:
                column_name = quote(column)
                reindexed_df, counts = _reindex_with_metadata(column,
                                                              columns,
                                                              merged)
                c_df = _compute_summary(reindexed_df, column, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, column_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, column)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': [quote(f) for f in filenames],
                                'columns': list(columns),
                                'steps': steps,
                                'filtered_columns': sorted(filtered_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
Example #36
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    metadata_df = metadata.to_dataframe()
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(exclude=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_numeric_categories = pre_filtered_cols - post_filtered_cols
    filtered_group_comparisons = []

    categories = metadata_df.columns
    metric_name = alpha_diversity.name

    if len(categories) == 0:
        raise ValueError('Only numeric data is present in metadata file.')

    filenames = []
    filtered_categories = []
    for category in categories:
        metadata_category = metadata.get_category(category).to_series()
        metadata_category = metadata_category[alpha_diversity.index]
        metadata_category = metadata_category.replace(r'', np.nan).dropna()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_category], axis=1,
                         join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_category.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[alpha_diversity.name]))

        if (len(groups) > 1 and len(groups) != len(data.index)):
            escaped_category = quote(category)
            filename = 'category-%s.jsonp' % escaped_category
            filenames.append(filename)

            # perform Kruskal-Wallis across all groups
            kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

            # perform pairwise Kruskal-Wallis across all pairs of groups and
            # correct for multiple comparisons
            kw_H_pairwise = []
            for i in range(len(names)):
                for j in range(i):
                    try:
                        H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                                groups[j])
                        kw_H_pairwise.append([names[j], names[i], H, p])
                    except ValueError:
                        filtered_group_comparisons.append(
                            ['%s:%s' % (category, names[i]),
                             '%s:%s' % (category, names[j])])
            kw_H_pairwise = pd.DataFrame(
                kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
            kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
            kw_H_pairwise['q-value'] = multipletests(
                kw_H_pairwise['p-value'], method='fdr_bh')[1]
            kw_H_pairwise.sort_index(inplace=True)
            pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_category
            pairwise_path = os.path.join(output_dir, pairwise_fn)
            kw_H_pairwise.to_csv(pairwise_path)

            with open(os.path.join(output_dir, filename), 'w') as fh:
                df = pd.Series(groups, index=names)

                fh.write("load_data('%s'," % category)
                df.to_json(fh, orient='split')
                fh.write(",")
                json.dump({'initial': initial_data_length,
                           'filtered': filtered_data_length}, fh)
                fh.write(",")
                json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
                fh.write(",'")
                table = kw_H_pairwise.to_html(classes="table table-striped "
                                              "table-hover")
                table = table.replace('border="1"', 'border="0"')
                fh.write(table.replace('\n', ''))
                fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))
        else:
            filtered_categories.append(category)

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        'filtered_numeric_categories': ', '.join(filtered_numeric_categories),
        'filtered_categories': ', '.join(filtered_categories),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dst'),
        os.path.join(output_dir, 'dist'))
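The pairwise Kruskal-Wallis tests plus Benjamini-Hochberg correction, isolated on toy groups:

import pandas as pd
import scipy.stats
from statsmodels.stats.multitest import multipletests

groups = [[1.0, 2.0, 1.5], [2.5, 3.0, 2.8], [0.5, 0.7, 0.6]]
names = ['g0', 'g1', 'g2']
rows = []
for i in range(len(names)):
    for j in range(i):
        H, p = scipy.stats.mstats.kruskalwallis(groups[i], groups[j])
        rows.append([names[j], names[i], H, p])
kw = pd.DataFrame(rows, columns=['Group 1', 'Group 2', 'H', 'p-value'])
kw['q-value'] = multipletests(kw['p-value'], method='fdr_bh')[1]
print(kw)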
Example #37
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))
    metadata_df = metadata.to_dataframe()
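    # coerce each column to numeric where possible; with errors='ignore',
    # columns that cannot be converted are passed through unchanged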
    metadata_df = metadata_df.apply(pd.to_numeric, errors='ignore')
    pre_filtered_cols = set(metadata_df.columns)
    metadata_df = metadata_df.select_dtypes(include=[np.number])
    post_filtered_cols = set(metadata_df.columns)
    filtered_categories = pre_filtered_cols - post_filtered_cols

    categories = metadata_df.columns

    if len(categories) == 0:
        raise ValueError('Only non-numeric data is present in metadata file.')

    filenames = []
    for category in categories:
        metadata_category = metadata_df[category]
        metadata_category = metadata_category[alpha_diversity.index]
        metadata_category = metadata_category.dropna()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_category, alpha_diversity], axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_category.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_category = quote(category)
        filename = 'category-%s.jsonp' % escaped_category
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % category)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'categories': [quote(fn) for fn in filenames],
        'filtered_categories': ', '.join(filtered_categories)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets', 'dst'),
                    os.path.join(output_dir, 'dist'))
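
A minimal usage sketch for alpha_correlation, assuming it is imported
alongside its module-level helpers (TEMPLATES, q2templates,
_alpha_correlation_fns); the sample IDs and the metadata column are
hypothetical:

import os
import tempfile

import pandas as pd
import qiime2

# hypothetical alpha diversity vector and matching numeric metadata
alpha = pd.Series([2.5, 3.1, 4.2, 3.8],
                  index=['s1', 's2', 's3', 's4'], name='shannon')
metadata = qiime2.Metadata(pd.DataFrame(
    {'days_since_start': [0.0, 7.0, 14.0, 21.0]},
    index=pd.Index(['s1', 's2', 's3', 's4'], name='sample-id')))

with tempfile.TemporaryDirectory() as out_dir:
    alpha_correlation(out_dir, alpha, metadata, method='spearman')
    print(os.listdir(out_dir))  # index.html, dist/, category-*.jsonp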
Example #39
def summarize(output_dir: str, table: biom.Table,
              sample_metadata: qiime2.Metadata=None) -> None:
    number_of_features, number_of_samples = table.shape

    sample_summary, sample_frequencies = _frequency_summary(
        table, axis='sample')
    if number_of_samples > 1:
        # Calculate the bin count, with a minimum of 5 bins
        IQR = sample_summary['3rd quartile'] - sample_summary['1st quartile']
        if IQR == 0.0:
            bins = 5
        else:
            # Freedman–Diaconis rule
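            # bin width = 2 * IQR / n^(1/3); the bin count is then the data
            # range divided by that width, floored at 5 bins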
            bin_width = (2 * IQR) / (number_of_samples ** (1/3))

            bins = max((sample_summary['Maximum frequency'] -
                        sample_summary['Minimum frequency']) / bin_width, 5)

        sample_frequencies_ax = sns.distplot(sample_frequencies, kde=False,
                                             rug=True, bins=int(round(bins)))
        sample_frequencies_ax.get_xaxis().set_major_formatter(
            matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        sample_frequencies_ax.set_xlabel('Frequency per sample')
        sample_frequencies_ax.set_ylabel('Number of samples')
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.pdf'))
        sample_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'sample-frequencies.png'))
        plt.gcf().clear()

    feature_summary, feature_frequencies = _frequency_summary(
        table, axis='observation')
    if number_of_features > 1:
        feature_frequencies_ax = sns.distplot(feature_frequencies, kde=False,
                                              rug=False)
        feature_frequencies_ax.set_xlabel('Frequency per feature')
        feature_frequencies_ax.set_ylabel('Number of features')
        feature_frequencies_ax.set_xscale('log')
        feature_frequencies_ax.set_yscale('log')
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.pdf'))
        feature_frequencies_ax.get_figure().savefig(
            os.path.join(output_dir, 'feature-frequencies.png'))

    sample_summary_table = q2templates.df_to_html(
        sample_summary.apply('{:,}'.format).to_frame('Frequency'))
    feature_summary_table = q2templates.df_to_html(
        feature_summary.apply('{:,}'.format).to_frame('Frequency'))

    index = os.path.join(TEMPLATES, 'summarize_assets', 'index.html')
    context = {
        'number_of_samples': number_of_samples,
        'number_of_features': number_of_features,
        'total_frequencies': int(np.sum(sample_frequencies)),
        'sample_summary_table': sample_summary_table,
        'feature_summary_table': feature_summary_table,
    }

    feature_qualitative_data = _compute_qualitative_summary(table)
    sample_frequencies.sort_values(inplace=True, ascending=False)
    feature_frequencies.sort_values(inplace=True, ascending=False)
    sample_frequencies.to_csv(
        os.path.join(output_dir, 'sample-frequency-detail.csv'))
    feature_frequencies.to_csv(
        os.path.join(output_dir, 'feature-frequency-detail.csv'))

    feature_frequencies = feature_frequencies.astype(int) \
        .apply('{:,}'.format).to_frame('Frequency')
    feature_frequencies['# of Samples Observed In'] = \
        pd.Series(feature_qualitative_data).astype(int).apply('{:,}'.format)
    feature_frequencies_table = q2templates.df_to_html(feature_frequencies)
    overview_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'overview.html')
    sample_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'sample-frequency-detail.html')
    feature_frequency_template = os.path.join(
        TEMPLATES, 'summarize_assets', 'feature-frequency-detail.html')

    context.update({'max_count': sample_frequencies.max(),
                    'feature_frequencies_table': feature_frequencies_table,
                    'feature_qualitative_data': feature_qualitative_data,
                    'tabs': [{'url': 'overview.html',
                              'title': 'Overview'},
                             {'url': 'sample-frequency-detail.html',
                              'title': 'Interactive Sample Detail'},
                             {'url': 'feature-frequency-detail.html',
                              'title': 'Feature Detail'}]})
    templates = [index, sample_frequency_template,
                 feature_frequency_template, overview_template]
    q2templates.render(templates, output_dir, context=context)

    shutil.copytree(os.path.join(TEMPLATES, 'summarize_assets', 'dist'),
                    os.path.join(output_dir, 'dist'))

    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
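        # data.jsonp calls app.init(<sample metadata JSON or {}>,
        # <per-sample frequencies JSON>) in the visualization's JavaScript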
        fh.write("app.init(")
        if sample_metadata:
            sample_metadata = sample_metadata.filter_ids(
                sample_frequencies.index)
            # TODO use Metadata.to_json() API if/when it exists in the future.
            sample_metadata.to_dataframe().to_json(fh)
        else:
            fh.write('{}')
        fh.write(', ')
        sample_frequencies.to_json(fh)
        fh.write(');')
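
A minimal usage sketch for summarize, again assuming the module-level
helpers (_frequency_summary, _compute_qualitative_summary, TEMPLATES) are
importable; the table contents and IDs are made up:

import os
import tempfile

import numpy as np
import biom

# a tiny 3-feature x 3-sample table with hypothetical IDs
table = biom.Table(np.array([[0., 1., 3.],
                             [1., 1., 2.],
                             [5., 0., 4.]]),
                   observation_ids=['O1', 'O2', 'O3'],
                   sample_ids=['S1', 'S2', 'S3'])

with tempfile.TemporaryDirectory() as out_dir:
    summarize(out_dir, table)  # sample_metadata is optional
    print(os.listdir(out_dir))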
Example #40
def alpha_rarefaction(output_dir: str, table: biom.Table, max_depth: int,
                      phylogeny: skbio.TreeNode=None, metrics: set=None,
                      metadata: qiime2.Metadata=None, min_depth: int=1,
                      steps: int=10, iterations: int=10) -> None:

    if metrics is None:
        metrics = {'observed_otus', 'shannon'}
        if phylogeny is not None:
            metrics.add('faith_pd')
    elif not metrics:
        raise ValueError('`metrics` was given an empty set.')
    else:
        phylo_overlap = phylogenetic_metrics() & metrics
        if phylo_overlap and phylogeny is None:
            raise ValueError('Phylogenetic metric %s was requested but '
                             'phylogeny was not provided.' % phylo_overlap)

    if max_depth <= min_depth:
        raise ValueError('Provided max_depth of %d must be greater than '
                         'provided min_depth of %d.' % (max_depth, min_depth))
    possible_steps = max_depth - min_depth
    if possible_steps < steps:
        raise ValueError('Provided number of steps (%d) is greater than the '
                         'steps possible between min_depth and '
                         'max_depth (%d).' % (steps, possible_steps))
    if table.is_empty():
        raise ValueError('Provided table is empty.')
    max_frequency = max(table.sum(axis='sample'))
    if max_frequency < max_depth:
        raise ValueError('Provided max_depth of %d is greater than '
                         'the maximum sample total frequency of the '
                         'feature_table (%d).' % (max_depth, max_frequency))
    if metadata is not None:
        metadata_ids = metadata.ids()
        table_ids = set(table.ids(axis='sample'))
        if not table_ids.issubset(metadata_ids):
            raise ValueError('Missing samples in metadata: %r' %
                             table_ids.difference(metadata_ids))

    filenames, categories, empty_columns = [], [], []
    rarefaction_data = _compute_rarefaction_data(
        table, min_depth, max_depth, steps, iterations, phylogeny, metrics)
    for m, data in rarefaction_data.items():
        metric_name = quote(m)
        filename = '%s.csv' % metric_name

        if metadata is None:
            n_df = _compute_summary(data, 'sample-id')
            jsonp_filename = '%s.jsonp' % metric_name
            _alpha_rarefaction_jsonp(output_dir, jsonp_filename, metric_name,
                                     n_df, '')
            filenames.append(jsonp_filename)
        else:
            metadata_df = metadata.to_dataframe()
            metadata_df = metadata_df.loc[data.index]

            all_columns = metadata_df.columns
            metadata_df.dropna(axis='columns', how='all', inplace=True)
            empty_columns = set(all_columns) - set(metadata_df.columns)

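            # lift the metadata columns into a one-level MultiIndex so they
            # can be joined against the (depth, iteration) MultiIndex columns
            # of the rarefaction data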
            metadata_df.columns = pd.MultiIndex.from_tuples(
                [(c, '') for c in metadata_df.columns])
            merged = data.join(metadata_df, how='left')
            categories = metadata_df.columns.get_level_values(0)
            for category in categories:
                category_name = quote(category)
                reindexed_df, counts = _reindex_with_metadata(category,
                                                              categories,
                                                              merged)
                c_df = _compute_summary(reindexed_df, category, counts=counts)
                jsonp_filename = "%s-%s.jsonp" % (metric_name, category_name)
                _alpha_rarefaction_jsonp(output_dir, jsonp_filename,
                                         metric_name, c_df, category_name)
                filenames.append(jsonp_filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            data.columns = ['depth-%d_iter-%d' % (t[0], t[1])
                            for t in data.columns.values]
            if metadata is not None:
                data = data.join(metadata.to_dataframe(), how='left')
            data.to_csv(fh, index_label=['sample-id'])

    index = os.path.join(TEMPLATES, 'alpha_rarefaction_assets', 'index.html')
    q2templates.render(index, output_dir,
                       context={'metrics': list(metrics),
                                'filenames': filenames,
                                'categories': list(categories),
                                'empty_columns': sorted(empty_columns)})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_rarefaction_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
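
Finally, a usage sketch for alpha_rarefaction under the same assumptions
(_compute_rarefaction_data and the template assets importable; the table
and depth are hypothetical). The non-phylogenetic default metrics are used,
so no tree is required:

import os
import tempfile

import numpy as np
import biom

# a random 20-feature x 6-sample table; Poisson(5) counts give sample
# totals comfortably above the chosen max_depth
rng = np.random.RandomState(42)
table = biom.Table(rng.poisson(5, size=(20, 6)),
                   observation_ids=['O%d' % i for i in range(20)],
                   sample_ids=['S%d' % i for i in range(6)])

with tempfile.TemporaryDirectory() as out_dir:
    alpha_rarefaction(out_dir, table, max_depth=50)
    print(os.listdir(out_dir))  # index.html, dist/, per-metric csv/jsonp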