Example #1
def anova(output_dir: str,
          metadata: qiime2.Metadata,
          formula: str,
          sstype: str = 'II') -> None:

    # Grab metric and covariate names from formula
    metric, group_columns = _parse_formula(formula)
    columns = [metric] + list(group_columns)

    # Validate formula (columns are in metadata, etc)
    for col in columns:
        metadata.get_column(col)
    # store categorical column names for later use
    cats = metadata.filter_columns(column_type='categorical').columns.keys()
    metadata = metadata.to_dataframe()[columns].dropna()

    # Run anova
    lm = ols(formula, metadata).fit()
    results = pd.DataFrame(sm.stats.anova_lm(lm, typ=sstype)).fillna('')
    results.to_csv(os.path.join(output_dir, 'anova.tsv'), sep='\t')

    # Run pairwise t-tests with multiple test correction
    pairwise_tests = pd.DataFrame()
    for group in group_columns:
        # only run on categorical columns — numeric columns raise error
        if group in cats:
            ttests = lm.t_test_pairwise(group, method='fdr_bh').result_frame
            pairwise_tests = pd.concat([pairwise_tests, pd.DataFrame(ttests)])
    if pairwise_tests.empty:
        pairwise_tests = False

    # Plot fit vs. residuals
    metadata['residual'] = lm.resid
    metadata['fitted_values'] = lm.fittedvalues
    res = _regplot_subplots_from_dataframe('fitted_values',
                                           'residual',
                                           metadata,
                                           group_columns,
                                           lowess=False,
                                           ci=95,
                                           palette='Set1',
                                           fit_reg=False)

    # Visualize results
    _visualize_anova(output_dir,
                     pairwise_tests=pairwise_tests,
                     model_results=results,
                     residuals=res,
                     pairwise_test_name='Pairwise t-tests')
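
# A minimal usage sketch (not part of the source): invoking the visualizer
# directly. The metadata path and the 'shannon', 'body_site', and 'subject'
# column names are hypothetical; the formula follows the
# "<metric> ~ <covariates>" pattern parsed by _parse_formula above.
import tempfile

import qiime2

md = qiime2.Metadata.load('sample-metadata.tsv')
with tempfile.TemporaryDirectory() as out_dir:
    anova(out_dir, md, formula='shannon ~ body_site * subject', sstype='II')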
Example #2
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    # Validate that the sample metadata IDs are a superset of the distance
    # matrix IDs
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))
    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            metadata.get_column(i.name())

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = [
            'run_adonis.R', dm_fp, md_fp, formula,
            str(permutations),
            str(n_jobs), results_fp
        ]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
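
# A minimal usage sketch (not part of the source): the distance matrix IDs must
# be a subset of the metadata IDs, and every term in the formula must name a
# metadata column. The file paths and the 'group'/'dose' columns are
# hypothetical.
import tempfile

import qiime2
import skbio

dm = skbio.DistanceMatrix.read('distance-matrix.tsv')
md = qiime2.Metadata.load('sample-metadata.tsv')
with tempfile.TemporaryDirectory() as out_dir:
    adonis(out_dir, dm, md, formula='group + dose', permutations=999)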
Example #3
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: int = 1) -> None:
    # Validate that the sample metadata IDs are a superset of the distance
    # matrix IDs
    metadata_ids = set(metadata.ids)
    dm_ids = distance_matrix.ids
    _validate_metadata_is_superset(metadata_ids, set(dm_ids))
    # filter ids. ids must be in same order as dm
    filtered_md = metadata.to_dataframe().reindex(dm_ids)
    filtered_md.index.name = 'sample-id'
    metadata = qiime2.Metadata(filtered_md)

    # Validate formula
    terms = ModelDesc.from_formula(formula)
    for t in terms.rhs_termlist:
        for i in t.factors:
            metadata.get_column(i.name())

    # Run adonis
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as temp_dir_name:
        dm_fp = os.path.join(temp_dir_name, 'dm.tsv')
        distance_matrix.write(dm_fp)
        md_fp = os.path.join(temp_dir_name, 'md.tsv')
        metadata.save(md_fp)
        cmd = ['run_adonis.R', dm_fp, md_fp, formula, str(permutations),
               str(n_jobs), results_fp]
        _run_command(cmd)

    # Visualize results
    results = pd.read_csv(results_fp, sep='\t')
    results = q2templates.df_to_html(results)
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results})
Example #4
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(drop_all_unique=True,
                                       drop_zero_variance=True,
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat(
            [alpha_diversity, metadata_column.to_series()],
            axis=1,
            join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(
                        groups[i], groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append([
                        '%s:%s' % (column, names[i]),
                        '%s:%s' % (column, names[j])
                    ])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(kw_H_pairwise['p-value'],
                                                 method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump(
                {
                    'initial': initial_data_length,
                    'filtered': filtered_data_length
                }, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(TEMPLATES, 'alpha_group_significance_assets',
                         'index.html')
    q2templates.render(
        index,
        output_dir,
        context={
            'columns': [quote(fn) for fn in filenames],
            'non_categorical_columns':
            ', '.join(sorted(non_categorical_columns)),
            'filtered_columns':
            ', '.join(sorted(filtered_columns)),
            'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])
        })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
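
# A minimal usage sketch (not part of the source): the alpha diversity vector
# is a pandas Series named after the metric and indexed by sample ID. The
# sample IDs and the 'body_site' column are hypothetical.
import tempfile

import pandas as pd
import qiime2

alpha = pd.Series([4.5, 3.9, 4.1, 2.8],
                  index=['S1', 'S2', 'S3', 'S4'], name='shannon')
md = qiime2.Metadata(pd.DataFrame(
    {'body_site': ['gut', 'gut', 'skin', 'skin']},
    index=pd.Index(['S1', 'S2', 'S3', 'S4'], name='id')))
with tempfile.TemporaryDirectory() as out_dir:
    alpha_group_significance(out_dir, alpha, md)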
Example #5
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_column.to_series(), alpha_diversity],
                       axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_column.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {
                'initial': alpha_diversity.shape[0],
                'method': method.title(),
                'filtered': df.shape[0]
            }

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump(
                {
                    'method': method.title(),
                    'testStat': '%1.4f' % correlation_result[0],
                    'pVal': '%1.4f' % correlation_result[1],
                    'sampleSize': df.shape[0]
                }, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index,
                       output_dir,
                       context={
                           'columns': [quote(fn) for fn in filenames],
                           'filtered_columns':
                           ', '.join(sorted(filtered_columns))
                       })

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_correlation_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
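
# A plausible sketch (assumption, not shown above) of the dispatch table the
# function indexes into: both scipy functions return a (statistic, p-value)
# pair, which matches how correlation_result[0] and correlation_result[1] are
# used.
import scipy.stats

_alpha_correlation_fns = {'spearman': scipy.stats.spearmanr,
                          'pearson': scipy.stats.pearsonr}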
Example #6
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(
        drop_all_unique=True, drop_zero_variance=True, drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append(
                        ['%s:%s' % (column, names[i]),
                         '%s:%s' % (column, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
Example #7
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_column.to_series(), alpha_diversity], axis=1,
                       join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_column.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))})

    shutil.copytree(os.path.join(TEMPLATES, 'alpha_correlation_assets',
                                 'dist'),
                    os.path.join(output_dir, 'dist'))
Example #8
def gtr_single_partition(alignment: qiime2.Metadata,
                         time: qiime2.NumericMetadataColumn,
                         n_generations: int,
                         sample_every: int,
                         time_uncertainty: qiime2.NumericMetadataColumn = None,
                         base_freq: str = "estimated",
                         site_gamma: int = 4,
                         site_invariant: bool = True,
                         clock: str = 'ucln',
                         coalescent_model: str = 'skygrid',
                         skygrid_intervals: int = None,
                         skygrid_duration: float = None,
                         print_every: int = None,
                         use_gpu: bool = False,
                         n_threads: int = 1) -> BEASTPosteriorDirFmt:

    if coalescent_model == 'skygrid':
        if skygrid_duration is None or skygrid_intervals is None:
            raise ValueError("skygrid not parameterized (TODO: better error)")

    # Parallelization options
    beast_call = ['beast']
    if use_gpu:
        if n_threads != 1:
            raise ValueError("n_threads must be 1 when use_gpu is True.")
        beast_call += ['-beagle_GPU', '-beagle_cuda', '-beagle_instances', '1']
    else:
        beast_call += [
            '-beagle_CPU', '-beagle_SSE', '-beagle_instances',
            str(n_threads)
        ]

    # Set up directory format where BEAST will write everything
    result = BEASTPosteriorDirFmt()
    control_file = str(result.control.path_maker())

    ops_file = str(result.ops.path_maker().relative_to(result.path))
    log_file = str(result.log.path_maker().relative_to(result.path))
    trees_file = str(result.trees.path_maker().relative_to(result.path))

    # Set up samples for templating into control file
    seq_series = alignment.get_column('Sequence').to_series()
    time_series = time.to_series()

    if time_uncertainty is not None:
        uncertainty_series = time_uncertainty.to_series()
    else:
        uncertainty_series = time_series.copy()
        uncertainty_series[...] = None

    samples_df = pd.concat([seq_series, time_series, uncertainty_series],
                           axis='columns',
                           join='inner')
    samples_df.index.name = 'id'
    samples_df.columns = ['seq', 'time', 'time_uncertainty']
    samples_df = samples_df.replace({pd.np.nan: None})
    samples = list(samples_df.itertuples(index=True))

    # Default print behavior
    if print_every is None:
        print_every = sample_every

    # Generate control file for BEAST
    template_kwargs = dict(trees_file=trees_file,
                           ops_file=ops_file,
                           log_file=log_file,
                           sample_every=sample_every,
                           print_every=print_every,
                           n_generations=n_generations,
                           time_unit='years',
                           samples=samples,
                           base_freq=base_freq,
                           site_gamma=site_gamma,
                           site_invariant=site_invariant,
                           clock=clock,
                           coalescent_model=coalescent_model,
                           skygrid_duration=skygrid_duration,
                           skygrid_intervals=skygrid_intervals)

    template = _get_template("gtr_single_partition.xml")
    template.stream(**template_kwargs).dump(control_file)

    beast_call += [str(control_file)]

    # Execute
    subprocess.run(beast_call, check=True, cwd=result.path)

    return result
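
# A plausible sketch (assumption, not shown above) of the _get_template helper:
# load the named Jinja2 XML template shipped alongside the module. The
# 'templates' directory name is hypothetical; .stream(**kwargs).dump(path) used
# above is the standard Jinja2 streaming API.
import os

import jinja2

def _get_template(name):
    loader = jinja2.FileSystemLoader(
        os.path.join(os.path.dirname(__file__), 'templates'))
    return jinja2.Environment(loader=loader).get_template(name)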
Example #9
def site_heterogeneous_hky(
        coding_regions: qiime2.Metadata,
        noncoding_regions: qiime2.Metadata,
        time: qiime2.NumericMetadataColumn,
        n_generations: int,
        sample_every: int,
        print_every: int = None,
        time_uncertainty: qiime2.NumericMetadataColumn = None,
        use_gpu: bool = False,
        n_threads: int = 1) -> BEASTPosteriorDirFmt:

    # Parallelization options
    beast_call = ['beast']
    if use_gpu:
        if n_threads != 1:
            raise ValueError("n_threads must be 1 when use_gpu is True.")
        beast_call += ['-beagle_GPU', '-beagle_cuda', '-beagle_instances', '1']
    else:
        beast_call += [
            '-beagle_CPU', '-beagle_SSE', '-beagle_instances',
            str(n_threads)
        ]

    # Set up directory format where BEAST will write everything
    result = BEASTPosteriorDirFmt()
    control_file = str(result.control.path_maker())

    ops_file = str(result.ops.path_maker().relative_to(result.path))
    log_file = str(result.log.path_maker().relative_to(result.path))
    trees_file = str(result.trees.path_maker().relative_to(result.path))

    # Set up samples for templating into control file
    orf_series = coding_regions.get_column('Sequence').to_series()
    nc_series = noncoding_regions.get_column('Sequence').to_series()
    time_series = time.to_series()
    if time_uncertainty is not None:
        uncertainty_series = time_uncertainty.to_series()
    else:
        # no uncertainty column provided; template an all-None column, as in
        # the single-partition action above
        uncertainty_series = time_series.copy()
        uncertainty_series[...] = None

    samples_df = pd.concat(
        [orf_series, nc_series, time_series, uncertainty_series],
        axis='columns',
        join='inner')
    samples_df.index.name = 'id'
    samples_df.columns = ['seq_orf', 'seq_nc', 'time', 'time_uncertainty']
    samples_df = samples_df.replace({pd.np.nan: None})
    samples = list(samples_df.itertuples(index=True))

    # Default print behavior
    if print_every is None:
        print_every = sample_every

    # Generate control file for BEAST
    template_kwargs = dict(trees_file=trees_file,
                           ops_file=ops_file,
                           log_file=log_file,
                           sample_every=sample_every,
                           print_every=print_every,
                           n_generations=n_generations,
                           time_unit='years',
                           samples=samples)
    template = _get_template("orf_and_nc.xml")
    template.stream(**template_kwargs).dump(control_file)

    beast_call += [str(control_file)]

    # Execute
    subprocess.run(beast_call, check=True, cwd=result.path)

    return result
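
# A minimal usage sketch (not part of the source): both region tables are
# qiime2.Metadata objects with a 'Sequence' column keyed by sample ID, and time
# is a numeric metadata column. The file paths and the 'decimal_date' column
# name are hypothetical.
import qiime2

coding = qiime2.Metadata.load('coding-regions.tsv')
noncoding = qiime2.Metadata.load('noncoding-regions.tsv')
dates = qiime2.Metadata.load('collection-dates.tsv').get_column('decimal_date')
posterior = site_heterogeneous_hky(coding, noncoding, dates,
                                   n_generations=1000000, sample_every=1000)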
Example #10
def preprocess(
    ctx,
    table,
    metadata,
    sampling_depth,
    min_frequency,
    target_variable,
    discrete,
    phylogeny=None,
    with_replacement=False,
    n_jobs=1,
):

    # Define QIIME2 methods to call
    rarefy = ctx.get_action("feature_table", "rarefy")
    filter_min_features = ctx.get_action("feature_table", "filter_features")
    filter_samples = ctx.get_action("feature_table", "filter_samples")
    beta = ctx.get_action("diversity", "beta")
    beta_phylogenetic = ctx.get_action("diversity", "beta_phylogenetic")
    filter_features = ctx.get_action("fragment-insertion", "filter_features")
    results = []

    print("Inital sizes")
    print_datasize(table, metadata)

    initial_ids_to_keep = table.view(biom.Table).ids()
    table_id_set = set(initial_ids_to_keep)
    metadata_id_set = set(metadata.ids)
    shared_ids = table_id_set.intersection(metadata_id_set)
    num_shared_ids = len(shared_ids)
    if num_shared_ids == 0:
        raise ValueError("No sample IDs are shared between Table and Metadata")
    print(
        "# of shared sample IDs between Table and Metadata: ",
        num_shared_ids, "\n"
    )

    # Filter metadata by samples in table
    print("Filtering Metadata by samples in table")
    filteredmetadata = metadata.filter_ids(ids_to_keep=shared_ids)
    print_datasize(table, filteredmetadata)

    # Filter samples from metadata where NaN in target_variable column
    # Reduce metadata to 1 column mapping of sample-id to target
    print(
        "Filtering samples from Metadata where NaN in target_variable column"
    )
    print("Reducing Metadata to 1 column mapping of sample-id to target")
    df = filteredmetadata.to_dataframe()
    clean_subset_df = clean_metadata(
        df=df, target_variable=target_variable, discrete=discrete
    )
    target_mapping = Metadata(clean_subset_df)
    print_datasize(table, target_mapping)

    # Filter features that do not exist in phylogeny
    if phylogeny:
        print("Filtering features from Table that do not exist in phylogeny")
        phylo_filtered_results = filter_features(table=table, tree=phylogeny)
        table = phylo_filtered_results.filtered_table
        print_datasize(table, target_mapping)

    # Filter low-abundance features from table
    print(
        f"Filtering low-abundance features (frequency<{min_frequency}) from Table"
    )
    (table,) = filter_min_features(
        table=table, min_frequency=min_frequency
    )
    print_datasize(table, target_mapping)

    # Rarefy Table to sampling_depth
    print(f"Rarefying Table to sampling depth of {sampling_depth}")
    (rarefied_table,) = rarefy(
        table=table,
        sampling_depth=sampling_depth,
        with_replacement=with_replacement,
    )
    print_datasize(rarefied_table, target_mapping)

    print("Filtering Rarefied Table by samples in Metadata")
    filtered_rarefied_table_results = filter_samples(
        table=rarefied_table, metadata=target_mapping
    )
    filtered_rarefied_table = filtered_rarefied_table_results.filtered_table
    print_datasize(filtered_rarefied_table, target_mapping)
    results += filtered_rarefied_table_results

    # Refilter target_mapping by samples in table
    print("Refiltering Metadata by samples in Rarefied Table")
    ids_to_keep = filtered_rarefied_table.view(biom.Table).ids()
    target_mapping = target_mapping.filter_ids(ids_to_keep=ids_to_keep)
    print_datasize(filtered_rarefied_table, target_mapping)

    # Filter Rarefied Table by samples in metadata
    print("Filtering Unrarefied Table by samples in Metadata to match Rarefied Table")
    filtered_table_results = filter_samples(
        table=table, metadata=target_mapping
    )
    print_datasize(filtered_table_results.filtered_table, target_mapping)
    results += filtered_table_results

    # Some transformations to get data into correct format for artifact
    target_mapping_col = target_mapping.get_column(target_variable)
    target_mapping_series = target_mapping_col.to_series()
    print("Reindexing Metadata to match Sample ID order of Table")
    target_mapping_series = target_mapping_series.reindex(
        index=ids_to_keep, copy=False
    )
    print("Validating Table and Metadata Sample ID agreement...")
    if list(target_mapping_series.index) != list(ids_to_keep):
        print(list(target_mapping_series.index))
        print(ids_to_keep)
        raise ValueError(
            "Table and Metadata Sample IDs do not match in contents and/or order"
        )
    target_mapping_artifact = ctx.make_artifact(
        "SampleData[Target]", target_mapping_series
    )
    results += [target_mapping_artifact]

    # Generate Distance Matrices
    print("Generating Distance Matrices...")
    for metric in ["jaccard", "braycurtis", "jensenshannon", "aitchison"]:
        beta_results = beta(
            table=filtered_rarefied_table, metric=metric, n_jobs=n_jobs
        )
        results += beta_results
    if phylogeny:
        for metric in ["unweighted_unifrac", "weighted_unifrac"]:
            beta_phylo_results = beta_phylogenetic(
                table=filtered_rarefied_table,
                phylogeny=phylogeny,
                metric=metric,
                threads=n_jobs,
            )
            results += beta_phylo_results
    else:
        # No phylogeny provided: return empty placeholder distance matrices.
        results += 2*[Artifact.import_data(
            "DistanceMatrix", skbio.DistanceMatrix(data=[])
        )]
    return tuple(results)
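
# Minimal sketches (assumptions, not shown above) of the two helpers the
# pipeline relies on: print_datasize reports the current sample counts, and
# clean_metadata drops samples with a missing target value and keeps only the
# target column.
import biom
import pandas as pd

def print_datasize(table, metadata):
    # table is a FeatureTable artifact; metadata is a qiime2 Metadata object
    print(f"Table: {len(table.view(biom.Table).ids())} samples; "
          f"Metadata: {len(metadata.ids)} samples\n")

def clean_metadata(df, target_variable, discrete):
    # keep only the target column, drop samples where the target is missing,
    # and coerce its dtype according to whether the target is discrete
    subset = df[[target_variable]].dropna()
    if discrete:
        subset[target_variable] = subset[target_variable].astype(str)
    else:
        subset[target_variable] = pd.to_numeric(subset[target_variable])
    return subset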