Example #1
    def test_combine_selections_alt_metadata(self):
        df = pd.DataFrame([[42, 88], [3, 88], [99, 88], [np.nan, 88]],
                          index=['a', 'b', 'c', 'd'],
                          columns=['value', 'time-travel-speed-mph'])
        df.index.name = 'id'
        alt_md = qiime2.Metadata(df)

        sel4 = IDSelection(self.sel3.inclusion, alt_md, 'abc')

        sel = combine_selections([self.sel1, self.sel2, sel4])

        exp_inclusion = pd.Series([True, True, True, False],
                                  index=['a', 'b', 'c', 'd'],
                                  name='inclusion')
        exp_df = pd.DataFrame([['x', 88, 42], ['y', 88, 3],
                               ['z', 88, 99], ['a', 88, np.nan]],
                              index=['a', 'b', 'c', 'd'],
                              columns=['locale', 'time-travel-speed-mph',
                                       'value'])
        exp_df.index.name = 'id'
        exp_md = qiime2.Metadata(exp_df)

        pdt.assert_series_equal(sel.inclusion, exp_inclusion)
        self.assertEqual(sel.metadata, exp_md)
        self.assertEqual(sel.label, 'combined_selections')
Example #2
def subsample_longitudinal(dates: qiime2.CategoricalMetadataColumn,
                           start_date: str = None,
                           samples_per_interval: int = 7,
                           days_per_interval: int = 7,
                           seed: int = None) -> IDSelection:

    window_size = '%dD' % days_per_interval

    dt_series = pd.to_datetime(dates.to_series(), errors='coerce')
    df = pd.DataFrame({'ids': dates.to_series().index}, index=dt_series)

    if start_date is not None:
        filter_before = pd.Timestamp(start_date)
        df = df.iloc[np.where(dt_series >= filter_before)]
        if filter_before not in df.index:
            # this will be stripped in _sample_group::_sampler
            # the purpose is to force Pandas to begin the window at this
            # time instead of the first observation (by making NaN the first
            # observation)
            df.loc[filter_before] = float('nan')

    grouped = df.groupby(pd.Grouper(freq=window_size,
                                    convention='start',
                                    closed='left'),
                         group_keys=False)
    filtered_df = grouped.apply(_sample_group(samples_per_interval, seed))

    selection = pd.Series(False, index=dates.to_series().index)
    selection[filtered_df['ids']] = True

    md = qiime2.Metadata(dates.to_dataframe())
    return IDSelection(selection, md, 'subsample_longitudinal')
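The _sample_group helper isn't part of this excerpt; below is a minimal sketch consistent with the sentinel-stripping comment above (the real implementation may differ):

def _sample_group(samples_per_interval, seed=None):
    # Returns the sampler that GroupBy.apply runs on each window. The NaN
    # sentinel row inserted at start_date is stripped here, as the comment
    # in subsample_longitudinal notes.
    def _sampler(group):
        group = group.dropna(axis=0)
        if len(group) <= samples_per_interval:
            return group
        return group.sample(samples_per_interval, random_state=seed)
    return _sampler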
Example #3
def _3(fmt: IDSelectionDirFmt) -> IDSelection:
    md = fmt.metadata.view(IDMetadataFormat).to_metadata()
    inclusion = pd.Series(False, index=md.to_dataframe().index)
    included = fmt.included.view(UNIXListFormat).to_list()
    inclusion[included] = True
    with fmt.label.view(UNIXListFormat).open() as fh:
        label = fh.read().strip()
    return IDSelection(inclusion, md, label)
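Called directly (outside the transformer registry, which normally invokes it), usage might look like the following; the directory path is hypothetical:

fmt = IDSelectionDirFmt('selection_dir/', mode='r')
sel = _3(fmt)
print(sel.label, int(sel.inclusion.sum()), 'IDs included')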
Example #4
    def test_combine_selections_inconsistent_metadata(self):
        df = pd.DataFrame([['x'], ['y'], ['w'], ['a']],
                          index=['a', 'b', 'c', 'd'],
                          columns=['locale'])
        df.index.name = 'id'
        alt_md = qiime2.Metadata(df)

        sel4 = IDSelection(self.sel3.inclusion, alt_md, 'abc')

        with self.assertRaisesRegex(ValueError, 'inconsistent metadata'):
            combine_selections([self.sel1, self.sel2, sel4])
Example #5
def sample_random(ids: qiime2.Metadata, n: int, seed: int = None) \
        -> IDSelection:
    if n > ids.id_count:
        raise ValueError("Value for n is larger than the number of IDs"
                         " present")

    df = ids.to_dataframe()
    samples = df.sample(n, replace=False, random_state=seed)
    inclusion = pd.Series(False, index=df.index)
    inclusion[samples.index] = True

    return IDSelection(inclusion, ids, "sample_random")
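A quick usage sketch with arbitrary IDs and seed (assumes qiime2 and pandas are importable; column-less Metadata is valid in QIIME 2):

import pandas as pd
import qiime2

ids = qiime2.Metadata(
    pd.DataFrame(index=pd.Index(['a', 'b', 'c', 'd'], name='id')))
sel = sample_random(ids, n=2, seed=42)
print(int(sel.inclusion.sum()))  # 2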
Example #6
def subsample_neighbors(focal_seqs: DNAFASTAFormat,
                        context_seqs: DNAFASTAFormat,
                        percent_id: float,
                        samples_per_cluster: int,
                        locale: CategoricalMetadataColumn = None,
                        max_accepts: int = 10,
                        n_threads: int = 1,
                        seed: int = None) -> IDSelection:

    if max_accepts < samples_per_cluster:
        raise ValueError('max_accepts (%d) must be greater than or equal to '
                         'samples_per_cluster (%d), since it determines '
                         'the largest number of samples that could be '
                         'obtained per cluster.' %
                         (max_accepts, samples_per_cluster))

    context_ids = ids_from_fasta(str(context_seqs))

    inclusion = pd.Series(False, index=context_ids, name='inclusion')
    if locale is not None:
        locale = locale.filter_ids(inclusion.index).to_series()
        metadata = pd.DataFrame(locale)
    else:
        metadata = pd.DataFrame(index=pd.Index(inclusion.index))
    metadata.index.name = 'id'

    with tempfile.NamedTemporaryFile() as vsearch_out_f:
        command = [
            'vsearch', '--threads',
            str(n_threads), '--usearch_global',
            str(focal_seqs), '--id',
            str(percent_id), '--db',
            str(context_seqs), '--userout', vsearch_out_f.name, '--qmask',
            'none', '--maxaccepts',
            str(max_accepts), '--uc_allhits', '--userfields',
            'query+target+mism'
        ]
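        # --userfields query+target+mism produces one tab-separated row per
        # hit: query (focal) id, target (context) id, and mismatch count;
        # read_csv below loads these as focal_id/context_id/n_mismatches.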
        run_command(command)

        vsearch_out = pd.read_csv(
            vsearch_out_f.name,
            sep='\t',
            na_values='*',
            names=['focal_id', 'context_id', 'n_mismatches'])

        clusters = _clusters_from_vsearch_out(vsearch_out, locale)
        context_seqs_to_keep = \
            _sample_clusters(clusters, samples_per_cluster, seed=seed)
        inclusion[context_seqs_to_keep] = True

    return IDSelection(inclusion, qiime2.Metadata(metadata),
                       "subsample_neighbors")
Example #7
def combine_selections(selections: IDSelection) -> IDSelection:
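    # QIIME 2 hands a List[IDSelection] input to the registered function as
    # a Python list, so `selections` supports len() and indexing despite the
    # element-typed annotation.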
    output_label = "combined_selections"
    if len(selections) == 1:
        return IDSelection(selections[0].inclusion,
                           selections[0].metadata,
                           label=output_label)

    inclusion = selections[0].inclusion
    inclusion_ids = set(inclusion.index)
    metadata = selections[0].metadata.to_dataframe()
    metadata_ids = set(metadata.index)
    for e in selections[1:]:
        if inclusion_ids != set(e.inclusion.index):
            raise ValueError("Inclusion id sets are not equal. Can't combine.")
        inclusion = inclusion.combine(e.inclusion, operator.or_)

        df = e.metadata.to_dataframe()
        if metadata_ids != set(df.index):
            raise ValueError("Metadata id sets are not equal. Can't combine.")
        metadata = metadata.combine(df, _combine_df_error_if_not_equal)

    return IDSelection(inclusion, qiime2.Metadata(metadata), output_label)
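_combine_df_error_if_not_equal isn't shown in this excerpt. Below is a sketch that satisfies the tests here (Example #1 merges disjoint columns through NaN fill; Example #4 expects a ValueError matching 'inconsistent metadata'); the real implementation may differ:

import pandas as pd

def _combine_df_error_if_not_equal(s1: pd.Series, s2: pd.Series) -> pd.Series:
    # DataFrame.combine calls this once per aligned column; a column present
    # in only one selection arrives as all-NaN on the other side. Where both
    # sides have values, they must agree.
    merged1, merged2 = s1.fillna(s2), s2.fillna(s1)
    if not merged1.equals(merged2):
        raise ValueError('Selections contain inconsistent metadata.')
    return merged1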
Example #8
    def test_error_on_non_equal_inclusion_id_sets(self):
        bad_sel1 = IDSelection(pd.Series([False, False, True],
                                         index=['a', 'b', 'c'],
                                         name='inclusion'),
                               self.md1,
                               label='something')
        with self.assertRaisesRegex(ValueError, "id sets are not equal"):
            combine_selections([self.sel1, bad_sel1])

        with self.assertRaisesRegex(ValueError, "id sets are not equal"):
            combine_selections([bad_sel1, self.sel1])

        with self.assertRaisesRegex(ValueError, "id sets are not equal"):
            combine_selections([self.sel1, self.sel2, self.sel3, bad_sel1])
Example #9
    def setUp(self):
        super().setUp()
        df1 = pd.DataFrame([['x'], ['y'], ['z'], ['a']],
                           index=['a', 'b', 'c', 'd'],
                           columns=['locale'])
        df1.index.name = 'id'
        self.md1 = qiime2.Metadata(df1)

        self.sel1 = IDSelection(pd.Series([True, False, False, False],
                                          index=['a', 'b', 'c', 'd'],
                                          name='inclusion'),
                                self.md1,
                                label='sel1')
        self.sel2 = IDSelection(pd.Series([False, True, False, False],
                                          index=['a', 'b', 'c', 'd'],
                                          name='inclusion'),
                                self.md1,
                                label='sel2')
        self.sel3 = IDSelection(pd.Series([False, False, True, False],
                                          index=['a', 'b', 'c', 'd'],
                                          name='inclusion'),
                                self.md1,
                                label='sel3')
Example #10
    def test_error_on_non_equal_metadata_id_sets(self):
        df = pd.DataFrame([['x'], ['y'], ['z']],
                          index=['a', 'b', 'c'],
                          columns=['locale'])
        df.index.name = 'id'
        bad_md1 = qiime2.Metadata(df)

        bad_sel1 = IDSelection(self.sel1.inclusion,
                               bad_md1,
                               label='something')

        with self.assertRaisesRegex(ValueError, "id sets are not equal"):
            combine_selections([self.sel1, bad_sel1])

        with self.assertRaisesRegex(ValueError, "id sets are not equal"):
            combine_selections([bad_sel1, self.sel1])

        with self.assertRaisesRegex(ValueError, "id sets are not equal"):
            combine_selections([self.sel1, self.sel2, self.sel3, bad_sel1])
Example #11
def subsample_diversity(context_seqs: DNAFASTAFormat,
                        percent_id: float,
                        max_accepts: int = 10,
                        n_threads: int = 1) -> IDSelection:

    context_ids = ids_from_fasta(str(context_seqs))
    inclusion = pd.Series(False, index=context_ids, name='inclusion')
    metadata = pd.DataFrame(index=pd.Index(inclusion.index))
    metadata.index.name = 'id'

    with tempfile.NamedTemporaryFile() as uc_out_f:
        command = [
            'vsearch',
            '--threads',
            str(n_threads),
            '--cluster_fast',
            str(context_seqs),
            '--id',
            str(percent_id),
            '--uc',
            uc_out_f.name,
            '--qmask',
            'none',
            '--maxaccepts',
            str(max_accepts),
        ]
        run_command(command)

        uc = pd.read_csv(uc_out_f.name,
                         sep='\t',
                         na_values='*',
                         names=[
                             'type', 'cluster_id', 'length', 'perc_id',
                             'strand', 'BLANK1', 'BLANK2', 'cigar', 'query',
                             'target'
                         ])

    # the S lines define the cluster centroids; their query field holds the
    # centroid sequence id
    context_seqs_to_keep = uc[uc['type'] == 'S']['query']
    inclusion[context_seqs_to_keep] = True

    return IDSelection(inclusion, qiime2.Metadata(metadata),
                       "subsample_diversity")