Example #1
    def test_tsv_builder(self):
        seqs = DNAIterator(skbio.DNA(a, metadata=b) for a, b in (
            ('A', {'id': 'seq01'}),
            ('AA', {'id': 'seq02'}),
            ('AAA', {'id': 'seq03'}),
            ('AAAA', {'id': 'seq04'}),
            ('AAAA', {'id': 'seq05'}),
            ('AAA', {'id': 'seq06'}),
            ('AA', {'id': 'seq07'}),
            ('AAAAAAAAAA', {'id': 'seq08'})))

        # Do the files exist?
        with tempfile.TemporaryDirectory() as output_dir:
            tabulate_seqs(output_dir, seqs)

            expected_stats_fp = os.path.join(
                output_dir, 'descriptive_stats.tsv')
            expected_summary_fp = os.path.join(
                output_dir, 'seven_number_summary.tsv')
            self.assertTrue(os.path.exists(expected_stats_fp))
            self.assertTrue(os.path.exists(expected_summary_fp))

            # Was data written to the files?
            with open(expected_stats_fp) as stats_tsv:
                tsv_reader = csv.reader(stats_tsv, dialect="excel-tab")
                tsv_text = []
                for row in tsv_reader:
                    tsv_text.append(row)
            self.assertEqual(['Statistic', 'Value'], tsv_text[0])
            self.assertEqual(['count', '8'], tsv_text[1])

            with open(expected_summary_fp) as summ_tsv:
                tsv_reader = csv.reader(summ_tsv, dialect="excel-tab")
                tsv_text = []
                for row in tsv_reader:
                    tsv_text.append(row)
            self.assertEqual(['Quantile', 'Value'], tsv_text[0])
            self.assertEqual(['0.02', '1.14'], tsv_text[1])

            # Does link html generate correctly?
            expected_index_fp = os.path.join(output_dir, 'index.html')
            with open(expected_index_fp) as fh:
                self.assertTrue('href="descriptive_stats.tsv"' in fh.read())

            with open(expected_index_fp) as fh:
                self.assertTrue(
                    'href="seven_number_summary.tsv"' in fh.read())

    def test_basic(self):
        seqs = DNAIterator(
            skbio.DNA(a, metadata=b) for a, b in (
                ('ACGT', {'id': 'seq1'}),
                ('AAAA', {'id': 'seq2'})))

        with tempfile.TemporaryDirectory() as output_dir:
            tabulate_seqs(output_dir, seqs)

            expected_fp = os.path.join(output_dir, 'index.html')
            self.assertTrue(os.path.exists(expected_fp))
            with open(expected_fp) as fh:
                file_text = fh.read()
                self.assertTrue('ACGT</a>' in file_text)
                self.assertTrue('<td>4</td>' in file_text)
                self.assertTrue('<td>seq2</td>' in file_text)
Example #3
def cross_validate_classifier(ref_taxa, ref_seqs, classifier_spec, obs_dir,
                              results_dir, intermediate_dir, n_jobs, log_file,
                              log_level, confidence, classifier_directory):

    classifier_spec = classifier_spec.read()

    # set up logging
    setup_logging(log_level, log_file)
    logging.info(locals())

    # load folds
    taxon_defaults_file = join(intermediate_dir, 'taxon_defaults.json')
    with open(taxon_defaults_file) as fh:
        taxon_defaults = json.load(fh)
    folds = glob.glob(join(intermediate_dir, 'fold-*'))
    logging.info('Got folds')

    # load ref_seq
    _, ref_seqs = load_references(ref_taxa, ref_seqs)
    ref_seqs = Artifact.import_data('FeatureData[Sequence]',
                                    DNAIterator(ref_seqs))

    # for each fold
    for fold in folds:
        # load new file for different folds
        weights_file = join(fold, 'weights.qza')
        training_taxa_file = join(fold, 'train_taxa.qza')

        # load the simulated test samples
        test_samples = load_simulated_samples(fold, results_dir)

        # load the training taxa and weights (loading the test seqs
        # directly is left commented out)
        weights = Artifact.load(weights_file)
        # test_seqs = Artifact.load(test_seqs_file)
        train_taxa = Artifact.load(training_taxa_file)

        # train the weighted classifier and classify the test samples
        classification = classify_samples_sklearn(test_samples, train_taxa,
                                                  ref_seqs, classifier_spec,
                                                  confidence, n_jobs, weights)
        # save the classified taxonomy artifacts
        save_observed(classifier_directory, test_samples, classification,
                      obs_dir)
        logging.info('Done ' + fold)
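
# Directory layout assumed by the loop above (an inference from the
# glob/join calls; not documented in the corpus):
#
#   intermediate_dir/
#       taxon_defaults.json
#       fold-0/
#           weights.qza
#           train_taxa.qza
#       fold-1/
#           ...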
Example #4
def extract_reads(sequences: DNAIterator,
                  f_primer: str,
                  r_primer: str,
                  trunc_len: int = 0,
                  trim_left: int = 0,
                  identity: float = 0.8,
                  min_length: int = 50,
                  max_length: int = 0) -> DNAIterator:
    """Extract the read selected by a primer or primer pair. Only sequences
    which match the primers at greater than the specified identity are returned

    Parameters
    ----------
    sequences : DNAIterator
        an aligned list of skbio.sequence.DNA query sequences
    f_primer : skbio.sequence.DNA
        forward primer sequence
    r_primer : skbio.sequence.DNA
        reverse primer sequence
    trunc_len : int, optional
        read is cut to trunc_len if trunc_len is positive. Applied before
        trim_left.
    trim_left : int, optional
        trim_left nucleotides are removed from the 5' end if trim_left is
        positive. Applied after trunc_len.
    identity : float, optional
        minimum combined primer match identity threshold. Default: 0.8
    min_length: int, optional
        Minimum amplicon length. Shorter amplicons are discarded. Default: 50
    max_length: int, optional
        Maximum amplicon length. Longer amplicons are discarded.
    Returns
    -------
    q2_types.DNAIterator
        containing the reads
    """
    reads = _gen_reads(sequences, f_primer, r_primer, trunc_len, trim_left,
                       identity, min_length, max_length)
    try:
        first_read = next(reads)
    except StopIteration:
        raise RuntimeError('No matches found') from None
    return DNAIterator(chain([first_read], reads))
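
# A minimal usage sketch for extract_reads (not part of the original
# corpus). The 515F/806R primer strings below are only illustrative,
# and `ref` stands in for a full-length reference sequence string:
#
#     import skbio
#     from q2_types.feature_data import DNAIterator
#
#     seqs = DNAIterator(iter([skbio.DNA(ref, metadata={'id': 'ref1'})]))
#     amplicons = extract_reads(seqs,
#                               f_primer='GTGYCAGCMGCCGCGGTAA',
#                               r_primer='GGACTACNVGGGTWTCTAAT',
#                               min_length=50)
#
# extract_reads raises RuntimeError('No matches found') if no sequence
# passes the combined identity threshold.
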
def classify_sklearn(reads: DNAFASTAFormat,
                     classifier: Pipeline,
                     reads_per_batch: int = 0,
                     n_jobs: int = 1,
                     pre_dispatch: str = '2*n_jobs',
                     confidence: float = 0.7,
                     read_orientation: str = 'auto') -> pd.DataFrame:
    try:
        # autotune reads per batch
        if reads_per_batch == 0:
            reads_per_batch = _autotune_reads_per_batch(reads, n_jobs)

        # transform reads to DNAIterator
        reads = DNAIterator(
            skbio.read(str(reads), format='fasta', constructor=skbio.DNA))

        reads = _autodetect_orientation(reads,
                                        classifier,
                                        read_orientation=read_orientation)
        predictions = predict(reads,
                              classifier,
                              chunk_size=reads_per_batch,
                              n_jobs=n_jobs,
                              pre_dispatch=pre_dispatch,
                              confidence=confidence)
        seq_ids, taxonomy, confidence = list(zip(*predictions))

        result = pd.DataFrame({'Taxon': taxonomy, 'Confidence': confidence},
                              index=seq_ids, columns=['Taxon', 'Confidence'])
        result.index.name = 'Feature ID'
        return result
    except MemoryError:
        raise MemoryError("The operation has run out of available memory. "
                          "To correct this error:\n"
                          "1. Reduce the reads per batch\n"
                          "2. Reduce number of n_jobs being performed\n"
                          "3. Use a more powerful machine or allocate "
                          "more resources ")
    def test_descriptive_stats_integration(self):
        seqs = DNAIterator(
            skbio.DNA(a, metadata=b) for a, b in (
                ('A', {'id': 'seq01'}),
                ('AA', {'id': 'seq02'}),
                ('AAA', {'id': 'seq03'}),
                ('AAAA', {'id': 'seq04'}),
                ('AAAA', {'id': 'seq05'}),
                ('AAA', {'id': 'seq06'}),
                ('AA', {'id': 'seq07'}),
                ('AAAAAAAAAA', {'id': 'seq08'})))

        with tempfile.TemporaryDirectory() as output_dir:
            tabulate_seqs(output_dir, seqs)

            expected_fp = os.path.join(output_dir, 'index.html')

            # Check that the expected descriptive-statistics and
            # seven-number-summary values all render in index.html; if
            # they do, tabulate_seqs likely worked as expected.
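            # For the eight sequence lengths above (1, 2, 3, 4, 4, 3,
            # 2, 10): count = 8, min = 1, max = 10, range = 9, and
            # mean = 29/8 = 3.625 (rendered as 3.62).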
            with open(expected_fp) as fh:
                file_text = fh.read()
                self.assertTrue('<td>8</td>' in file_text)
                self.assertTrue('<td>1</td>' in file_text)
                self.assertTrue('<td>10</td>' in file_text)
                self.assertTrue('<td>3.62</td>' in file_text)
                self.assertTrue('<td>9</td>' in file_text)
                self.assertTrue('<td>1</td>' in file_text)
                self.assertTrue('<td>1</td>' in file_text)
                self.assertTrue('<td>2</td>' in file_text)
                self.assertTrue('<td>3</td>' in file_text)
                self.assertTrue('<td>4</td>' in file_text)
                self.assertTrue('<td>6</td>' in file_text)
                self.assertTrue('<td>9</td>' in file_text)

def classify_sklearn(reads: DNAFASTAFormat, classifier: Pipeline,
                     reads_per_batch: int = 0, n_jobs: int = 1,
                     pre_dispatch: str = '2*n_jobs', confidence: float = 0.7,
                     read_orientation: str = None
                     ) -> pd.DataFrame:
    # autotune reads per batch
    if reads_per_batch == 0:
        reads_per_batch = _autotune_reads_per_batch(reads, n_jobs)

    # transform reads to DNAIterator
    reads = DNAIterator(
        skbio.read(str(reads), format='fasta', constructor=skbio.DNA))

    reads = _autodetect_orientation(
        reads, classifier, read_orientation=read_orientation)
    predictions = predict(reads, classifier, chunk_size=reads_per_batch,
                          n_jobs=n_jobs, pre_dispatch=pre_dispatch,
                          confidence=confidence)
    seq_ids, taxonomy, confidence = list(zip(*predictions))
    result = pd.DataFrame({'Taxon': taxonomy, 'Confidence': confidence},
                          index=seq_ids, columns=['Taxon', 'Confidence'])
    result.index.name = 'Feature ID'
    return result
Example #8
def _denoise_helper(
        demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
        trim_length: int,
        left_trim_len: int = 0,
        sample_stats: bool = False,
        reference_seqs: DNAFASTAFormat = None,
        mean_error: float = 0.005,
        indel_prob: float = 0.01,
        indel_max: int = 3,
        min_reads: int = 10,
        min_size: int = 2,
        jobs_to_start: int = 1,
        hashed_feature_ids: bool = True) -> (biom.Table,
                                             DNAIterator,
                                             pd.DataFrame):
    _check_inputs(**locals())
    df = demultiplexed_seqs.manifest.view(pd.DataFrame)
    ids_with_underscores = df.index.astype(str).str.contains('_')
    ids_with_underscores = df[ids_with_underscores].index.tolist()
    if ids_with_underscores:
        ids_with_underscores = ', '.join(ids_with_underscores)
        raise ValueError("Deblur cannot operate on sample IDs that "
                         "contain underscores. The following ID(s) "
                         "contain one or more underscores: "
                         f"{ids_with_underscores}.")
    with tempfile.TemporaryDirectory() as tmp:
        seqs_fp = str(demultiplexed_seqs)
        cmd = ['deblur', 'workflow',
               '--seqs-fp', seqs_fp,
               '--output-dir', tmp,
               '--mean-error', str(mean_error),
               '--indel-prob', str(indel_prob),
               '--indel-max', str(indel_max),
               '--trim-length', str(trim_length),
               '--left-trim-length', str(left_trim_len),
               '--min-reads', str(min_reads),
               '--min-size', str(min_size),
               '--jobs-to-start', str(jobs_to_start),
               '-w']

        if reference_seqs is not None:
            cmd.append('--pos-ref-fp')
            cmd.append(str(reference_seqs))

        if sample_stats:
            cmd.append('--keep-tmp-files')

        subprocess.run(cmd, check=True)

        # all.seqs.fa is one of the outputs from Deblur. Its utility
        # for the majority of QIIME 2 users is unclear, but it makes it
        # very easy to test whether the run completed.
        all_seqs = os.path.join(tmp, 'all.seqs.fa')
        if os.stat(all_seqs).st_size == 0:
            raise ValueError("No sequences passed the filter. It is possible "
                             "the trim_length (%d) may exceed the longest "
                             "sequence, that all of the sequences are "
                             "artifacts like PhiX or adapter, or that the "
                             "positive reference used is not representative "
                             "of the data being denoised." % trim_length)

        table = _load_table(tmp)

        if hashed_feature_ids:
            obs_map = _hash_ids(table)  # inplace operation
        else:
            obs_map = {i: i for i in table.ids(axis='observation')}

        rep_sequences = DNAIterator(
            (skbio.DNA(k, metadata={'id': v}, lowercase='ignore')
             for k, v in obs_map.items()))

        if sample_stats:
            stats = _gather_stats(demultiplexed_seqs, tmp)
        else:
            stats = pd.DataFrame([], columns=STATS_HEADER)
            stats.set_index('sample-id', inplace=True)

    return (table, rep_sequences, stats)
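
# For reference, the helper above shells out to the deblur CLI roughly
# as follows (defaults shown; <seqs> and <tmp> are placeholders):
#
#     deblur workflow --seqs-fp <seqs> --output-dir <tmp> \
#         --mean-error 0.005 --indel-prob 0.01 --indel-max 3 \
#         --trim-length <trim_length> --left-trim-length 0 \
#         --min-reads 10 --min-size 2 --jobs-to-start 1 -w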
Example #9
def _7(data: RNAFASTAFormat) -> DNAIterator:
    iterator = _read_fasta(str(data), constructor=skbio.RNA)
    generator = _rna_to_dna_iterator(iterator)
    return DNAIterator(generator)
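
# A hedged sketch of what _rna_to_dna_iterator presumably does: apply
# skbio's reverse transcription to each sequence (an assumption; the
# real helper is defined elsewhere in q2_types and may differ).
import skbio


def _rna_to_dna_iterator_sketch(rna_sequences):
    for seq in rna_sequences:
        # U -> T; sequence metadata (including 'id') is preserved
        yield seq.reverse_transcribe()

# e.g. str(next(_rna_to_dna_iterator_sketch([skbio.RNA('AUG')]))) == 'ATG'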
Example #10
def _7(data: RNAFASTAFormat) -> DNAIterator:
    converted_dna = _rna_to_dna(str(data))
    generator = _read_dna_fasta(str(converted_dna))
    return DNAIterator(generator)
Example #11
def sequence_variants_from_samples(samples: biom.Table) -> DNAIterator:
    seqs = (DNA(s, metadata={'id': s})
            for s in samples.ids(axis='observation'))
    return DNAIterator(seqs)
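
# A minimal usage sketch (not part of the original corpus). The feature
# IDs on the observation axis are the sequences themselves:
import biom
import numpy as np

_demo_table = biom.Table(np.array([[1, 0], [0, 2]]),
                         observation_ids=['ACGT', 'TTGA'],
                         sample_ids=['s1', 's2'])
for _seq in sequence_variants_from_samples(_demo_table):
    print(_seq.metadata['id'], str(_seq))  # "ACGT ACGT", then "TTGA TTGA"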
Example #12
def _denoise_helper(biom_fp, track_fp, hashed_feature_ids, retain_all_samples):
    _check_featureless_table(biom_fp)
    with open(biom_fp) as fh:
        table = biom.Table.from_tsv(fh, None, None, None)

    df = pd.read_csv(track_fp, sep='\t', index_col=0)
    df.index.name = 'sample-id'
    df = df.rename(index=_filepath_to_sample)

    PASSED_FILTER = 'percentage of input passed filter'
    NON_CHIMERIC = 'percentage of input non-chimeric'

    round_cols = {PASSED_FILTER: 2, NON_CHIMERIC: 2}

    df[PASSED_FILTER] = df['filtered'] / df['input'] * 100
    df[NON_CHIMERIC] = df['non-chimeric'] / df['input'] * 100

    col_order = [
        'input', 'filtered', PASSED_FILTER, 'denoised', 'non-chimeric',
        NON_CHIMERIC
    ]

    # only calculate percentage of input merged if paired end
    if 'merged' in df:
        MERGED = 'percentage of input merged'
        round_cols[MERGED] = 2
        df[MERGED] = df['merged'] / df['input'] * 100
        col_order.insert(4, 'merged')
        col_order.insert(5, MERGED)

    df = df[col_order]
    df.fillna(0, inplace=True)
    df = df.round(round_cols)
    metadata = qiime2.Metadata(df)

    # Currently the sample IDs in DADA2 are the file names. We make
    # them the sample id part of the filename here.
    sid_map = {
        id_: _filepath_to_sample(id_)
        for id_ in table.ids(axis='sample')
    }
    table.update_ids(sid_map, axis='sample', inplace=True)
    # Reintroduce empty samples dropped by dada2.
    table_cols = table.ids(axis='observation')
    table_rows = list(set(df.index) - set(table.ids()))
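    # note: biom.Table takes (data, observation_ids, sample_ids), so
    # despite the names, table_cols holds the observation IDs and
    # table_rows the sample IDs being reintroduced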
    table_to_add = biom.Table(np.zeros((len(table_cols), len(table_rows))),
                              table_cols,
                              table_rows,
                              type="OTU table")
    table = table.concat(table_to_add)
    # This is necessary (instead of just not reintroducing the samples
    # above) because dada2 will discard samples that are empty after
    # filtering but keep samples that are empty after merging, so some
    # samples removed here may not have been reintroduced above!
    if not retain_all_samples:
        table = table.remove_empty(axis="sample", inplace=False)
    # The feature IDs in DADA2 are the sequences themselves.
    if hashed_feature_ids:
        # Make feature IDs the md5 sums of the sequences.
        fid_map = {
            id_: hashlib.md5(id_.encode('utf-8')).hexdigest()
            for id_ in table.ids(axis='observation')
        }
        table.update_ids(fid_map, axis='observation', inplace=True)

        rep_sequences = DNAIterator(
            (skbio.DNA(k, metadata={'id': v}) for k, v in fid_map.items()))
    else:
        rep_sequences = DNAIterator((skbio.DNA(id_, metadata={'id': id_})
                                     for id_ in table.ids(axis='observation')))
    return table, rep_sequences, metadata
Example #13
def reverse_transcribe(rna_sequences: RNAIterator) -> DNAIterator:
    generator = _rna_to_dna_iterator(rna_sequences)
    return DNAIterator(generator)

def _denoise_helper(biom_fp, track_fp, hashed_feature_ids):
    _check_featureless_table(biom_fp)
    with open(biom_fp) as fh:
        table = biom.Table.from_tsv(fh, None, None, None)

    df = pd.read_csv(track_fp, sep='\t', index_col=0)
    df.index.name = 'sample-id'
    df = df.rename(index=_filepath_to_sample)

    PASSED_FILTER = 'percentage of input passed filter'
    NON_CHIMERIC = 'percentage of input non-chimeric'

    round_cols = {PASSED_FILTER: 2, NON_CHIMERIC: 2}

    df[PASSED_FILTER] = df['filtered'] / df['input'] * 100
    df[NON_CHIMERIC] = df['non-chimeric'] / df['input'] * 100

    col_order = [
        'input', 'filtered', PASSED_FILTER, 'denoised', 'non-chimeric',
        NON_CHIMERIC
    ]

    # only calculate percentage of input merged if paired end
    if 'merged' in df:
        MERGED = 'percentage of input merged'
        round_cols[MERGED] = 2
        df[MERGED] = df['merged'] / df['input'] * 100
        col_order.insert(4, 'merged')
        col_order.insert(5, MERGED)

    # only calculate percentage of input primer-removed if ccs
    if 'primer-removed' in df:
        PASSED_PRIMERREMOVE = 'percentage of input primer-removed'
        round_cols[PASSED_PRIMERREMOVE] = 2
        df[PASSED_PRIMERREMOVE] = df['primer-removed'] / df['input'] * 100
        col_order.insert(1, 'primer-removed')
        col_order.insert(2, PASSED_PRIMERREMOVE)

    df = df[col_order]
    df.fillna(0, inplace=True)
    df = df.round(round_cols)
    metadata = qiime2.Metadata(df)

    # Currently the sample IDs in DADA2 are the file names. We make
    # them the sample id part of the filename here.
    sid_map = {
        id_: _filepath_to_sample(id_)
        for id_ in table.ids(axis='sample')
    }
    table.update_ids(sid_map, axis='sample', inplace=True)
    # The feature IDs in DADA2 are the sequences themselves.
    if hashed_feature_ids:
        # Make feature IDs the md5 sums of the sequences.
        fid_map = {
            id_: hashlib.md5(id_.encode('utf-8')).hexdigest()
            for id_ in table.ids(axis='observation')
        }
        table.update_ids(fid_map, axis='observation', inplace=True)

        rep_sequences = DNAIterator(
            (skbio.DNA(k, metadata={'id': v}) for k, v in fid_map.items()))
    else:
        rep_sequences = DNAIterator((skbio.DNA(id_, metadata={'id': id_})
                                     for id_ in table.ids(axis='observation')))
    return table, rep_sequences, metadata
Example #15
def get_train_artifacts(taxonomy_samples,
                        fold,
                        taxon_defaults,
                        ref_taxa,
                        ref_seqs,
                        weights=None):

    if weights is None:
        with open(join(fold, 'sample_train.json')) as fp:
            train_samples = json.load(fp)
        train_samples = extract_sample(train_samples, taxonomy_samples)
    else:
        train_samples = weights.view(Table)
    ref_taxa, ref_seqs = load_references(ref_taxa, ref_seqs)

    with open(join(fold, 'seq_train.json')) as fp:
        train_seqs = json.load(fp)
    train_taxa = {ref_taxa[sid] for sid in train_seqs}

    hits = [0]
    direct_remaps = [0]
    indirect_remaps = [0]
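    # these single-element lists act as mutable counters that the
    # nested closure below can update in place (an alternative to
    # `nonlocal`)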

    def collapse(taxon, _):
        if taxon in train_taxa:
            hits[0] += 1
            return taxon
        if taxon_defaults[taxon][0] in train_taxa:
            direct_remaps[0] += 1
            return taxon_defaults[taxon][0]
        for try_taxon in taxon_defaults[taxon][1:]:
            if try_taxon in train_taxa:
                indirect_remaps[0] += 1
                return try_taxon

    train_samples = train_samples.collapse(collapse,
                                           axis='observation',
                                           norm=False)
    logging.info('Train taxon remaps')
    logging.info(str(hits[0]) + ' hits')
    logging.info(str(direct_remaps[0]) + ' direct remaps')
    logging.info(str(indirect_remaps[0]) + ' indirect remaps')
    train_samples = Artifact.import_data('FeatureTable[Frequency]',
                                         train_samples)

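    # build an "identity" taxonomy mapping each training taxon to
    # itself (an inference about the eye_taxonomy name; not stated in
    # the corpus)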
    train_taxa = list(train_taxa)
    eye_taxonomy = DataFrame({'Taxon': train_taxa},
                             index=train_taxa,
                             columns=['Taxon'])
    eye_taxonomy.index.name = 'Feature ID'
    eye_taxonomy = Artifact.import_data('FeatureData[Taxonomy]', eye_taxonomy)
    train_taxa = [ref_taxa[sid] for sid in train_seqs]
    train_taxonomy = DataFrame({'Taxon': train_taxa},
                               index=train_seqs,
                               columns=['Taxon'])
    train_taxonomy.index.name = 'Feature ID'
    train_taxonomy = Artifact.import_data('FeatureData[Taxonomy]',
                                          train_taxonomy)
    train_iter = DNAIterator(s for s in ref_seqs
                             if s.metadata['id'] in train_seqs)
    train_art = Artifact.import_data('FeatureData[Sequence]', train_iter)
    unobserved_weight = 1e-6 if weights is None else 0.
    weights = clawback.methods.generate_class_weights(
        train_taxonomy,
        train_art,
        train_samples,
        eye_taxonomy,
        unobserved_weight=unobserved_weight)
    ref_seqs = Artifact.import_data('FeatureData[Sequence]',
                                    DNAIterator(ref_seqs))
    return train_taxonomy, train_art, ref_seqs, weights.class_weight