Пример #1
0
    def test_slanepsample_paired_end_fastq_dir_fmt_validate_negative(self):
        """Validation must fail when the staged read file is
        'not-fastq.fastq.gz'.

        The paired-end MANIFEST, metadata.yml and that single file are copied
        into the temporary directory; ``validate()`` is then expected to raise
        ``ValueError`` with a message matching 'SingleLanePerSamplePaired'.
        """
        # shutil.copy drops any sub-directory prefix, so every fixture ends
        # up directly inside the temporary directory.
        for fixture in ('paired_end_data/MANIFEST', 'metadata.yml',
                        'not-fastq.fastq.gz'):
            shutil.copy(self.get_data_path(fixture), self.temp_dir.name)

        dir_fmt = SingleLanePerSamplePairedEndFastqDirFmt(
            self.temp_dir.name, mode='r')

        with self.assertRaisesRegex(ValueError, 'SingleLanePerSamplePaired'):
            dir_fmt.validate()
Пример #2
0
    def test_slanepsample_paired_end_fastq_dir_fmt_validate_positive(self):
        """A directory with manifest, metadata and both R1/R2 files validates.

        The test passes when ``validate()`` returns without raising.
        """
        staged = (
            'paired_end_data/MANIFEST',
            'metadata.yml',
            'Human-Kneecap_S1_L001_R1_001.fastq.gz',
            'paired_end_data/Human-Kneecap_S1_L001_R2_001.fastq.gz',
        )
        target_dir = self.temp_dir
        for relative_name in staged:
            source = self.get_data_path(relative_name)
            shutil.copy(source, target_dir.name)

        paired_fmt = SingleLanePerSamplePairedEndFastqDirFmt(
            self.temp_dir.name, mode='r')

        paired_fmt.validate()
Пример #3
0
    def test_slanepsample_paired_end_fastq_dir_fmt_validate_positive(self):
        """Forward and reverse reads plus manifest/metadata pass validation.

        No assertion is needed: the test fails only if ``validate()`` raises.
        """
        # Copy each fixture into the (flat) temporary directory.
        for fixture in ('paired_end_data/MANIFEST', 'metadata.yml',
                        'Human-Kneecap_S1_L001_R1_001.fastq.gz',
                        'paired_end_data/'
                        'Human-Kneecap_S1_L001_R2_001.fastq.gz'):
            shutil.copy(self.get_data_path(fixture), self.temp_dir.name)

        dir_fmt = SingleLanePerSamplePairedEndFastqDirFmt(self.temp_dir.name,
                                                          mode='r')
        dir_fmt.validate()
Пример #4
0
    def test_slanepsample_paired_end_fastq_dir_fmt_validate_missing_pair(self):
        """Validation fails when only the R1 file is staged.

        The single-end MANIFEST, metadata.yml and one R1 FASTQ are copied into
        the temporary directory; ``validate()`` must raise ``ValidationError``
        with a message matching 'paired'.
        """
        for fixture in ('single_end_data/MANIFEST', 'metadata.yml',
                        'Human-Kneecap_S1_L001_R1_001.fastq.gz'):
            shutil.copy(self.get_data_path(fixture), self.temp_dir.name)

        dir_fmt = SingleLanePerSamplePairedEndFastqDirFmt(
            self.temp_dir.name, mode='r')

        with self.assertRaisesRegex(ValidationError, 'paired'):
            dir_fmt.validate()
Пример #5
0
    def test_slanepsample_paired_end_fastq_dir_fmt_validate_negative(self):
        """``validate()`` raises ``ValueError`` for the 'not-fastq' fixture.

        The expected error message must match 'SingleLanePerSamplePaired'.
        """
        staged = ('paired_end_data/MANIFEST', 'metadata.yml',
                  'not-fastq.fastq.gz')
        destination = self.temp_dir
        for relative_name in staged:
            source = self.get_data_path(relative_name)
            shutil.copy(source, destination.name)

        paired_fmt = SingleLanePerSamplePairedEndFastqDirFmt(
            self.temp_dir.name, mode='r')

        with self.assertRaisesRegex(ValueError, 'SingleLanePerSamplePaired'):
            paired_fmt.validate()
Пример #6
0
    def test_slanepsample_paired_end_fastq_dir_fmt_validate_missing_pair(self):
        """A directory with an R1 file but no R2 file must not validate.

        Expects ``ValidationError`` whose message matches 'paired'.
        """
        staged = (
            'single_end_data/MANIFEST',
            'metadata.yml',
            'Human-Kneecap_S1_L001_R1_001.fastq.gz',
        )
        for relative_name in staged:
            source = self.get_data_path(relative_name)
            shutil.copy(source, self.temp_dir.name)

        paired_fmt = SingleLanePerSamplePairedEndFastqDirFmt(
            self.temp_dir.name, mode='r')

        with self.assertRaisesRegex(ValidationError, 'paired'):
            paired_fmt.validate()
Пример #7
0
def test_view_artifcat_type():
    """Check ``itsxq._view_artifact_type`` on a valid and a broken fixture.

    ``_view_artifact_type`` is called without arguments here, so the test
    changes the working directory into each fixture's data directory before
    calling it.

    Raises:
        AssertionError: if the valid fixture is not reported as
            ``SampleData[PairedEndSequencesWithQuality]`` or the broken
            fixture does not raise ``ValueError``.
    """
    # Remember where we started so this test does not leak a changed working
    # directory into whatever runs after it.
    original_cwd = os.getcwd()
    try:
        testFile = os.path.join(TEST_DIR, "test_data", "paired",
                                "445cf54a-bf06-4852-8010-13a60fa1598c", "data")
        testData = SingleLanePerSamplePairedEndFastqDirFmt(testFile, "r")
        os.chdir(str(testData))
        exp1 = itsxq._view_artifact_type()
        if "SampleData[PairedEndSequencesWithQuality]" not in exp1:
            raise AssertionError(
                "unexpected artifact type: {!r}".format(exp1))

        testFile2 = os.path.join(TEST_DIR, "test_data", "pairedbroken",
                                 "50d5f31a-a761-4c04-990c-e7668fe6bf00",
                                 "data")
        testData2 = SingleLanePerSamplePairedEndFastqDirFmt(testFile2, "r")
        os.chdir(str(testData2))
        # Pass the callable itself: calling it inline would raise (or not)
        # before assert_raises ever got the chance to observe it.
        assert_raises(ValueError, itsxq._view_artifact_type)
    finally:
        os.chdir(original_cwd)
Пример #8
0
    def setUp(self):
        """Load the single-end and paired-end filter-samples fixtures.

        For each layout this stores a ``_PlotQualView`` of the directory
        format, its manifest viewed as a ``pd.DataFrame``, and three
        ``Metadata`` tables (``all``, ``subset``, ``none``).
        """
        super().setUp()

        selections = ('all', 'subset', 'none')

        single_fmt = SingleLanePerSampleSingleEndFastqDirFmt(
            self.get_data_path('filter_samples_single_end/dir_fmt'), mode='r')
        self.sample_single = _PlotQualView(single_fmt, False)
        self.manifest_single = single_fmt.manifest.view(pd.DataFrame)
        for selection in selections:
            tsv_path = self.get_data_path(
                'filter_samples_single_end/filter_%s.tsv' % selection)
            setattr(self, 'md_single_%s' % selection, Metadata.load(tsv_path))

        paired_fmt = SingleLanePerSamplePairedEndFastqDirFmt(
            self.get_data_path('filter_samples_paired_end/dir_fmt'), mode='r')
        self.sample_paired = _PlotQualView(paired_fmt, True)
        self.manifest_paired = paired_fmt.manifest.view(pd.DataFrame)
        # NOTE(review): the paired metadata is read from the *single_end*
        # fixture directory, exactly as before -- confirm this is intentional
        # (e.g. shared sample IDs) rather than a copy/paste slip.
        for selection in selections:
            tsv_path = self.get_data_path(
                'filter_samples_single_end/filter_%s.tsv' % selection)
            setattr(self, 'md_paired_%s' % selection, Metadata.load(tsv_path))
Пример #9
0
def test_fastq_id_maker():
    """Check ``itsxq._fastq_id_maker`` on the paired-end fixture.

    The first return value must contain exactly one (forward, reverse)
    filename pair for sample ``4774-1-MSITS3``; the second return value must
    be ``False``.

    Raises:
        AssertionError: if either return value differs from the expectation.
    """
    testFile = os.path.join(TEST_DIR, "test_data", "paired",
                            "445cf54a-bf06-4852-8010-13a60fa1598c", "data")
    testData = SingleLanePerSamplePairedEndFastqDirFmt(testFile, "r")
    artifactType = "SampleData[PairedEndSequencesWithQuality]"
    exp1, exp2 = itsxq._fastq_id_maker(testData, artifactType)

    expected = [
        '4774-1-MSITS3_0_L001_R1_001.fastq.gz',
        '4774-1-MSITS3_1_L001_R2_001.fastq.gz',
    ]
    # De-duplicate, then sort: bare set iteration order is arbitrary, which
    # would make the list comparison flaky as soon as there is more than one
    # pair.
    observed = []
    for sequence in sorted(set(exp1)):
        observed.append(sequence[0])
        observed.append(sequence[1])

    if observed != expected:
        raise AssertionError(
            "unexpected fastq names: {!r}".format(observed))
    if exp2 is not False:
        raise AssertionError(
            "expected second return value to be False, "
            "got {!r}".format(exp2))
Пример #10
0
 def setUp(self):
     """Open the 'demux-1' fixture read-only as the paired-end input."""
     super().setUp()
     fixture_dir = self.get_data_path('demux-1')
     self.input_seqs = SingleLanePerSamplePairedEndFastqDirFmt(
         fixture_dir, 'r')
Пример #11
0
 def setUp(self):
     """Open the 'sample_seqs_paired' fixture read-only for the tests."""
     super().setUp()
     fixture_dir = self.get_data_path('sample_seqs_paired')
     self.demux_seqs = SingleLanePerSamplePairedEndFastqDirFmt(
         fixture_dir, 'r')
Пример #12
0
def emp_paired(
    seqs: BarcodePairedSequenceFastqIterator,
    barcodes: qiime2.MetadataCategory,
    rev_comp_barcodes: bool = False,
    rev_comp_mapping_barcodes: bool = False
) -> SingleLanePerSamplePairedEndFastqDirFmt:
    """Demultiplex paired-end reads into per-sample gzipped FASTQ files.

    Each ``(barcode, forward, reverse)`` record from ``seqs`` is assigned to
    a sample by looking its barcode read up in the map built from
    ``barcodes``. Records whose barcode is not in the map are skipped.

    Args:
        seqs: Iterable yielding ``(barcode_record, forward_record,
            reverse_record)`` triples.
        barcodes: Per-sample barcodes, passed to ``_make_barcode_map``.
        rev_comp_barcodes: Reverse-complement each barcode read before the
            lookup.
        rev_comp_mapping_barcodes: Forwarded to ``_make_barcode_map``.

    Returns:
        The directory format holding one forward and one reverse FASTQ per
        sample, a manifest, and the metadata YAML.

    Raises:
        ValueError: If no record could be mapped to any sample.
    """
    result = SingleLanePerSamplePairedEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(barcodes,
                                                 rev_comp_mapping_barcodes)

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()

    # sample_id -> (forward handle, reverse handle)
    per_sample_fastqs = {}
    try:
        manifest_fh.write('sample-id,filename,direction\n')

        for barcode_record, forward_record, reverse_record in seqs:
            barcode_read = barcode_record[1]
            if rev_comp_barcodes:
                barcode_read = str(
                    skbio.DNA(barcode_read).reverse_complement())
            barcode_read = barcode_read[:barcode_len]

            try:
                sample_id = barcode_map[barcode_read]
            except KeyError:
                # TODO: this should ultimately be logged, but we don't
                # currently have support for that.
                continue

            if sample_id not in per_sample_fastqs:
                # First read for this sample: allocate its output files and
                # register them in the manifest.
                barcode_id = len(per_sample_fastqs) + 1
                fwd_path = result.sequences.path_maker(sample_id=sample_id,
                                                       barcode_id=barcode_id,
                                                       lane_number=1,
                                                       read_number=1)
                rev_path = result.sequences.path_maker(sample_id=sample_id,
                                                       barcode_id=barcode_id,
                                                       lane_number=1,
                                                       read_number=2)

                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                per_sample_fastqs[sample_id] = (
                    gzip.open(str(fwd_path), mode='a'),
                    gzip.open(str(rev_path), mode='a'))
                manifest_fh.write('%s,%s,%s\n' %
                                  (sample_id, fwd_path.name, 'forward'))
                manifest_fh.write('%s,%s,%s\n' %
                                  (sample_id, rev_path.name, 'reverse'))

            if per_sample_fastqs[sample_id][0].closed:
                # This sample's handles were closed in the meantime
                # (presumably by _maintain_open_fh_count -- verify there);
                # reopen them in append mode before writing.
                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                fwd, rev = per_sample_fastqs[sample_id]
                per_sample_fastqs[sample_id] = (
                    gzip.open(fwd.name, mode='a'),
                    gzip.open(rev.name, mode='a'))

            fwd, rev = per_sample_fastqs[sample_id]
            fwd.write(('\n'.join(forward_record) + '\n').encode('utf-8'))
            rev.write(('\n'.join(reverse_record) + '\n').encode('utf-8'))
    finally:
        # Always release file handles, including when iteration fails or no
        # read is mapped. Closing an already-closed handle is a no-op.
        for fwd, rev in per_sample_fastqs.values():
            fwd.close()
            rev.close()
        manifest_fh.close()

    if not per_sample_fastqs:
        raise ValueError('No sequences were mapped to samples. Check that '
                         'your barcodes are in the correct orientation (see '
                         'the rev_comp_barcodes and/or '
                         'rev_comp_mapping_barcodes options).')

    result.manifest.write_data(manifest, FastqManifestFormat)

    _write_metadata_yaml(result)

    return result
Пример #13
0
    SingleLanePerSampleSingleEndFastqDirFmt,
    SingleLanePerSamplePairedEndFastqDirFmt, FastqManifestFormat)

import itsxpress._itsxpress as _itsxpress

import qiime2
from qiime2.util import redirected_stdio
import pandas as pd

# Directory containing this test module; fixtures live in its test_data/.
TEST_DIR = os.path.dirname(os.path.abspath(__file__))


def _fixture_data_dir(fixture_name, artifact_uuid):
    """Return ``TEST_DIR/test_data/<fixture_name>/<artifact_uuid>/data``."""
    return os.path.join(TEST_DIR, "test_data", fixture_name, artifact_uuid,
                        "data")


# Fixture 1: "paired"
TEST_FILE = _fixture_data_dir("paired",
                              "445cf54a-bf06-4852-8010-13a60fa1598c")
TEST_DATA = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE, "r")

# Fixture 2: "pairedBrokenMissingData"
TEST_FILE_PBMD = _fixture_data_dir("pairedBrokenMissingData",
                                   "50d5f31a-a761-4c04-990c-e7668fe6bf00")
TEST_DATA_PBMD = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE_PBMD, "r")

# Fixture 3: "pairedAllForward"
TEST_FILE_PAF = _fixture_data_dir("pairedAllForward",
                                  "445cf54a-bf06-4852-8010-13a60fa1598c")
TEST_DATA_PAF = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE_PAF, "r")

# Fixture 4: "out"
TEST_FILE_OUT = _fixture_data_dir("out",
                                  "d9955749-00d5-44ae-a628-4b2da43000e1")
TEST_DATA_OUT = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE_OUT, "r")
# Test info 5
TEST_FILE_SINGLEOUT = os.path.join(TEST_DIR, "test_data", "singleOut",
Пример #14
0
def emp_paired(
    seqs: BarcodePairedSequenceFastqIterator,
    barcodes: qiime2.CategoricalMetadataColumn,
    golay_error_correction: bool = True,
    rev_comp_barcodes: bool = False,
    rev_comp_mapping_barcodes: bool = False,
    ignore_description_mismatch: bool = False
) -> (SingleLanePerSamplePairedEndFastqDirFmt, ErrorCorrectionDetailsFmt):
    """Demultiplex paired-end reads, optionally Golay-correcting barcodes.

    Each ``(barcode, forward, reverse)`` record from ``seqs`` is assigned to
    a sample by looking its barcode read up in the map built from
    ``barcodes``. Every record, matched or not, gets one row in the
    error-correction details; unmatched records are otherwise skipped.

    Args:
        seqs: Iterable yielding ``(barcode_record, forward_record,
            reverse_record)`` triples. Its ``ignore_description_mismatch``
            attribute is set from the argument of the same name.
        barcodes: Per-sample barcodes, passed to ``_make_barcode_map``.
        golay_error_correction: Decode each barcode read with
            ``GolayDecoder`` before the lookup.
        rev_comp_barcodes: Reverse-complement each barcode read first.
        rev_comp_mapping_barcodes: Forwarded to ``_make_barcode_map``.
        ignore_description_mismatch: Stored on ``seqs`` (see above).

    Returns:
        A ``(result, ec_details_fmt)`` tuple: the per-sample FASTQ directory
        format and the error-correction details format.

    Raises:
        ValueError: If no record could be mapped to any sample.
    """
    seqs.ignore_description_mismatch = ignore_description_mismatch
    result = SingleLanePerSamplePairedEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(barcodes,
                                                 rev_comp_mapping_barcodes)

    if golay_error_correction:
        decoder = GolayDecoder()

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()

    # sample_id -> (forward handle, reverse handle)
    per_sample_fastqs = {}
    try:
        manifest_fh.write('sample-id,filename,direction\n')

        ec_details_fmt = ErrorCorrectionDetailsFmt()
        ec_details = ECDetails(ec_details_fmt)

        for i, record in enumerate(seqs, start=1):
            barcode_record, forward_record, reverse_record = record
            barcode_read = barcode_record[1]
            if rev_comp_barcodes:
                barcode_read = str(
                    skbio.DNA(barcode_read).reverse_complement())
            raw_barcode_read = barcode_read[:barcode_len]

            if golay_error_correction:
                # A three bit filter is implicitly used by the decoder. See
                # Hamady and Knight 2009 Genome Research for the
                # justification:
                #
                # https://genome.cshlp.org/content/19/7/1141.full
                #
                # Specifically that "...Golay codes of 12 bases can correct
                # all triple-bit errors and detect all quadruple-bit errors."
                barcode_read, ecc_errors = decoder.decode(raw_barcode_read)
                golay_stats = [barcode_read, ecc_errors]
            else:
                barcode_read = raw_barcode_read
                golay_stats = [None, None]

            sample_id = barcode_map.get(barcode_read)

            # Kept separate from the loop variable so the input record is
            # not shadowed by the details row.
            details_row = [
                f'record-{i}',
                sample_id,
                barcode_record[0],
                raw_barcode_read,
            ]
            ec_details.write(details_row + golay_stats)

            if sample_id is None:
                continue

            if sample_id not in per_sample_fastqs:
                # First read for this sample: allocate its output files and
                # register them in the manifest.
                barcode_id = len(per_sample_fastqs) + 1
                fwd_path = result.sequences.path_maker(sample_id=sample_id,
                                                       barcode_id=barcode_id,
                                                       lane_number=1,
                                                       read_number=1)
                rev_path = result.sequences.path_maker(sample_id=sample_id,
                                                       barcode_id=barcode_id,
                                                       lane_number=1,
                                                       read_number=2)

                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                per_sample_fastqs[sample_id] = (
                    gzip.open(str(fwd_path), mode='a'),
                    gzip.open(str(rev_path), mode='a'))
                manifest_fh.write('%s,%s,%s\n' %
                                  (sample_id, fwd_path.name, 'forward'))
                manifest_fh.write('%s,%s,%s\n' %
                                  (sample_id, rev_path.name, 'reverse'))

            if per_sample_fastqs[sample_id][0].closed:
                # This sample's handles were closed in the meantime
                # (presumably by _maintain_open_fh_count -- verify there);
                # reopen them in append mode before writing.
                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                fwd, rev = per_sample_fastqs[sample_id]
                per_sample_fastqs[sample_id] = (
                    gzip.open(fwd.name, mode='a'),
                    gzip.open(rev.name, mode='a'))

            fwd, rev = per_sample_fastqs[sample_id]
            fwd.write(('\n'.join(forward_record) + '\n').encode('utf-8'))
            rev.write(('\n'.join(reverse_record) + '\n').encode('utf-8'))
    finally:
        # Always release file handles, including when iteration fails or no
        # read is mapped. Closing an already-closed handle is a no-op.
        for fwd, rev in per_sample_fastqs.values():
            fwd.close()
            rev.close()
        manifest_fh.close()

    if not per_sample_fastqs:
        raise ValueError('No sequences were mapped to samples. Check that '
                         'your barcodes are in the correct orientation (see '
                         'the rev_comp_barcodes and/or '
                         'rev_comp_mapping_barcodes options). If barcodes are '
                         'NOT Golay format set golay_error_correction '
                         'to False.')

    result.manifest.write_data(manifest, FastqManifestFormat)

    _write_metadata_yaml(result)

    return result, ec_details_fmt
Пример #15
0
def main(per_sample_sequences: _SingleLanePerSampleFastqDirFmt, threads: int,
         taxa: str, region: str, paired: bool, cluster_id: float):
    """Run ITSxpress on every sample and collect the trimmed reads.

    Args:
        per_sample_sequences (_SingleLanePerSampleFastqDirFmt): The input
            per-sample FASTQ directory.
        threads (int): The number of threads to use.
        taxa (str): The taxa prefix; it is expanded with
            ``_taxa_prefix_to_taxa`` and used to pick the HMM file.
        region (str): The region to be used for the search.
        paired (bool): If True the output is a
            ``SingleLanePerSamplePairedEndFastqDirFmt`` with forward and
            reverse reads; otherwise a
            ``SingleLanePerSampleSingleEndFastqDirFmt`` with forward reads
            only.
        cluster_id (float): The percent identity for clustering reads. A
            value close to 1 (``math.isclose`` with ``rel_tol=1e-05``)
            selects exact dereplication instead of clustering.

    Returns:
        The output directory format (paired or single end, see ``paired``)
        containing the trimmed reads, a manifest and metadata.

    Raises:
        ValueError: If the hmmsearch step fails with ModuleNotFoundError,
            FileNotFoundError or NotADirectoryError.

    """
    # Finding the artifact type.
    artifact_type = _view_artifact_type(
        per_sample_sequence=per_sample_sequences)
    # Setting the taxa
    taxa = _taxa_prefix_to_taxa(taxa)
    # Writing the manifest for the output qza
    manifest = FastqManifestFormat()
    manifest_fn = manifest.open()
    try:
        manifest_fn.write('sample-id,filename,direction\n')
        # Getting the sequences from the manifest
        sequences, single_end = _fastq_id_maker(
            per_sample_sequences=per_sample_sequences,
            artifact_type=artifact_type)
        # Creating result dir
        if paired:
            results = SingleLanePerSamplePairedEndFastqDirFmt()
        else:
            results = SingleLanePerSampleSingleEndFastqDirFmt()

        # One iteration per sample; the running index doubles as barcode id.
        for barcode, sequence in enumerate(sequences):
            # Writing fastqs and their attributes and checking the files.
            sequence_id, sobj = _set_fastqs_and_check(
                per_sample_sequences=per_sample_sequences,
                artifact_type=artifact_type,
                sequence=sequence,
                single_end=single_end,
                threads=threads)
            try:
                # Deduplicate
                if math.isclose(cluster_id, 1, rel_tol=1e-05):
                    sobj.deduplicate(threads=threads)
                else:
                    sobj.cluster(threads=threads, cluster_id=cluster_id)

                # HMMSearch for ITS regions
                hmmfile = os.path.join(ROOT_DIR, "ITSx_db", "HMMs",
                                       taxa_dict[taxa])
                try:
                    sobj._search(hmmfile=hmmfile, threads=threads)
                except (ModuleNotFoundError, FileNotFoundError,
                        NotADirectoryError) as err:
                    # Keep the original exception as the cause so the real
                    # failure is visible in the traceback.
                    raise ValueError(
                        "hmmsearch was not found, make sure HMMER3 is installed and executable"
                    ) from err

                # Parse HMMsearch output.
                its_pos = itsxpress.ItsPosition(domtable=sobj.dom_file,
                                                region=region)
                # Create deduplication object.
                dedup_obj = itsxpress.Dedup(uc_file=sobj.uc_file,
                                            rep_file=sobj.rep_file,
                                            seq_file=sobj.seq_file,
                                            fastq=sobj.r1,
                                            fastq2=sobj.fastq2)

                path_forward = results.sequences.path_maker(
                    sample_id=sequence_id,
                    barcode_id=barcode,
                    lane_number=1,
                    read_number=1)
                path_reverse = results.sequences.path_maker(
                    sample_id=sequence_id,
                    barcode_id=barcode,
                    lane_number=1,
                    read_number=2)

                manifest_fn.write("{},{},forward\n".format(
                    sequence_id, path_forward.name))
                # Create trimmed sequences.
                if paired:
                    # The reverse file is written below, so it must be listed
                    # in the manifest as well.
                    manifest_fn.write("{},{},reverse\n".format(
                        sequence_id, path_reverse.name))
                    dedup_obj.create_paired_trimmed_seqs(str(path_forward),
                                                         str(path_reverse),
                                                         gzipped=True,
                                                         itspos=its_pos)
                else:
                    dedup_obj.create_trimmed_seqs(str(path_forward),
                                                  gzipped=True,
                                                  itspos=its_pos)
            finally:
                # Deleting the temp files, also when a step above failed.
                shutil.rmtree(sobj.tempdir)
    finally:
        manifest_fn.close()
    # Writing out the results.
    _write_metadata(results=results)
    results.manifest.write_data(manifest, FastqManifestFormat)
    return results
Пример #16
0
    def setUp(self):
        """Open the 'paired-end' fixture and create an empty output format.

        ``demux_seqs`` is the fixture opened read-only; ``trimmed_seqs`` is a
        freshly constructed ``CasavaOneEightSingleLanePerSampleDirFmt``.
        """
        super().setUp()

        fixture_dir = self.get_data_path('paired-end')
        self.demux_seqs = SingleLanePerSamplePairedEndFastqDirFmt(
            fixture_dir, mode='r')
        self.trimmed_seqs = CasavaOneEightSingleLanePerSampleDirFmt()