def test_multiplex_umitools_extract_fq(extract_umis, expected_fixture, dir_tmp, multiplex_name):
    """
    Check that the ``umi_tools extract`` FASTQ file produced for a
    multiplexed input equals the expected one, using
    :py:func:`riboviz.fastq.equal_fastq`.

    The expected file is looked up under ``expected_fixture`` in a
    subdirectory named after the final component of ``dir_tmp``.

    Skipped by ``pytest`` automatically if ``multiplex_name`` fixture
    is not injected. Skipped explicitly if ``extract_umis`` is falsy.

    :param extract_umis: Configuration parameter
    :type extract_umis: bool
    :param expected_fixture: Expected data directory
    :type expected_fixture: str or unicode
    :param dir_tmp: Temporary directory
    :type dir_tmp: str or unicode
    :param multiplex_name: Multiplexed FASTQ file name prefix
    :type multiplex_name: str or unicode
    """
    if not extract_umis:
        pytest.skip('Skipped test as extract_umis: {}'.format(extract_umis))
    fq_name = workflow_files.UMI_EXTRACT_FQ_FORMAT.format(multiplex_name)
    # Only the last path component of the temporary directory is
    # mirrored inside the expected data directory.
    tmp_leaf = os.path.basename(os.path.normpath(dir_tmp))
    expected_path = os.path.join(expected_fixture, tmp_leaf, fq_name)
    actual_path = os.path.join(dir_tmp, fq_name)
    fastq.equal_fastq(expected_path, actual_path)
def test_demultiplex_gz(tmp_dir, file_format):
    """
    Run :py:func:`riboviz.demultiplex_fastq.demultiplex` on a GZIPped
    copy of the simulated multiplexed FASTQ file and compare its
    outputs with the simulated demultiplexed data.

    Each ``file_format`` is a pair: a FASTQ GZIP file name format and
    the corresponding non-GZIP FASTQ file name format.

    :param tmp_dir: Temporary directory
    :type tmp_dir: str or unicode
    :param file_format: File name formats (GZIP, non-GZIP)
    :type file_format: tuple(str or unicode, str or unicode)
    """
    gz_fmt, fmt = file_format
    sim_dir = riboviz.test.SIMDATA_DIR
    gz_input = os.path.join(tmp_dir, gz_fmt.format("test_multiplex"))
    # Build the GZIPped input from the plain simulated FASTQ file.
    with open(os.path.join(sim_dir, "multiplex.fastq"), "rb") as plain_in, \
            gzip.open(gz_input, "wb") as gz_out:
        shutil.copyfileobj(plain_in, gz_out)
    demultiplex_fastq.demultiplex(
        os.path.join(sim_dir, "multiplex_barcodes.tsv"),
        gz_input,
        mismatches=2,
        out_dir=tmp_dir)
    num_reads_actual = os.path.join(
        tmp_dir, demultiplex_fastq.NUM_READS_FILE)
    num_reads_expected = os.path.join(
        sim_dir, "deplex", demultiplex_fastq.NUM_READS_FILE)
    utils.equal_tsv(num_reads_expected, num_reads_actual)
    # Output files carry the lower-cased form of the multiplexed
    # file's extension.
    gz_fmt_lower = gz_fmt.lower()
    for tag in ("Tag0", "Tag1", "Tag2", "Unassigned"):
        gz_result = os.path.join(tmp_dir, gz_fmt_lower.format(tag))
        plain_result = os.path.join(tmp_dir, fmt.format(tag))
        # Simulated data always has a .fastq extension.
        reference = os.path.join(sim_dir, "deplex",
                                 fastq.FASTQ_FORMAT.format(tag))
        # Decompress the output so it can be compared as plain FASTQ.
        with gzip.open(gz_result, "rb") as gz_in, \
                open(plain_result, "wb") as plain_out:
            shutil.copyfileobj(gz_in, plain_out)
        fastq.equal_fastq(reference, plain_result)
    # The definition of the simulated data means that Tag3 has no
    # matches, as Tag0|1|2 will match any barcodes first. Check
    # there is no Tag3-related output file.
    tag3_path = os.path.join(tmp_dir, gz_fmt_lower.format("Tag3"))
    assert not os.path.exists(tag3_path)
def compare_files(file1, file2, compare_names=True):
    """
    Compare two files for equality, choosing the comparison by the
    extension of ``file1`` (as returned by
    :py:func:`riboviz.utils.get_file_ext`).

    Extensions are tested in this order and the first match wins:

    * ``pdf``: :py:func:`riboviz.utils.equal_file_names`
    * ``hisat2.HT2_EXT``, ``sam_bam.BAI_EXT``:
      :py:func:`riboviz.utils.equal_file_sizes`
    * ``h5.H5_EXT``: :py:func:`riboviz.h5.equal_h5`
    * ``bedgraph.BEDGRAPH_EXT``:
      :py:func:`riboviz.bedgraph.equal_bedgraph`
    * ``sam_bam.BAM_EXT``: :py:func:`riboviz.sam_bam.equal_bam`
    * ``sam_bam.SAM_EXT``: :py:func:`riboviz.sam_bam.equal_sam`
    * ``tsv``: :py:func:`riboviz.utils.equal_tsv`
    * ``fastq.FASTQ_ALL_EXTS``: :py:func:`riboviz.fastq.equal_fastq`

    :param file1: File name
    :type file1: str or unicode
    :param file2: File name
    :type file2: str or unicode
    :param compare_names: Compare file names?
    :type compare_names: bool
    :raise AssertionError: If one or other file does not exist, \
    is a directory, has an unknown extension, or their contents differ
    """
    # Preconditions: both paths exist, then neither is a directory.
    for path in (file1, file2):
        assert os.path.exists(path), "Non-existent file: %s" % path
    for path in (file1, file2):
        assert not os.path.isdir(path), "Directory: %s" % path
    if compare_names:
        utils.equal_file_names(file1, file2)
    suffix = utils.get_file_ext(file1)
    if suffix.endswith(("pdf",)):
        # PDFs are only compared by name.
        utils.equal_file_names(file1, file2)
        return
    if suffix.endswith((hisat2.HT2_EXT, sam_bam.BAI_EXT)):
        # Index files are only compared by size.
        utils.equal_file_sizes(file1, file2)
        return
    if suffix.endswith((h5.H5_EXT,)):
        h5.equal_h5(file1, file2)
        return
    if suffix.endswith((bedgraph.BEDGRAPH_EXT,)):
        bedgraph.equal_bedgraph(file1, file2)
        return
    if suffix.endswith((sam_bam.BAM_EXT,)):
        sam_bam.equal_bam(file1, file2)
        return
    if suffix.endswith((sam_bam.SAM_EXT,)):
        sam_bam.equal_sam(file1, file2)
        return
    if suffix.endswith(("tsv",)):
        utils.equal_tsv(file1, file2)
        return
    if suffix.endswith(tuple(fastq.FASTQ_ALL_EXTS)):
        fastq.equal_fastq(file1, file2)
        return
    assert False, "Unknown file type: " + suffix
def test_barcode_umi_extract(configuration_module):
    """
    Check that the FASTQ file produced by barcode and UMI extraction
    equals the simulated ``multiplex`` FASTQ file.

    :param configuration_module: temporary configuration and \
    configuration file
    :type configuration_module: tuple(dict, str or unicode)
    """
    settings, _ = configuration_module
    reference_name = fastq.FASTQ_FORMAT.format("multiplex")
    reference_path = os.path.join(riboviz.test.SIMDATA_DIR, reference_name)
    produced_name = workflow_files.UMI_EXTRACT_FQ_FORMAT.format(
        "multiplex_umi_barcode_adaptor")
    produced_path = os.path.join(settings[params.TMP_DIR], produced_name)
    fastq.equal_fastq(reference_path, produced_path)
def test_demultiplex(tmp_dir, file_format):
    """
    Run :py:func:`riboviz.demultiplex_fastq.demultiplex` on a copy of
    the simulated multiplexed FASTQ file and compare its outputs with
    the simulated demultiplexed data.

    :param tmp_dir: Temporary directory
    :type tmp_dir: str or unicode
    :param file_format: FASTQ file format
    :type file_format: str or unicode
    """
    sim_dir = riboviz.test.SIMDATA_DIR
    multiplexed_copy = os.path.join(tmp_dir,
                                    file_format.format("test_multiplex"))
    # Copy the input under a name using the file format being tested.
    shutil.copyfile(os.path.join(sim_dir, "multiplex.fastq"),
                    multiplexed_copy)
    demultiplex_fastq.demultiplex(
        os.path.join(sim_dir, "multiplex_barcodes.tsv"),
        multiplexed_copy,
        mismatches=2,
        out_dir=tmp_dir)
    num_reads_actual = os.path.join(
        tmp_dir, demultiplex_fastq.NUM_READS_FILE)
    num_reads_expected = os.path.join(
        sim_dir, "deplex", demultiplex_fastq.NUM_READS_FILE)
    utils.equal_tsv(num_reads_expected, num_reads_actual)
    # Output files carry the lower-cased form of the multiplexed
    # file's extension.
    lower_format = file_format.lower()
    for tag in ("Tag0", "Tag1", "Tag2", "Unassigned"):
        produced = os.path.join(tmp_dir, lower_format.format(tag))
        # Simulated data always has a .fastq extension.
        reference = os.path.join(sim_dir, "deplex",
                                 fastq.FASTQ_FORMAT.format(tag))
        fastq.equal_fastq(reference, produced)
    # The definition of the simulated data means that Tag3 has no
    # matches, as Tag0|1|2 will match any barcodes first. Check
    # there is no Tag3-related output file.
    tag3_path = os.path.join(tmp_dir, lower_format.format("Tag3"))
    assert not os.path.exists(tag3_path)
def test_umi_extract(configuration_module, sample_id):
    """
    Check that the FASTQ file produced by UMI extraction for a sample
    equals the simulated FASTQ file named after that sample.

    :param configuration_module: temporary configuration and \
    configuration file
    :type configuration_module: tuple(dict, str or unicode)
    :param sample_id: sample ID
    :type sample_id: str or unicode
    """
    settings, _ = configuration_module
    reference_name = fastq.FASTQ_FORMAT.format(sample_id)
    reference_path = os.path.join(riboviz.test.SIMDATA_DIR, reference_name)
    # Workflow output lives in a per-sample subdirectory of the
    # configured temporary directory.
    produced_path = os.path.join(settings[params.TMP_DIR],
                                 sample_id,
                                 workflow_files.UMI_EXTRACT_FQ)
    fastq.equal_fastq(reference_path, produced_path)
def test_adaptor_trimming(configuration_module, sample_id):
    """
    Check that the FASTQ file produced by adaptor trimming for a
    sample equals the simulated ``<sample_id>_umi`` FASTQ file.

    :param configuration_module: temporary configuration and \
    configuration file
    :type configuration_module: tuple(dict, str or unicode)
    :param sample_id: sample ID
    :type sample_id: str or unicode
    """
    settings, _ = configuration_module
    reference_name = fastq.FASTQ_FORMAT.format(sample_id + "_umi")
    reference_path = os.path.join(riboviz.test.SIMDATA_DIR, reference_name)
    # Workflow output lives in a per-sample subdirectory of the
    # configured temporary directory.
    produced_path = os.path.join(settings[params.TMP_DIR],
                                 sample_id,
                                 workflow_files.ADAPTER_TRIM_FQ)
    fastq.equal_fastq(reference_path, produced_path)
def test_multiplex_cutadapt_fq(expected_fixture, dir_tmp, multiplex_name):
    """
    Check that the ``cutadapt`` FASTQ file produced for a multiplexed
    input equals the expected one, using
    :py:func:`riboviz.fastq.equal_fastq`.

    The expected file is looked up under ``expected_fixture`` in a
    subdirectory named after the final component of ``dir_tmp``.

    Skipped by ``pytest`` automatically if ``multiplex_name`` fixture
    is not injected.

    :param expected_fixture: Expected data directory
    :type expected_fixture: str or unicode
    :param dir_tmp: Temporary directory
    :type dir_tmp: str or unicode
    :param multiplex_name: Multiplexed FASTQ file name prefix
    :type multiplex_name: str or unicode
    """
    fq_name = workflow_files.ADAPTER_TRIM_FQ_FORMAT.format(multiplex_name)
    # Only the last path component of the temporary directory is
    # mirrored inside the expected data directory.
    tmp_leaf = os.path.basename(os.path.normpath(dir_tmp))
    expected_path = os.path.join(expected_fixture, tmp_leaf, fq_name)
    actual_path = os.path.join(dir_tmp, fq_name)
    fastq.equal_fastq(expected_path, actual_path)
def compare_fq_files(expected_fixture, directory, subdirectory, file_name):
    """
    Compare a FASTQ file with its expected counterpart using
    :py:func:`riboviz.fastq.equal_fastq`.

    The actual file is ``directory/subdirectory/file_name``. The
    expected file is at the same relative location under
    ``expected_fixture``, inside a directory named after the final
    component of ``directory``.

    :param expected_fixture: Expected data directory
    :type expected_fixture: str or unicode
    :param directory: Directory
    :type directory: str or unicode
    :param subdirectory: Subdirectory
    :type subdirectory: str or unicode
    :param file_name: File name
    :type file_name: str or unicode
    """
    # normpath drops any trailing separator so basename yields the
    # last real path component.
    leaf = os.path.basename(os.path.normpath(directory))
    expected_path = os.path.join(
        expected_fixture, leaf, subdirectory, file_name)
    actual_path = os.path.join(directory, subdirectory, file_name)
    fastq.equal_fastq(expected_path, actual_path)
def test_deplex_reads(configuration_module, sample_id):
    """
    Check that a FASTQ file output by demultiplexing equals the
    simulated demultiplexed FASTQ file for the same sample.

    :param configuration_module: temporary configuration and \
    configuration file
    :type configuration_module: tuple(dict, str or unicode)
    :param sample_id: sample ID for demultiplexed reads
    :type sample_id: str or unicode
    """
    # Actual data has a .fq extension.
    produced_name = fastq.FQ_FORMAT.format(sample_id)
    # Simulated data has a .fastq extension.
    reference_name = fastq.FASTQ_FORMAT.format(sample_id)
    settings, _ = configuration_module
    deplex_dir_name = workflow_files.DEPLEX_DIR_FORMAT.format(
        "multiplex_umi_barcode_adaptor")
    produced_path = os.path.join(
        settings[params.TMP_DIR], deplex_dir_name, produced_name)
    reference_path = os.path.join(
        riboviz.test.SIMDATA_DIR, "deplex", reference_name)
    fastq.equal_fastq(reference_path, produced_path)