Example #1
    def work(self):
        """
        Worker function for splitting the FASTQ file into smaller chunks

        Parameters
        ----------
        in_fastq_file_1 : str
            Location of the FASTQ file to split
        in_fastq_file_2 : str
            Location of the FASTQ file to split
        fastq_chunk_size : int
            Number of reads that each FASTQ chunk should contain
        """
        fqs = fastq_splitter({
            "fastq_chunk_size": self.fastq_chunk_size,
            "no-untar": True
        })
        results = fqs.paired_splitter(self.in_fastq_file_1,
                                      self.in_fastq_file_2,
                                      self.in_fastq_file_1 + ".tar.gz")

        root_name = self.in_fastq_file_1.split("/")
        with open("/".join(root_name[0:-1]) + "/tmp/fastq_file_log.txt",
                  "w") as f_out:
            for fastq_file in results:
                file_1 = "/".join(root_name[0:-1]) + "/tmp/" + fastq_file[0]
                file_2 = "/".join(root_name[0:-1]) + "/tmp/" + fastq_file[1]
                f_out.write(file_1 + "\t" + file_2 + "\n")
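The work() method above records each chunk pair as one tab-separated line in a tmp/fastq_file_log.txt file next to the input FASTQ. As a minimal sketch (not part of the original tool; the helper name and the example path are assumptions), that log can be read back into chunk pairs like this:

def read_fastq_pair_log(log_path):
    """Yield (chunk_1, chunk_2) path tuples from the tab-separated pair log."""
    # Each line written by work() above has the form "<file_1>\t<file_2>\n"
    with open(log_path, "r") as f_in:
        for line in f_in:
            file_1, file_2 = line.rstrip("\n").split("\t")
            yield (file_1, file_2)

# Hypothetical usage:
# for fq1, fq2 in read_fastq_pair_log("/data/tmp/fastq_file_log.txt"):
#     print(fq1, fq2)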
def test_paired_splitter():
    """
    Function to test paired splitter
    """
    resource_path = os.path.join(os.path.dirname(__file__), "data/")
    fastq_1file = resource_path + "bsSeeker.Mouse.SRR892982_1.fastq"
    fastq_2file = resource_path + "bsSeeker.Mouse.SRR892982_2.fastq"

    fqs_handle = fastq_splitter()
    results = fqs_handle.run(
        {
            "fastq1" : fastq_1file,
            "fastq2" : fastq_2file
        },
        {
            "fastq1": Metadata(
                "data_rnaseq", "fastq", [], None,
                {'assembly' : 'test'}),
            "fastq2": Metadata(
                "data_rnaseq", "fastq", [], None,
                {'assembly' : 'test'})
        },
        {"output" : fastq_1file + ".tar.gz"}
    )

    print("WGBS - PAIRED RESULTS:", results)

    assert os.path.isfile(results[0]["output"]) is True
    assert os.path.getsize(results[0]["output"]) > 0
def test_single_splitter():
    """
    Function to test single splitter
    """
    resource_path = os.path.join(os.path.dirname(__file__), "data/")
    fastq_2file = resource_path + "bsSeeker.Mouse.GRCm38_2.fastq"

    fqs_handle = fastq_splitter()
    results = fqs_handle.run(
        {"fastq1": fastq_2file},
        {"fastq1": Metadata(
            "data_rnaseq", "fastq", [], None,
            {'assembly' : 'test'})},
        {"output" : fastq_2file + ".tar.gz"}
    )

    print("WGBS - SINGLE RESULTS:", results)

    assert os.path.isfile(results[0]["output"]) is True
    assert os.path.getsize(results[0]["output"]) > 0
Example #4
    def run(self, input_files, input_metadata, output_files):  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
        """
        Tool for indexing the genome assembly using BS-Seeker2. In this case it
        is using Bowtie2

        Parameters
        ----------
        input_files : list
            FASTQ file
        output_files : list
            Results files.
        metadata : list

        Returns
        -------
        array : list
            Location of the filtered FASTQ file
        """

        try:
            if "bss_path" in self.configuration:
                bss_path = self.configuration["bss_path"]
            else:
                raise KeyError
            if "aligner_path" in self.configuration:
                aligner_path = self.configuration["aligner_path"]
            else:
                raise KeyError
            if "aligner" in self.configuration:
                aligner = self.configuration["aligner"]
            else:
                raise KeyError
        except KeyError:
            logger.fatal(
                "WGBS - BS SEEKER2: Unassigned configuration variables")

        genome_fasta = input_files["genome"]
        genome_idx = input_files["index"]

        sources = [input_files["genome"]]

        fqs = fastq_splitter()

        fastq1 = input_files["fastq1"]
        sources.append(input_files["fastq1"])

        fastq_file_gz = fastq1 + ".tar.gz"
        if "fastq2" in input_files:
            fastq2 = input_files["fastq2"]
            sources.append(input_files["fastq2"])
            fastq_file_list = fqs.paired_splitter(fastq1, fastq2,
                                                  fastq_file_gz)
            aln_params = self.get_aln_params(self.configuration, True)
        else:
            fastq_file_list = fqs.single_splitter(fastq1, fastq_file_gz)
            aln_params = self.get_aln_params(self.configuration)

        # Required to prevent iterating over the future objects
        fastq_file_list = compss_wait_on(fastq_file_list)
        if not fastq_file_list:
            logger.fatal("FASTQ SPLITTER: run failed")
            return {}, {}

        if not hasattr(sys, '_run_from_cmdl'):
            with compss_open(fastq_file_gz, "rb") as f_in:
                with open(fastq_file_gz, "wb") as f_out:
                    f_out.write(f_in.read())

        gz_data_path = fastq_file_gz.split("/")
        gz_data_path = "/".join(gz_data_path[:-1])

        try:
            tar = tarfile.open(fastq_file_gz)
            tar.extractall(path=gz_data_path)
            tar.close()
        except tarfile.TarError:
            logger.fatal("Split FASTQ files: Malformed tar file")
            return {}, {}

        # input and output share most metadata
        output_metadata = {}

        output_bam_file = output_files["bam"]
        output_bai_file = output_files["bai"]

        output_bam_list = []
        for fastq_file_pair in fastq_file_list:
            logger.info("TMP DIR: " + gz_data_path + "/tmp/")
            if "fastq2" in input_files:
                tmp_fq1 = gz_data_path + "/tmp/" + fastq_file_pair[0]
                tmp_fq2 = gz_data_path + "/tmp/" + fastq_file_pair[1]
                logger.info("TMP_FQ1: " + fastq_file_pair[0])
                logger.info("TMP_FQ2: " + fastq_file_pair[1])
                output_bam_file_tmp = tmp_fq1 + ".bam"
                output_bam_list.append(output_bam_file_tmp)

                self.bs_seeker_aligner(tmp_fq1, tmp_fq2, aligner, aligner_path,
                                       bss_path, aln_params, genome_fasta,
                                       genome_idx, output_bam_file_tmp)
            else:
                tmp_fq = gz_data_path + "/tmp/" + fastq_file_pair[0]
                logger.info("TMP_FQ: " + fastq_file_pair[0])
                output_bam_file_tmp = tmp_fq + ".bam"
                output_bam_list.append(output_bam_file_tmp)

                self.bs_seeker_aligner_single(tmp_fq, aligner, aligner_path,
                                              bss_path, aln_params,
                                              genome_fasta, genome_idx,
                                              output_bam_file_tmp)

        bam_handle = bamUtilsTask()

        logger.info("Merging bam files")
        bam_handle.bam_merge(output_bam_list)

        logger.info("Sorting merged bam file")
        bam_handle.bam_sort(output_bam_list[0])

        logger.info("Copying bam file into the output file")
        bam_handle.bam_copy(output_bam_list[0], output_bam_file)

        logger.info("Creating output bam index file")
        bam_handle.bam_index(output_bam_file, output_bai_file)

        output_metadata = {
            "bam":
            Metadata(data_type="data_wgbs",
                     file_type="BAM",
                     file_path=output_bam_file,
                     sources=sources,
                     taxon_id=input_metadata["genome"].taxon_id,
                     meta_data={
                         "assembly":
                         input_metadata["genome"].meta_data["assembly"],
                         "tool": "bs_seeker_aligner"
                     }),
            "bai":
            Metadata(data_type="data_wgbs",
                     file_type="BAI",
                     file_path=output_bai_file,
                     sources=[input_metadata["genome"].file_path],
                     taxon_id=input_metadata["genome"].taxon_id,
                     meta_data={
                         "assembly":
                         input_metadata["genome"].meta_data["assembly"],
                         "tool": "bs_seeker_aligner"
                     })
        }

        return (output_files, output_metadata)
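For reference, a hedged sketch of how a run() call with the interface above might be wired up. The class name bssAlignerTool, the tool paths and every file location and metadata value below are illustrative assumptions, not taken from the snippet; only the dictionary keys (genome, index, fastq1, fastq2, bam, bai) and the configuration keys (bss_path, aligner_path, aligner) mirror what the code reads and writes.

from basic_modules.metadata import Metadata  # import path assumed from these examples

# Hypothetical configuration; get_aln_params() may read further aligner options.
config = {
    "bss_path": "/opt/BSseeker2",
    "aligner_path": "/opt/bowtie2",
    "aligner": "bowtie2",
}
input_files = {
    "genome": "/data/GRCm38.fasta",
    "index": "/data/GRCm38.fasta.bt2.tar.gz",
    "fastq1": "/data/sample_1.fastq",
    "fastq2": "/data/sample_2.fastq",  # drop this key for single-ended data
}
input_metadata = {
    "genome": Metadata(
        data_type="data_genome", file_type="FASTA",
        file_path=input_files["genome"], sources=[],
        taxon_id=10090, meta_data={"assembly": "GRCm38"}),
}
output_files = {
    "bam": "/data/sample.bam",
    "bai": "/data/sample.bam.bai",
}

bss_aligner = bssAlignerTool(config)  # class name is a guess
bam_files, bam_meta = bss_aligner.run(input_files, input_metadata, output_files)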
    def run(self, input_files, input_metadata, output_files):
        """
        The main function to align bam files to a genome using BWA

        Parameters
        ----------
        input_files : dict
            File 0 is the genome file location, file 1 is the FASTQ file
        metadata : dict
        output_files : dict

        Returns
        -------
        output_files : dict
            First element is a list of output_bam_files, second element is the
            matching meta data
        output_metadata : dict
        """

        sources = [input_files["genome"]]

        fqs = fastq_splitter()

        fastq1 = input_files["loc"]
        sources.append(input_files["loc"])

        fastq_file_gz = str(fastq1 + ".tar.gz")
        if "fastq2" in input_files:
            fastq2 = input_files["fastq2"]
            sources.append(input_files["fastq2"])
            fastq_file_list = fqs.paired_splitter(fastq1, fastq2,
                                                  fastq_file_gz)
        else:
            fastq_file_list = fqs.single_splitter(fastq1, fastq_file_gz)

        # Required to prevent iterating over the future objects
        fastq_file_list = compss_wait_on(fastq_file_list)
        if not fastq_file_list:
            logger.fatal("FASTQ SPLITTER: run failed")
            return {}, {}

        if not hasattr(sys, '_run_from_cmdl'):
            logger.info("Getting the tar file")
            with compss_open(fastq_file_gz, "rb") as f_in:
                with open(fastq_file_gz, "wb") as f_out:
                    f_out.write(f_in.read())

        gz_data_path = fastq_file_gz.split("/")
        gz_data_path = "/".join(gz_data_path[:-1])

        try:
            tar = tarfile.open(fastq_file_gz)
            tar.extractall(path=gz_data_path)
            tar.close()
        except tarfile.TarError:
            logger.fatal("Split FASTQ files: Malformed tar file")
            return {}, {}

        # input and output share most metadata
        output_metadata = {}

        output_bam_file = output_files["output"]
        # output_bai_file = output_files["bai"]

        logger.info("BWA ALIGNER: Aligning sequence reads to the genome")

        output_bam_list = []
        for fastq_file_pair in fastq_file_list:
            if "fastq2" in input_files:
                tmp_fq1 = gz_data_path + "/tmp/" + fastq_file_pair[0]
                tmp_fq2 = gz_data_path + "/tmp/" + fastq_file_pair[1]
                output_bam_file_tmp = tmp_fq1 + ".bam"
                output_bam_list.append(output_bam_file_tmp)

                logger.info("BWA MEM FILES: " + tmp_fq1 + " - " + tmp_fq2)
                self.bwa_aligner_paired(
                    str(input_files["genome"]), tmp_fq1, tmp_fq2,
                    output_bam_file_tmp, str(input_files["index"]),
                    self.get_mem_params(self.configuration))
            else:
                tmp_fq = gz_data_path + "/tmp/" + fastq_file_pair[0]
                output_bam_file_tmp = tmp_fq + ".bam"
                output_bam_list.append(output_bam_file_tmp)

                logger.info("BWA MEM FILES: " + tmp_fq)
                self.bwa_aligner_single(
                    str(input_files["genome"]), tmp_fq, output_bam_file_tmp,
                    str(input_files["index"]),
                    self.get_mem_params(self.configuration))

        bam_handle = bamUtilsTask()

        logger.info("Merging bam files")
        bam_handle.bam_merge(output_bam_list)

        logger.info("Sorting merged bam file")
        bam_handle.bam_sort(output_bam_list[0])

        logger.info("Copying bam file into the output file")
        bam_handle.bam_copy(output_bam_list[0], output_bam_file)

        logger.info("BWA ALIGNER: Alignments complete")

        output_metadata = {
            "bam":
            Metadata(data_type=input_metadata['loc'].data_type,
                     file_type="BAM",
                     file_path=output_files["output"],
                     sources=[
                         input_metadata["genome"].file_path,
                         input_metadata['loc'].file_path
                     ],
                     taxon_id=input_metadata["genome"].taxon_id,
                     meta_data={
                         "assembly":
                         input_metadata["genome"].meta_data["assembly"],
                         "tool": "bwa_aligner"
                     })
        }

        return ({"bam": output_files["output"]}, output_metadata)