Example #1
    def test_nextflow_qc_using_fastq_input(self):
        '''test nextflow_qc using fastq input'''
        reads1 = os.path.join(data_dir, 'Reads', 'reads.1.1.fq.gz')
        reads2 = os.path.join(data_dir, 'Reads', 'reads.1.2.fq.gz')
        output_dir = 'tmp.test_nextflow_qc_using_fastq_input'
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'qc.nf')
        nextflow_helper.write_config_file()
        work_dir = 'tmp.nextflow_qc.work'
        dag_file = 'nextflow.qc.dag.no_db.pdf'
        try:
            os.unlink(dag_file)
        except FileNotFoundError:
            pass

        command = ' '.join([
            'nextflow run', '--reads_in1', reads1, '--reads_in2', reads2,
            '--output_dir', output_dir, '--ref_fasta',
            os.path.join(data_dir, 'Reference',
                         'ref.fa'), '-with-dag', dag_file, '-c',
            nextflow_helper.config_file, '-w', work_dir, nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        self.assertTrue(os.path.exists(output_dir))
        for method in ['fastqc', 'samtools_qc']:
            qc_dir = os.path.join(output_dir, method)
            self.assertTrue(os.path.exists(qc_dir))
            self.assertTrue(len(os.listdir(qc_dir)) >= 1)

        shutil.rmtree(output_dir)
        nextflow_helper.clean_files()
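These tests shell out through utils.syscall. As a reading aid, here is a minimal sketch of what that helper is assumed to do: run a shell command and raise if it exits non-zero. This is an assumption for illustration, not the project's actual implementation.

import subprocess

def syscall(command):
    """Run `command` through the shell; raise CalledProcessError on failure."""
    # Assumed behaviour only: the tests above rely on a failed pipeline run
    # surfacing as an exception rather than being ignored.
    subprocess.check_call(command, shell=True)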
Example #2
    def test_nextflow_variant_call_using_fastq_input(self):
        """test nextflow_variant_call using fastq input"""
        reads1 = os.path.join(data_dir, "Reads", "reads.1.1.fq.gz")
        reads2 = os.path.join(data_dir, "Reads", "reads.1.2.fq.gz")
        outdir = os.path.abspath(
            "tmp.test_nextflow_variant_call_fastq_input.out")
        tmp_data_dir = "tmp.nextflow_variant_call_fastq_input.data"
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     "variant_call.nf")
        nextflow_helper.write_config_file()
        work_dir = "tmp.nextflow_variant_call_fastq_input.work"
        sample_name = "test_sample_name"
        dag_file = "nextflow.variant_call.dag.no_db.pdf"
        try:
            os.unlink(dag_file)
        except FileNotFoundError:
            pass

        command = " ".join([
            "nextflow run",
            "--reads_in1",
            reads1,
            "--reads_in2",
            reads2,
            "--output_dir",
            outdir,
            "--ref_dir",
            os.path.join(tmp_data_dir, "Reference"),
            "--sample_name",
            sample_name,
            "--cortex_mem_height 17",
            "--gvcf",
            "--testing",
            "-with-dag",
            dag_file,
            "-c",
            nextflow_helper.config_file,
            "-w",
            work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        self._files_are_present_and_correct(outdir,
                                            sample_name,
                                            expect_rmdup_bam=True,
                                            expect_ref_check_files=False)
        self.assertTrue(
            os.path.exists(os.path.join(outdir, "minos", "gvcf.fasta")))
        self.assertTrue(
            os.path.exists(os.path.join(outdir, "minos", "gvcf.vcf")))
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)
        shutil.rmtree(tmp_data_dir)
        shutil.rmtree(outdir)
        nextflow_helper.clean_files()
    def test_nextflow_remove_contam_using_fastq_input(self):
        '''test nextflow_remove_contam using fastq input'''
        reads1 = os.path.join(data_dir, 'Reads', 'reads.1.1.fq.gz')
        reads2 = os.path.join(data_dir, 'Reads', 'reads.1.2.fq.gz')
        outprefix = 'tmp.test_nextflow_remove_contam_using_fastq_input'
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     'remove_contam.nf')
        nextflow_helper.write_config_file()
        work_dir = 'tmp.nextflow_remove_contam.work'
        dag_file = 'nextflow.remove_contam.dag.no_db.pdf'
        try:
            os.unlink(dag_file)
        except FileNotFoundError:
            pass

        command = ' '.join([
            'nextflow run', '--reads_in1', reads1, '--reads_in2', reads2,
            '--outprefix', outprefix, '--ref_metadata_tsv',
            os.path.join(data_dir, 'Reference',
                         'remove_contam_metadata.tsv'), '--ref_fasta',
            os.path.join(data_dir, 'Reference',
                         'ref.fa'), '--testing', '-with-dag', dag_file, '-c',
            nextflow_helper.config_file, '-w', work_dir, nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        for reads_type in ('contam', 'remove_contam'):
            for i in ('1', '2'):
                filename = outprefix + '.' + reads_type + '.' + i + '.fq.gz'
                self.assertTrue(os.path.exists(filename))
                os.unlink(filename)

        expected_counts_lines = [
            'Name\tIs_contam\tReads\n',
            'contam\t1\t40\n',
            'ref\t0\t132\n',
            'Unmapped\t0\t26\n',
            'Reads_kept_after_remove_contam\t0\t158\n',
        ]

        counts_tsv = outprefix + '.counts.tsv'
        with open(counts_tsv) as f:
            got_counts_lines = f.readlines()
        self.assertEqual(expected_counts_lines, got_counts_lines)
        os.unlink(counts_tsv)

        nextflow_helper.clean_files()
Example #4
    def test_nextflow_qc_using_fastq_input(self):
        """test nextflow_qc using fastq input"""
        reads1 = os.path.join(data_dir, "Reads", "reads.1.1.fq.gz")
        reads2 = os.path.join(data_dir, "Reads", "reads.1.2.fq.gz")
        output_dir = "tmp.test_nextflow_qc_using_fastq_input"
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "qc.nf")
        nextflow_helper.write_config_file()
        work_dir = "tmp.nextflow_qc.work"
        dag_file = "nextflow.qc.dag.no_db.pdf"
        try:
            os.unlink(dag_file)
        except FileNotFoundError:
            pass

        command = " ".join([
            "nextflow run",
            "--reads_in1",
            reads1,
            "--reads_in2",
            reads2,
            "--output_dir",
            output_dir,
            "--ref_fasta",
            os.path.join(data_dir, "Reference", "ref.fa"),
            "-with-dag",
            dag_file,
            "-c",
            nextflow_helper.config_file,
            "-w",
            work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        self.assertTrue(os.path.exists(output_dir))
        for method in ["fastqc", "samtools_qc"]:
            qc_dir = os.path.join(output_dir, method)
            self.assertTrue(os.path.exists(qc_dir))
            self.assertTrue(len(os.listdir(qc_dir)) >= 1)

        shutil.rmtree(output_dir)
        nextflow_helper.clean_files()
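The try/unlink/except block used above to clear a stale DAG file recurs in nearly every test. A hedged alternative, sketched with a hypothetical helper name, is to use contextlib.suppress from the standard library:

import contextlib
import os

def remove_if_present(path):
    """Delete path if it exists, silently ignoring a missing file."""
    # Hypothetical helper; equivalent to the try/os.unlink/except pattern above.
    with contextlib.suppress(FileNotFoundError):
        os.unlink(path)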
Example #5
    def test_nextflow_variant_call_using_fastq_input(self):
        '''test nextflow_variant_call using fastq input'''
        reads1 = os.path.join(data_dir, 'Reads', 'reads.1.1.fq.gz')
        reads2 = os.path.join(data_dir, 'Reads', 'reads.1.2.fq.gz')
        outdir = os.path.abspath(
            'tmp.test_nextflow_variant_call_fastq_input.out')
        tmp_data_dir = 'tmp.nextflow_variant_call_fastq_input.data'
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     'variant_call.nf')
        nextflow_helper.write_config_file()
        work_dir = 'tmp.nextflow_variant_call_fastq_input.work'
        sample_name = 'test_sample_name'
        dag_file = 'nextflow.variant_call.dag.no_db.pdf'
        try:
            os.unlink(dag_file)
        except FileNotFoundError:
            pass

        command = ' '.join([
            'nextflow run', '--reads_in1', reads1, '--reads_in2', reads2,
            '--output_dir', outdir, '--ref_dir',
            os.path.join(tmp_data_dir,
                         'Reference'), '--sample_name', sample_name,
            '--cortex_mem_height 17', '--testing', '-with-dag', dag_file, '-c',
            nextflow_helper.config_file, '-w', work_dir, nextflow_file
        ])
        utils.syscall(command)
        self._files_are_present_and_correct(outdir,
                                            sample_name,
                                            expect_rmdup_bam=True,
                                            expect_ref_check_files=False)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)
        shutil.rmtree(tmp_data_dir)
        shutil.rmtree(outdir)
        nextflow_helper.clean_files()
    def test_nextflow_fake_remove_contam(self):
        """test nextflow_fake_remove_contam"""
        tmp_data_dir = "tmp.nextflow_fake_remove_contam"
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, "db.cnf")
        mysql_dump = os.path.join(data_dir, "mysql.dump")
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall("mysql --defaults-file=" + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"] +
                      "; CREATE DATABASE " + db_config_data["db"] + '"')
        utils.syscall("mysql --defaults-file=" + mysql_config_file + " " +
                      db_config_data["db"] + " < " + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
        references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     "fake_remove_contam.nf")
        work_dir = "tmp.nextflow_fake_remove_contam.work"
        dag_file = "nextflow.fake_remove_contam.dag.db.pdf"
        try:
            os.unlink(dag_file)
        except FileNotFoundError:
            pass

        command = " ".join([
            "nextflow run",
            "--dataset_name g1",  # one read pair has group g2, so should get ignored
            "--pipeline_root",
            os.path.abspath(pipeline_root),
            "--db_config_file",
            db_ini_file,
            "-with-dag",
            dag_file,
            "-c",
            nextflow_helper.config_file,
            "-w",
            work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table("Pipeline")
        got_rows.sort(key=itemgetter("seqrep_id"))
        expected_rows = [
            {
                "isolate_id": 1,
                "seqrep_id": 1,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 0,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 2,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 0,
            },
            {
                "isolate_id": 3,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "remove_contam",
                "status": -1,
                "reference_id": 0,
            },
        ]
        self.assertEqual(expected_rows, got_rows)

        # check database Read_counts table updated
        got_rows = database.get_rows_from_table("Read_counts")
        got_rows.sort(key=itemgetter("seqrep_id"))
        expected_rows = [
            {
                "seqrep_id": 1,
                "original_total": 12,
                "contamination": 0,
                "not_contamination": 12,
                "unmapped": 0,
                "total_after_remove_contam": 12,
            },
            {
                "seqrep_id": 2,
                "original_total": 26,
                "contamination": 0,
                "not_contamination": 26,
                "unmapped": 0,
                "total_after_remove_contam": 26,
            },
        ]

        self.assertEqual(expected_rows, got_rows)

        # check FASTQ files got written. No need to check contents, as that is done
        # elsewhere. We're just checking nextflow runs OK here.
        ids = [
            {
                "sample": 1,
                "isolate_id": 1,
                "seq_repl": 1
            },
            {
                "sample": 2,
                "isolate_id": 2,
                "seq_repl": 1
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"],
                                             id_dict["isolate_id"])
            for read_type in ("original", "remove_contam"):
                for i in (1, 2):
                    self.assertTrue(
                        os.path.exists(
                            iso_dir.reads_filename(read_type,
                                                   id_dict["seq_repl"], i)))

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
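Each database-backed test resets the test MySQL database with the same pair of mysql calls: drop and recreate the schema, then load the dump. A possible consolidation is sketched below; the helper name is hypothetical, and the commands simply mirror the ones already used in the tests:

def reset_test_database(mysql_config_file, mysql_dump, db_name):
    """Drop and recreate db_name, then load mysql_dump into it."""
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_name
        + "; CREATE DATABASE " + db_name + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + " " + db_name + " < " + mysql_dump
    )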
Example #7
    def test_nextflow_mykrobe_predict(self):
        """test nextflow_mykrobe using database"""
        tmp_data_dir = "tmp.nextflow_mykrobe_db_input.data"
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, "db.cnf")
        mysql_dump = os.path.join(data_dir, "mysql.dump")
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall("mysql --defaults-file=" + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"] +
                      "; CREATE DATABASE " + db_config_data["db"] + '"')
        utils.syscall("mysql --defaults-file=" + mysql_config_file + " " +
                      db_config_data["db"] + " < " + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
        references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     "mykrobe_predict.nf")
        work_dir = "tmp.nextflow_mykrobe_db_input.work"
        dag_file = "nextflow.mykrobe.dag.db.pdf"
        try:
            os.unlink(dag_file)
        except FileNotFoundError:
            pass

        command = " ".join([
            "nextflow run",
            "--dataset_name g1",  # one read pair is from group 2 and should get ignored
            "--ref_id 2",
            "--references_root",
            os.path.abspath(references_root),
            "--pipeline_root",
            pipeline_root,
            "--db_config_file",
            db_ini_file,
            "--testing",
            "-with-dag",
            dag_file,
            "-c",
            nextflow_helper.config_file,
            "-w",
            work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected.
        # The --testing option is set up so that the pooled
        # sample fails, hence it gets a status of -1.
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table("Pipeline")
        got_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
        expected_rows = [
            {
                "isolate_id": 1,
                "seqrep_id": None,
                "seqrep_pool": "1_2",
                "version": clockwork_version,
                "pipeline_name": "mykrobe_predict",
                "status": -1,
                "reference_id": 2,
            },
            {
                "isolate_id": 1,
                "seqrep_id": 1,
                "seqrep_pool": None,
                "version": "0.4.0",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 1,
                "seqrep_id": 2,
                "seqrep_pool": None,
                "version": "0.4.0",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "mykrobe_predict",
                "status": 1,
                "reference_id": 2,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": "0.4.0",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": "0.4.0",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "mykrobe_predict",
                "status": 1,
                "reference_id": 2,
            },
            {
                "isolate_id": 3,
                "seqrep_id": None,
                "seqrep_pool": "1",
                "version": clockwork_version,
                "pipeline_name": "mykrobe_predict",
                "status": 1,
                "reference_id": 2,
            },
            {
                "isolate_id": 3,
                "seqrep_id": 5,
                "seqrep_pool": None,
                "version": "0.4.0",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 4,
                "seqrep_id": 6,
                "seqrep_pool": None,
                "version": "0.4.0",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
        ]
        expected_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
        self.assertEqual(expected_rows, got_rows)

        # check mykrobe output files etc got written. No need to check contents, trust the tools
        # We're just checking nextflow runs OK here.
        ids = [
            {
                "sample": 1,
                "seqrep_id": "1_2",
                "isolate_id": 1,
                "seq_repl": "1_2",
                "sample_name":
                "site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2",
            },
            {
                "sample": 2,
                "seqrep_id": 3,
                "isolate_id": 2,
                "seq_repl": "1",
                "sample_name":
                "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1",
            },
            {
                "sample": 2,
                "seqrep_id": 4,
                "isolate_id": 2,
                "seq_repl": "2",
                "sample_name":
                "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2",
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"],
                                             id_dict["isolate_id"])
            pipeline_dir = iso_dir.pipeline_dir(
                id_dict["seq_repl"],
                "mykrobe_predict",
                clockwork_version,
                reference_id=2,
            )
            self.assertTrue(os.path.exists(pipeline_dir))
            log = os.path.join(pipeline_dir, "log.txt")
            json_file = os.path.join(pipeline_dir, "out.json")

            if id_dict["sample_name"].endswith("1_2"):
                self.assertFalse(os.path.exists(log))
                self.assertFalse(os.path.exists(json_file))
            else:
                self.assertTrue(os.path.exists(log))
                self.assertTrue(os.path.exists(json_file))

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
Example #8
    def test_nextflow_remove_contam_using_fastq_input(self):
        """test nextflow_remove_contam using fastq input"""
        reads1 = os.path.join(data_dir, "Reads", "reads.1.1.fq.gz")
        reads2 = os.path.join(data_dir, "Reads", "reads.1.2.fq.gz")
        outprefix = "tmp.test_nextflow_remove_contam_using_fastq_input"
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     "remove_contam.nf")
        nextflow_helper.write_config_file()
        work_dir = "tmp.nextflow_remove_contam.work"
        dag_file = "nextflow.remove_contam.dag.no_db.pdf"
        try:
            os.unlink(dag_file)
        except FileNotFoundError:
            pass

        command = " ".join([
            "nextflow run",
            "--reads_in1",
            reads1,
            "--reads_in2",
            reads2,
            "--outprefix",
            outprefix,
            "--ref_metadata_tsv",
            os.path.join(data_dir, "Reference", "remove_contam_metadata.tsv"),
            "--ref_fasta",
            os.path.join(data_dir, "Reference", "ref.fa"),
            "--testing",
            "-with-dag",
            dag_file,
            "-c",
            nextflow_helper.config_file,
            "-w",
            work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        for reads_type in ("contam", "remove_contam"):
            for i in ("1", "2"):
                filename = outprefix + "." + reads_type + "." + i + ".fq.gz"
                self.assertTrue(os.path.exists(filename))
                os.unlink(filename)

        expected_counts_lines = [
            "Name\tIs_contam\tReads\n",
            "contam\t1\t40\n",
            "ref\t0\t132\n",
            "Unmapped\t0\t26\n",
            "Reads_kept_after_remove_contam\t0\t158\n",
        ]

        counts_tsv = outprefix + ".counts.tsv"
        with open(counts_tsv) as f:
            got_counts_lines = f.readlines()
        self.assertEqual(expected_counts_lines, got_counts_lines)
        os.unlink(counts_tsv)

        nextflow_helper.clean_files()
    def test_nextflow_assemble(self):
        '''test nextflow_assemble'''
        nextflow_helper.write_config_file()
        input_dir = 'tmp.nextflow_assemble.dir'
        utils.rmtree(input_dir)
        samples = ['ERS1', 'ERS2', 'ERS3']
        samples_file = 'tmp.nextflow_assemble.samples'
        with open(samples_file, 'w') as f:
            print(*samples, sep='\n', file=f)

        sdirs = sample_dirs.SampleDirs(input_dir)
        sdirs.add_samples(samples_file)
        os.unlink(samples_file)

        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'assemble.nf')
        work_dir = 'tmp.nextflow_assemble.work'
        outdir = 'tmp.nextflow_assemble.out'

        command = ' '.join([
            'nextflow run',
            '--input_dir', input_dir,
            '--testing',
            '--shovill_tempdir /foo/bar',
            '-c', nextflow_helper.config_file,
            '-w', work_dir,
            nextflow_file,
        ])

        try:
            completed_process = subprocess.check_output(command, stderr=subprocess.STDOUT, shell=True)
        except subprocess.CalledProcessError as e:
            print('Error running nextflow\nCommand: ', command)
            print('Output:', e.stdout.decode(), sep='\n')
            print('\n____________________________________\n')
            self.fail('Error running nextflow')

        expected_json = {
            "ERS3": {
                "reads": False,
                "asm": True,
                "annot": False,
                "ignore": False
                },
            "ERS1": {
                "reads": False,
                "asm": True,
                "annot": False,
                "ignore": False
                },
            "ERS2": {
                "reads": False,
                "asm": True,
                "annot": False,
                "ignore": False
                }
        }

        self.maxDiff = None

        sdirs = sample_dirs.SampleDirs(input_dir)
        self.assertEqual(expected_json, sdirs.sample_data)
        utils.rmtree(input_dir)
        utils.rmtree(work_dir)
        nextflow_helper.clean_files()
    def test_nextflow_generic_pipeline(self):
        """test nextflow generic pipeline using database"""
        tmp_data_dir = "tmp.nextflow_generic_pipeline_db_input.data"
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, "db.cnf")
        mysql_dump = os.path.join(data_dir, "mysql.dump")
        db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
        utils.syscall(
            "mysql --defaults-file="
            + mysql_config_file
            + ' -e "DROP DATABASE IF EXISTS '
            + db_config_data["db"]
            + "; CREATE DATABASE "
            + db_config_data["db"]
            + '"'
        )
        utils.syscall(
            "mysql --defaults-file="
            + mysql_config_file
            + " "
            + db_config_data["db"]
            + " < "
            + mysql_dump
        )
        pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
        nextflow_file = os.path.join(
            nextflow_helper.nextflow_dir, "generic_pipeline.nf"
        )
        work_dir = "tmp.nextflow_generic_pipeline.work"
        dag_file = "nextflow.generic_pipeline.dag.pdf"
        pipeline_name = "generic_pipeline"
        script = os.path.join(data_dir, "script.pl")

        try:
            os.unlink(dag_file)
        except FileNotFoundError:
            pass

        command = " ".join(
            [
                "nextflow run",
                "--dataset_name g1",  # one read pair is from group 2 and should get ignored
                "--pipeline_name",
                pipeline_name,
                "--pipeline_root",
                pipeline_root,
                "--script",
                script,
                "--db_config_file",
                db_ini_file,
                "--max_ram",
                "0.5",
                "-with-dag",
                dag_file,
                "-c",
                nextflow_helper.config_file,
                "-w",
                work_dir,
                nextflow_file,
            ]
        )
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table("Pipeline")
        got_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
        expected_rows = [
            {
                "isolate_id": 1,
                "seqrep_id": 1,
                "seqrep_pool": None,
                "version": "0.1.2",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 1,
                "seqrep_id": 2,
                "seqrep_pool": None,
                "version": "0.1.2",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 1,
                "seqrep_id": None,
                "seqrep_pool": "1_2",
                "version": clockwork_version,
                "pipeline_name": pipeline_name,
                "status": 1,
                "reference_id": None,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": "0.1.2",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": "0.1.2",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": pipeline_name,
                "status": 1,
                "reference_id": None,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": pipeline_name,
                "status": 1,
                "reference_id": None,
            },
            {
                "isolate_id": 3,
                "seqrep_id": 5,
                "seqrep_pool": None,
                "version": "0.1.2",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 3,
                "seqrep_id": None,
                "seqrep_pool": "1",
                "version": clockwork_version,
                "pipeline_name": pipeline_name,
                "status": -1,
                "reference_id": None,
            },
            {
                "isolate_id": 4,
                "seqrep_id": 6,
                "seqrep_pool": None,
                "version": "0.1.2",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
        ]
        expected_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
        self.assertEqual(expected_rows, got_rows)

        # check that the expected output file from the script.pl
        # got made (except for the sample that is expected to fail)

        ids = [
            {"sample": 1, "seqrep_id": "1_2", "isolate_id": 1, "seq_repl": "1_2"},
            {"sample": 2, "seqrep_id": 3, "isolate_id": 2, "seq_repl": "1"},
            {"sample": 2, "seqrep_id": 4, "isolate_id": 2, "seq_repl": "2"},
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(
                pipeline_root, id_dict["sample"], id_dict["isolate_id"]
            )
            pipeline_dir = iso_dir.pipeline_dir(
                id_dict["seq_repl"], pipeline_name, clockwork_version
            )
            counts_file = os.path.join(pipeline_dir, "count.txt")
            self.assertTrue(os.path.exists(counts_file))

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
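Several tests sort the rows returned from the database before comparing them to the expected list. operator.itemgetter works directly on the dict rows, ordering them by the named keys; a small self-contained illustration with made-up data:

from operator import itemgetter

rows = [
    {"isolate_id": 2, "pipeline_name": "remove_contam"},
    {"isolate_id": 1, "pipeline_name": "variant_call"},
    {"isolate_id": 1, "pipeline_name": "remove_contam"},
]
rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
# rows is now ordered (1, remove_contam), (1, variant_call), (2, remove_contam)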
Example #11
    def test_nextflow_run_callers(self):
        '''test nextflow_run_callers'''
        nextflow_helper.write_config_file()
        input_data_file = 'tmp.nextflow_run_callers.data.tsv'
        with open(input_data_file, 'w') as f:
            reads_prefix = os.path.join(data_dir, 'reads')
            print('ERR025839',
                  reads_prefix + '.1.1.fq',
                  reads_prefix + '.1.2.fq',
                  sep='\t',
                  file=f)
            print('sample2',
                  reads_prefix + '.2.1.fq',
                  reads_prefix + '.2.2.fq',
                  sep='\t',
                  file=f)

        callers_file = os.path.join(data_dir, 'callers.tsv')
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     'run_callers.nf')
        work_dir = 'tmp.nextflow_run_callers.work'
        outdir = 'tmp.nextflow_run_callers.out'

        command = ' '.join([
            'nextflow run',
            '--input_data_file',
            input_data_file,
            '--callers_file',
            callers_file,
            '--output_dir',
            outdir,
            '--species tb',
            '--testing',
            '-c',
            nextflow_helper.config_file,
            '-w',
            work_dir,
            nextflow_file,
        ])

        try:
            completed_process = subprocess.check_output(
                command, stderr=subprocess.STDOUT, shell=True)
        except subprocess.CalledProcessError as e:
            print('Error running nextflow\nCommand: ', command)
            print('Output:', e.stdout.decode(), sep='\n')
            print('\n____________________________________\n')
            self.fail('Error running nextflow')

        os.unlink(input_data_file)
        nextflow_helper.clean_files()

        expected_json = os.path.join(data_dir, 'expected.summary.json')
        with open(expected_json) as f:
            expect_json_data = json.load(f)

        files_to_check = [
            os.path.join(outdir, 'caller_output', '0', '0', 'summary.json'),
            os.path.join(outdir, 'caller_output', '0', '1', 'summary.json'),
        ]
        tools = [
            'KvarQ', 'Mykrobe.tb.Fail', 'Mykrobe.tb.walker-2015', 'TB-Profiler'
        ]

        for filename in files_to_check:
            with open(filename) as f:
                got = json.load(f)

            for tool in tools:
                # Check resistance calls. Can't check memory and time because
                # will be different each time it's run
                self.assertEqual(expect_json_data[tool]['Success'],
                                 got[tool]['Success'])
                if tool == 'Mykrobe.tb.Fail':
                    continue

                self.assertEqual(expect_json_data[tool]['resistance_calls'],
                                 got[tool]['resistance_calls'])
                self.assertIn('time_and_memory', got[tool])
                self.assertIn('ram', got[tool]['time_and_memory'])
                self.assertIn('system_time', got[tool]['time_and_memory'])
                self.assertIn('user_time', got[tool]['time_and_memory'])
                self.assertIn('wall_clock_time', got[tool]['time_and_memory'])

        shutil.rmtree(work_dir)

        self.assertTrue(os.path.exists(os.path.join(outdir, 'summary.json')))
        shutil.rmtree(outdir)
    def test_nextflow_generic_pipeline(self):
        '''test nextflow generic pipeline using database'''
        tmp_data_dir = 'tmp.nextflow_generic_pipeline_db_input.data'
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, 'db.cnf')
        mysql_dump = os.path.join(data_dir, 'mysql.dump')
        db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
        utils.syscall('mysql --defaults-file=' + mysql_config_file + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db'] + '; CREATE DATABASE ' + db_config_data['db'] + '"')
        utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' + db_config_data['db'] + ' < ' + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'generic_pipeline.nf')
        work_dir = 'tmp.nextflow_generic_pipeline.work'
        dag_file = 'nextflow.generic_pipeline.dag.pdf'
        pipeline_name = 'generic_pipeline'
        script = os.path.join(data_dir, 'script.pl')

        try:
            os.unlink(dag_file)
        except FileNotFoundError:
            pass

        command = ' '.join([
            'nextflow run',
            '--dataset_name g1', # one read pair is from group 2 and should get ignored
            '--pipeline_name', pipeline_name,
            '--pipeline_root', pipeline_root,
            '--script', script,
            '--db_config_file', db_ini_file,
            '--max_ram', '0.5',
            '-with-dag', dag_file,
            '-c', nextflow_helper.config_file,
            '-w', work_dir,
            nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table('Pipeline')
        got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
        expected_rows = [
            {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
            {'isolate_id': 1, 'seqrep_id': 2, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
            {'isolate_id': 1, 'seqrep_id': None, 'seqrep_pool': '1_2', 'version': clockwork_version, 'pipeline_name': pipeline_name, 'status': 1, 'reference_id': None},
            {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
            {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
            {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': pipeline_name, 'status': 1, 'reference_id': None},
            {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': pipeline_name, 'status': 1, 'reference_id': None},
            {'isolate_id': 3, 'seqrep_id': 5, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
            {'isolate_id': 3, 'seqrep_id': None, 'seqrep_pool': '1', 'version': clockwork_version, 'pipeline_name': pipeline_name, 'status': -1, 'reference_id': None},
            {'isolate_id': 4, 'seqrep_id': 6, 'seqrep_pool': None, 'version': '0.1.2', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1},
        ]
        expected_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
        self.assertEqual(expected_rows, got_rows)

        # check that the expected output file from the script.pl
        # got made (except for the sample that is expected to fail)

        ids = [
            {'sample': 1, 'seqrep_id': '1_2', 'isolate_id': 1, 'seq_repl': '1_2'},
            {'sample': 2, 'seqrep_id': 3, 'isolate_id': 2, 'seq_repl': '1'},
            {'sample': 2, 'seqrep_id': 4, 'isolate_id': 2, 'seq_repl': '2'},
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'], id_dict['isolate_id'])
            pipeline_dir = iso_dir.pipeline_dir(id_dict['seq_repl'], pipeline_name, clockwork_version)
            counts_file = os.path.join(pipeline_dir, 'count.txt')
            self.assertTrue(os.path.exists(counts_file))

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
Example #13
    def test_nextflow_qc_using_database(self):
        """test nextflow_qc using database"""
        tmp_data_dir = "tmp.nextflow_qc"
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, "db.cnf")
        mysql_dump = os.path.join(data_dir, "mysql.dump")
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall("mysql --defaults-file=" + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"] +
                      "; CREATE DATABASE " + db_config_data["db"] + '"')
        utils.syscall("mysql --defaults-file=" + mysql_config_file + " " +
                      db_config_data["db"] + " < " + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
        references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "qc.nf")
        work_dir = "tmp.nextflow_qc.work"
        dag_file = "nextflow.qc.dag.db.pdf"
        try:
            os.unlink(dag_file)
        except FileNotFoundError:
            pass

        command = " ".join([
            "nextflow run",
            "--dataset_name g1",  #  one of the samples is in group2 and should get ignored
            "--ref_id 1",
            "--references_root",
            os.path.abspath(references_root),
            "--pipeline_root",
            pipeline_root,
            "--db_config_file",
            db_ini_file,
            "-with-dag",
            dag_file,
            "-c",
            nextflow_helper.config_file,
            "-w",
            work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_pipeline_rows = database.get_rows_from_table("Pipeline")
        got_pipeline_rows.sort(key=itemgetter("seqrep_id"))
        expected_pipeline_rows = [
            {
                "isolate_id": 1,
                "seqrep_id": 1,
                "seqrep_pool": None,
                "version": "0.0.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 1,
                "seqrep_id": 1,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "qc",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 2,
                "seqrep_pool": None,
                "version": "0.0.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 2,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "qc",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 3,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": "0.0.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 3,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "qc",
                "status": -1,
                "reference_id": 1,
            },
            {
                "isolate_id": 4,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": "0.0.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
        ]
        self.assertEqual(expected_pipeline_rows, got_pipeline_rows)

        #  check QC stats added to database
        got_qc_rows = database.get_rows_from_table("QC")
        got_qc_rows.sort(key=itemgetter("seqrep_id"))
        expected_qc_rows = [
            {
                "seqrep_id": 1,
                "pipeline_version": clockwork_version,
                "fastqc1_adapter_content": "pass",
                "fastqc1_basic_statistics": "pass",
                "fastqc1_gc": 48.0,
                "fastqc1_kmer_content": "fail",
                "fastqc1_max_sequence_length": 75,
                "fastqc1_min_sequence_length": 75,
                "fastqc1_overrepresented_sequences": "fail",
                "fastqc1_per_base_n_content": "pass",
                "fastqc1_per_base_sequence_content": "fail",
                "fastqc1_per_base_sequence_quality": "pass",
                "fastqc1_per_sequence_gc_content": "fail",
                "fastqc1_per_sequence_quality_scores": "fail",
                "fastqc1_sequence_duplication_levels": "pass",
                "fastqc1_sequence_length_distribution": "pass",
                "fastqc1_sequences_flagged_as_poor_quality": 0,
                "fastqc1_total_sequences": 72,
                "fastqc2_adapter_content": "pass",
                "fastqc2_basic_statistics": "pass",
                "fastqc2_gc": 48.0,
                "fastqc2_kmer_content": "fail",
                "fastqc2_max_sequence_length": 75,
                "fastqc2_min_sequence_length": 75,
                "fastqc2_overrepresented_sequences": "fail",
                "fastqc2_per_base_n_content": "pass",
                "fastqc2_per_base_sequence_content": "fail",
                "fastqc2_per_base_sequence_quality": "pass",
                "fastqc2_per_sequence_gc_content": "fail",
                "fastqc2_per_sequence_quality_scores": "fail",
                "fastqc2_sequence_duplication_levels": "pass",
                "fastqc2_sequence_length_distribution": "pass",
                "fastqc2_sequences_flagged_as_poor_quality": 0,
                "fastqc2_total_sequences": 72,
                "samtools_average_quality": 40.0,
                "samtools_bases_mapped_cigar": 9900,
                "samtools_bases_trimmed": 0,
                "samtools_error_rate": 0.0,
                "samtools_insert_size_average": 199.6,
                "samtools_insert_size_standard_deviation": 1.0,
                "samtools_inward_oriented_pairs": 66,
                "samtools_outward_oriented_pairs": 0,
                "samtools_pairs_with_other_orientation": 0,
                "samtools_raw_total_sequences": 144,
                "samtools_reads_duplicated": 4,
                "samtools_reads_mapped": 132,
                "het_snp_het_calls": 0,
                "het_snp_positions": 983,
                "het_snp_total_snps": 0,
            },
            {
                "seqrep_id": 2,
                "pipeline_version": clockwork_version,
                "fastqc1_adapter_content": "pass",
                "fastqc1_basic_statistics": "pass",
                "fastqc1_gc": 48.0,
                "fastqc1_kmer_content": "fail",
                "fastqc1_max_sequence_length": 75,
                "fastqc1_min_sequence_length": 75,
                "fastqc1_overrepresented_sequences": "fail",
                "fastqc1_per_base_n_content": "pass",
                "fastqc1_per_base_sequence_content": "fail",
                "fastqc1_per_base_sequence_quality": "pass",
                "fastqc1_per_sequence_gc_content": "fail",
                "fastqc1_per_sequence_quality_scores": "fail",
                "fastqc1_sequence_duplication_levels": "pass",
                "fastqc1_sequence_length_distribution": "pass",
                "fastqc1_sequences_flagged_as_poor_quality": 0,
                "fastqc1_total_sequences": 72,
                "fastqc2_adapter_content": "pass",
                "fastqc2_basic_statistics": "pass",
                "fastqc2_gc": 49.0,
                "fastqc2_kmer_content": "fail",
                "fastqc2_max_sequence_length": 75,
                "fastqc2_min_sequence_length": 75,
                "fastqc2_overrepresented_sequences": "fail",
                "fastqc2_per_base_n_content": "pass",
                "fastqc2_per_base_sequence_content": "fail",
                "fastqc2_per_base_sequence_quality": "pass",
                "fastqc2_per_sequence_gc_content": "warn",
                "fastqc2_per_sequence_quality_scores": "fail",
                "fastqc2_sequence_duplication_levels": "pass",
                "fastqc2_sequence_length_distribution": "pass",
                "fastqc2_sequences_flagged_as_poor_quality": 0,
                "fastqc2_total_sequences": 72,
                "samtools_average_quality": 40.0,
                "samtools_bases_mapped_cigar": 9900,
                "samtools_bases_trimmed": 0,
                "samtools_error_rate": 0.0,
                "samtools_insert_size_average": 199.7,
                "samtools_insert_size_standard_deviation": 1.1,
                "samtools_inward_oriented_pairs": 66,
                "samtools_outward_oriented_pairs": 0,
                "samtools_pairs_with_other_orientation": 0,
                "samtools_raw_total_sequences": 144,
                "samtools_reads_duplicated": 0,
                "samtools_reads_mapped": 132,
                "het_snp_het_calls": 0,
                "het_snp_positions": 983,
                "het_snp_total_snps": 0,
            },
        ]
        self.assertEqual(expected_qc_rows, got_qc_rows)

        # check QC files got written. No need to check contents, as that is done
        # elsewhere. We're just checking nextflow runs OK here.
        ids = [
            {
                "sample": 1,
                "isolate_id": 1,
                "seq_repl": 43
            },
            {
                "sample": 2,
                "isolate_id": 2,
                "seq_repl": 45
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"],
                                             id_dict["isolate_id"])
            qc_root_dir = iso_dir.pipeline_dir(id_dict["seq_repl"], "qc",
                                               clockwork_version)
            self.assertTrue(os.path.exists(qc_root_dir))
            for method in ["fastqc", "samtools_qc"]:
                this_qc_dir = os.path.join(qc_root_dir, method)
                self.assertTrue(os.path.exists(this_qc_dir))
                self.assertTrue(len(os.listdir(this_qc_dir)) >= 1)

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
    def test_nextflow_remove_contam_using_database(self):
        '''test nextflow_remove_contam using database'''
        tmp_data_dir = 'tmp.nextflow_remove_contam'
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, 'db.cnf')
        mysql_dump = os.path.join(data_dir, 'mysql.dump')
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall('mysql --defaults-file=' + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db'] +
                      '; CREATE DATABASE ' + db_config_data['db'] + '"')
        utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' +
                      db_config_data['db'] + ' < ' + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
        references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     'remove_contam.nf')
        work_dir = 'tmp.nextflow_remove_contam.work'
        dag_file = 'nextflow.remove_contam.dag.db.pdf'
        try:
            os.unlink(dag_file)
        except FileNotFoundError:
            pass

        command = ' '.join([
            'nextflow run',
            '--dataset_name g1',  # one read pair has group g2, so should get ignored
            '--ref_id 1',
            '--references_root',
            os.path.abspath(references_root),
            '--pipeline_root',
            os.path.abspath(pipeline_root),
            '--db_config_file',
            db_ini_file,
            '--testing',
            '-with-dag',
            dag_file,
            '-c',
            nextflow_helper.config_file,
            '-w',
            work_dir,
            nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table('Pipeline')
        got_rows.sort(key=itemgetter('seqrep_id'))
        expected_rows = [
            {
                'isolate_id': 1,
                'seqrep_id': 1,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 2,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 3,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'remove_contam',
                'status': -1,
                'reference_id': 1
            },
        ]
        self.assertEqual(expected_rows, got_rows)

        # check database Read_counts table updated
        got_rows = database.get_rows_from_table('Read_counts')
        got_rows.sort(key=itemgetter('seqrep_id'))
        expected_rows = [
            {
                'seqrep_id': 1,
                'original_total': 198,
                'contamination': 40,
                'not_contamination': 132,
                'unmapped': 26,
                'total_after_remove_contam': 158,
            },
            {
                'seqrep_id': 2,
                'original_total': 156,
                'contamination': 12,
                'not_contamination': 132,
                'unmapped': 12,
                'total_after_remove_contam': 144,
            },
        ]
        self.assertEqual(expected_rows, got_rows)

        # check FASTQ files got written. No need to check contents, as that is done
        # elsewhere. We're just checking nextflow runs OK here.
        ids = [
            {
                'sample': 1,
                'isolate_id': 1,
                'seq_repl': 43
            },
            {
                'sample': 2,
                'isolate_id': 2,
                'seq_repl': 45
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'],
                                             id_dict['isolate_id'])
            for read_type in ('original', 'remove_contam', 'contam'):
                for i in (1, 2):
                    self.assertTrue(
                        os.path.exists(
                            iso_dir.reads_filename(read_type,
                                                   id_dict['seq_repl'], i)))

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
Example #15
    def test_nextflow_variant_call_using_database(self):
        """test nextflow_variant_call using database"""
        tmp_data_dir = "tmp.nextflow_variant_call_db_input.data"
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, "db.cnf")
        mysql_dump = os.path.join(data_dir, "mysql.dump")
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall("mysql --defaults-file=" + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"] +
                      "; CREATE DATABASE " + db_config_data["db"] + '"')
        utils.syscall("mysql --defaults-file=" + mysql_config_file + " " +
                      db_config_data["db"] + " < " + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
        references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     "variant_call.nf")
        work_dir = "tmp.nextflow_variant_call_db_input.work"
        dag_file = "nextflow.variant_call.dag.db.pdf"
        try:
            os.unlink(dag_file)
        except:
            pass

        command = " ".join([
            "nextflow run",
            "--dataset_name g1",  # one read pair is from group 2 and should get ignored
            "--ref_id 2",
            "--references_root",
            os.path.abspath(references_root),
            "--pipeline_root",
            pipeline_root,
            "--db_config_file",
            db_ini_file,
            "--cortex_mem_height 17",
            "--testing",
            # Using the truth ref is broken, and we never use it anyway,
            # so disable this for now
            #"--truth_ref",
            #os.path.join(tmp_data_dir, "truth_ref.fa"),
            "-with-dag",
            dag_file,
            "-c",
            nextflow_helper.config_file,
            "-w",
            work_dir,
            nextflow_file,
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table("Pipeline")
        got_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
        expected_rows = [
            {
                "isolate_id": 1,
                "seqrep_id": 1,
                "seqrep_pool": None,
                "version": "0.3.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 1,
                "seqrep_id": 2,
                "seqrep_pool": None,
                "version": "0.3.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 1,
                "seqrep_id": None,
                "seqrep_pool": "1_2",
                "version": clockwork_version,
                "pipeline_name": "variant_call",
                "status": 1,
                "reference_id": 2,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": "0.3.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": "0.3.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 3,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "variant_call",
                "status": 1,
                "reference_id": 2,
            },
            {
                "isolate_id": 2,
                "seqrep_id": 4,
                "seqrep_pool": None,
                "version": clockwork_version,
                "pipeline_name": "variant_call",
                "status": 1,
                "reference_id": 2,
            },
            {
                "isolate_id": 3,
                "seqrep_id": 5,
                "seqrep_pool": None,
                "version": "0.3.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
            {
                "isolate_id": 3,
                "seqrep_id": None,
                "seqrep_pool": "1",
                "version": clockwork_version,
                "pipeline_name": "variant_call",
                "status": -1,
                "reference_id": 2,
            },
            {
                "isolate_id": 4,
                "seqrep_id": 6,
                "seqrep_pool": None,
                "version": "0.3.1",
                "pipeline_name": "remove_contam",
                "status": 1,
                "reference_id": 1,
            },
        ]
        self.assertEqual(expected_rows, got_rows)

        # check VCF files etc. got written. No need to check contents; trust the tools.
        # We're just checking that nextflow runs OK here.
        ids = [
            {
                "sample": 1,
                "seqrep_id": "1_2",
                "isolate_id": 1,
                "seq_repl": "1_2",
                "sample_name":
                "site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2",
            },
            {
                "sample": 2,
                "seqrep_id": 3,
                "isolate_id": 2,
                "seq_repl": "1",
                "sample_name":
                "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1",
            },
            {
                "sample": 2,
                "seqrep_id": 4,
                "isolate_id": 2,
                "seq_repl": "2",
                "sample_name":
                "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2",
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict["sample"],
                                             id_dict["isolate_id"])
            pipeline_dir = iso_dir.pipeline_dir(id_dict["seq_repl"],
                                                "variant_call",
                                                clockwork_version,
                                                reference_id=2)
            self._files_are_present_and_correct(pipeline_dir,
                                                id_dict["sample_name"],
                                                expect_ref_check_files=False)

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
Example #16
0
    def test_nextflow_variant_call_using_database(self):
        '''test nextflow_variant_call using database'''
        tmp_data_dir = 'tmp.nextflow_variant_call_db_input.data'
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, 'db.cnf')
        mysql_dump = os.path.join(data_dir, 'mysql.dump')
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall('mysql --defaults-file=' + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db'] +
                      '; CREATE DATABASE ' + db_config_data['db'] + '"')
        utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' +
                      db_config_data['db'] + ' < ' + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
        references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     'variant_call.nf')
        work_dir = 'tmp.nextflow_variant_call_db_input.work'
        dag_file = 'nextflow.variant_call.dag.db.pdf'
        try:
            os.unlink(dag_file)
        except:
            pass

        command = ' '.join([
            'nextflow run',
            '--dataset_name g1',  # one read pair is from group 2 and should get ignored
            '--ref_id 2',
            '--references_root',
            os.path.abspath(references_root),
            '--pipeline_root',
            pipeline_root,
            '--db_config_file',
            db_ini_file,
            '--cortex_mem_height 17',
            '--testing',
            '-with-dag',
            dag_file,
            '-c',
            nextflow_helper.config_file,
            '-w',
            work_dir,
            nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table('Pipeline')
        got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
        expected_rows = [
            {
                'isolate_id': 1,
                'seqrep_id': 1,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 1,
                'seqrep_id': 2,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 1,
                'seqrep_id': None,
                'seqrep_pool': '1_2',
                'version': clockwork_version,
                'pipeline_name': 'variant_call',
                'status': 1,
                'reference_id': 2
            },
            {
                'isolate_id': 2,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 4,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'variant_call',
                'status': 1,
                'reference_id': 2
            },
            {
                'isolate_id': 2,
                'seqrep_id': 4,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'variant_call',
                'status': 1,
                'reference_id': 2
            },
            {
                'isolate_id': 3,
                'seqrep_id': 5,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 3,
                'seqrep_id': None,
                'seqrep_pool': '1',
                'version': clockwork_version,
                'pipeline_name': 'variant_call',
                'status': -1,
                'reference_id': 2
            },
            {
                'isolate_id': 4,
                'seqrep_id': 6,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
        ]
        self.assertEqual(expected_rows, got_rows)

        # check VCF files etc. got written. No need to check contents; trust the tools.
        # We're just checking that nextflow runs OK here.
        ids = [
            {
                'sample': 1,
                'seqrep_id': '1_2',
                'isolate_id': 1,
                'seq_repl': '1_2'
            },
            {
                'sample': 2,
                'seqrep_id': 3,
                'isolate_id': 2,
                'seq_repl': '1'
            },
            {
                'sample': 2,
                'seqrep_id': 4,
                'isolate_id': 2,
                'seq_repl': '2'
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'],
                                             id_dict['isolate_id'])
            pipeline_dir = iso_dir.pipeline_dir(id_dict['seq_repl'],
                                                'variant_call',
                                                clockwork_version,
                                                reference_id=2)
            expected_sample = '.'.join([
                str(id_dict[x])
                for x in ['sample', 'isolate_id', 'seqrep_id', 'seq_repl']
            ])
            self._files_are_present_and_correct(pipeline_dir, expected_sample)

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
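All of the database-backed tests repeat the same MySQL reset boilerplate: drop the database, recreate it, then load a dump. A sketch of a shared helper for that step, reusing only the utils.syscall and db_connection calls already seen above (the function name is hypothetical):

def reset_test_database(mysql_config_file, mysql_dump, db_ini_file):
    """Drop, recreate, and repopulate the test database from a dump file."""
    db_name = db_connection.DbConnection._parse_config_file(db_ini_file)['db']
    # Drop and recreate the database named in the db ini file
    utils.syscall('mysql --defaults-file=' + mysql_config_file +
                  ' -e "DROP DATABASE IF EXISTS ' + db_name +
                  '; CREATE DATABASE ' + db_name + '"')
    # Reload the schema and fixture rows from the dump
    utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' +
                  db_name + ' < ' + mysql_dump)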
Example #17
0
    def test_nextflow_qc_using_database(self):
        '''test nextflow_qc using database'''
        tmp_data_dir = 'tmp.nextflow_qc'
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, 'db.cnf')
        mysql_dump = os.path.join(data_dir, 'mysql.dump')
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall('mysql --defaults-file=' + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db'] +
                      '; CREATE DATABASE ' + db_config_data['db'] + '"')
        utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' +
                      db_config_data['db'] + ' < ' + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
        references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'qc.nf')
        work_dir = 'tmp.nextflow_qc.work'
        dag_file = 'nextflow.qc.dag.db.pdf'
        try:
            os.unlink(dag_file)
        except:
            pass

        command = ' '.join([
            'nextflow run',
            '--dataset_name g1',  # one of the samples is in group 2 and should get ignored
            '--ref_id 1',
            '--references_root',
            os.path.abspath(references_root),
            '--pipeline_root',
            pipeline_root,
            '--db_config_file',
            db_ini_file,
            '-with-dag',
            dag_file,
            '-c',
            nextflow_helper.config_file,
            '-w',
            work_dir,
            nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected
        database = db.Db(db_ini_file)
        got_pipeline_rows = database.get_rows_from_table('Pipeline')
        got_pipeline_rows.sort(key=itemgetter('seqrep_id'))
        expected_pipeline_rows = [
            {
                'isolate_id': 1,
                'seqrep_id': 1,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 1,
                'seqrep_id': 1,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'qc',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 2,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 2,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'qc',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 3,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 3,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'qc',
                'status': -1,
                'reference_id': 1
            },
            {
                'isolate_id': 4,
                'seqrep_id': 4,
                'seqrep_pool': None,
                'version': '0.0.1',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
        ]
        self.assertEqual(expected_pipeline_rows, got_pipeline_rows)

        # check QC stats added to database
        got_qc_rows = database.get_rows_from_table('QC')
        got_qc_rows.sort(key=itemgetter('seqrep_id'))
        expected_qc_rows = [{
            'seqrep_id': 1,
            'pipeline_version': clockwork_version,
            'fastqc1_adapter_content': 'pass',
            'fastqc1_basic_statistics': 'pass',
            'fastqc1_gc': 48.0,
            'fastqc1_kmer_content': 'fail',
            'fastqc1_max_sequence_length': 75,
            'fastqc1_min_sequence_length': 75,
            'fastqc1_overrepresented_sequences': 'fail',
            'fastqc1_per_base_n_content': 'pass',
            'fastqc1_per_base_sequence_content': 'fail',
            'fastqc1_per_base_sequence_quality': 'pass',
            'fastqc1_per_sequence_gc_content': 'fail',
            'fastqc1_per_sequence_quality_scores': 'fail',
            'fastqc1_sequence_duplication_levels': 'pass',
            'fastqc1_sequence_length_distribution': 'pass',
            'fastqc1_sequences_flagged_as_poor_quality': 0,
            'fastqc1_total_sequences': 72,
            'fastqc2_adapter_content': 'pass',
            'fastqc2_basic_statistics': 'pass',
            'fastqc2_gc': 48.0,
            'fastqc2_kmer_content': 'fail',
            'fastqc2_max_sequence_length': 75,
            'fastqc2_min_sequence_length': 75,
            'fastqc2_overrepresented_sequences': 'fail',
            'fastqc2_per_base_n_content': 'pass',
            'fastqc2_per_base_sequence_content': 'fail',
            'fastqc2_per_base_sequence_quality': 'pass',
            'fastqc2_per_sequence_gc_content': 'fail',
            'fastqc2_per_sequence_quality_scores': 'fail',
            'fastqc2_sequence_duplication_levels': 'pass',
            'fastqc2_sequence_length_distribution': 'pass',
            'fastqc2_sequences_flagged_as_poor_quality': 0,
            'fastqc2_total_sequences': 72,
            'samtools_average_quality': 40.0,
            'samtools_bases_mapped_cigar': 9900,
            'samtools_bases_trimmed': 0,
            'samtools_error_rate': 0.0,
            'samtools_insert_size_average': 199.6,
            'samtools_insert_size_standard_deviation': 1.0,
            'samtools_inward_oriented_pairs': 66,
            'samtools_outward_oriented_pairs': 0,
            'samtools_pairs_with_other_orientation': 0,
            'samtools_raw_total_sequences': 144,
            'samtools_reads_duplicated': 4,
            'samtools_reads_mapped': 132,
            'het_snp_het_calls': 0,
            'het_snp_positions': 983,
            'het_snp_total_snps': 0,
        }, {
            'seqrep_id': 2,
            'pipeline_version': clockwork_version,
            'fastqc1_adapter_content': 'pass',
            'fastqc1_basic_statistics': 'pass',
            'fastqc1_gc': 48.0,
            'fastqc1_kmer_content': 'fail',
            'fastqc1_max_sequence_length': 75,
            'fastqc1_min_sequence_length': 75,
            'fastqc1_overrepresented_sequences': 'fail',
            'fastqc1_per_base_n_content': 'pass',
            'fastqc1_per_base_sequence_content': 'fail',
            'fastqc1_per_base_sequence_quality': 'pass',
            'fastqc1_per_sequence_gc_content': 'fail',
            'fastqc1_per_sequence_quality_scores': 'fail',
            'fastqc1_sequence_duplication_levels': 'pass',
            'fastqc1_sequence_length_distribution': 'pass',
            'fastqc1_sequences_flagged_as_poor_quality': 0,
            'fastqc1_total_sequences': 72,
            'fastqc2_adapter_content': 'pass',
            'fastqc2_basic_statistics': 'pass',
            'fastqc2_gc': 49.0,
            'fastqc2_kmer_content': 'fail',
            'fastqc2_max_sequence_length': 75,
            'fastqc2_min_sequence_length': 75,
            'fastqc2_overrepresented_sequences': 'fail',
            'fastqc2_per_base_n_content': 'pass',
            'fastqc2_per_base_sequence_content': 'fail',
            'fastqc2_per_base_sequence_quality': 'pass',
            'fastqc2_per_sequence_gc_content': 'warn',
            'fastqc2_per_sequence_quality_scores': 'fail',
            'fastqc2_sequence_duplication_levels': 'pass',
            'fastqc2_sequence_length_distribution': 'pass',
            'fastqc2_sequences_flagged_as_poor_quality': 0,
            'fastqc2_total_sequences': 72,
            'samtools_average_quality': 40.0,
            'samtools_bases_mapped_cigar': 9900,
            'samtools_bases_trimmed': 0,
            'samtools_error_rate': 0.0,
            'samtools_insert_size_average': 199.7,
            'samtools_insert_size_standard_deviation': 1.1,
            'samtools_inward_oriented_pairs': 66,
            'samtools_outward_oriented_pairs': 0,
            'samtools_pairs_with_other_orientation': 0,
            'samtools_raw_total_sequences': 144,
            'samtools_reads_duplicated': 0,
            'samtools_reads_mapped': 132,
            'het_snp_het_calls': 0,
            'het_snp_positions': 983,
            'het_snp_total_snps': 0,
        }]
        self.assertEqual(expected_qc_rows, got_qc_rows)

        # check QC files got written. No need to check contents, as that is done
        # elsewhere. We're just checking nextflow runs OK here.
        ids = [
            {
                'sample': 1,
                'isolate_id': 1,
                'seq_repl': 43
            },
            {
                'sample': 2,
                'isolate_id': 2,
                'seq_repl': 45
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'],
                                             id_dict['isolate_id'])
            qc_root_dir = iso_dir.pipeline_dir(id_dict['seq_repl'], 'qc',
                                               clockwork_version)
            self.assertTrue(os.path.exists(qc_root_dir))
            for method in ['fastqc', 'samtools_qc']:
                this_qc_dir = os.path.join(qc_root_dir, method)
                self.assertTrue(os.path.exists(this_qc_dir))
                self.assertTrue(len(os.listdir(this_qc_dir)) >= 1)

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
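The QC directory checks at the end of this test could be factored out in the same way. A sketch with a hypothetical name, using only the assertions already present above:

import os

def assert_qc_dirs_populated(test_case, qc_root_dir, methods=('fastqc', 'samtools_qc')):
    """Assert the QC root dir exists and that each QC method wrote at least one file."""
    test_case.assertTrue(os.path.exists(qc_root_dir))
    for method in methods:
        method_dir = os.path.join(qc_root_dir, method)
        test_case.assertTrue(os.path.exists(method_dir))
        test_case.assertTrue(len(os.listdir(method_dir)) >= 1)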
Example #18
0
    def test_nextflow_mykrobe_predict(self):
        '''test nextflow_mykrobe using database'''
        tmp_data_dir = 'tmp.nextflow_mykrobe_db_input.data'
        if os.path.exists(tmp_data_dir):
            shutil.rmtree(tmp_data_dir)
        shutil.copytree(data_dir, tmp_data_dir)
        nextflow_helper.write_config_file()
        mysql_config_file = os.path.join(data_dir, 'db.cnf')
        mysql_dump = os.path.join(data_dir, 'mysql.dump')
        db_config_data = db_connection.DbConnection._parse_config_file(
            db_ini_file)
        utils.syscall('mysql --defaults-file=' + mysql_config_file +
                      ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db'] +
                      '; CREATE DATABASE ' + db_config_data['db'] + '"')
        utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' +
                      db_config_data['db'] + ' < ' + mysql_dump)
        pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
        references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
        nextflow_file = os.path.join(nextflow_helper.nextflow_dir,
                                     'mykrobe_predict.nf')
        work_dir = 'tmp.nextflow_mykrobe_db_input.work'
        dag_file = 'nextflow.mykrobe.dag.db.pdf'
        try:
            os.unlink(dag_file)
        except:
            pass

        command = ' '.join([
            'nextflow run',
            '--dataset_name g1',  # one read pair is from group 2 and should get ignored
            '--ref_id 2',
            '--references_root',
            os.path.abspath(references_root),
            '--pipeline_root',
            pipeline_root,
            '--db_config_file',
            db_ini_file,
            '--testing',
            '-with-dag',
            dag_file,
            '-c',
            nextflow_helper.config_file,
            '-w',
            work_dir,
            nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # check database Pipeline table updated as expected.
        # The --testing option is set up so that the pooled
        # sample fails, hence it gets a status of -1.
        database = db.Db(db_ini_file)
        got_rows = database.get_rows_from_table('Pipeline')
        got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
        expected_rows = [
            {
                'isolate_id': 1,
                'seqrep_id': None,
                'seqrep_pool': '1_2',
                'version': clockwork_version,
                'pipeline_name': 'mykrobe_predict',
                'status': -1,
                'reference_id': 2
            },
            {
                'isolate_id': 1,
                'seqrep_id': 1,
                'seqrep_pool': None,
                'version': '0.4.0',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 1,
                'seqrep_id': 2,
                'seqrep_pool': None,
                'version': '0.4.0',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'mykrobe_predict',
                'status': 1,
                'reference_id': 2
            },
            {
                'isolate_id': 2,
                'seqrep_id': 3,
                'seqrep_pool': None,
                'version': '0.4.0',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 4,
                'seqrep_pool': None,
                'version': '0.4.0',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 2,
                'seqrep_id': 4,
                'seqrep_pool': None,
                'version': clockwork_version,
                'pipeline_name': 'mykrobe_predict',
                'status': 1,
                'reference_id': 2
            },
            {
                'isolate_id': 3,
                'seqrep_id': None,
                'seqrep_pool': '1',
                'version': clockwork_version,
                'pipeline_name': 'mykrobe_predict',
                'status': 1,
                'reference_id': 2
            },
            {
                'isolate_id': 3,
                'seqrep_id': 5,
                'seqrep_pool': None,
                'version': '0.4.0',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
            {
                'isolate_id': 4,
                'seqrep_id': 6,
                'seqrep_pool': None,
                'version': '0.4.0',
                'pipeline_name': 'remove_contam',
                'status': 1,
                'reference_id': 1
            },
        ]
        expected_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
        self.assertEqual(expected_rows, got_rows)

        # check mykrobe output files etc. got written. No need to check contents; trust the tools.
        # We're just checking that nextflow runs OK here.
        ids = [
            {
                'sample': 1,
                'seqrep_id': '1_2',
                'isolate_id': 1,
                'seq_repl': '1_2',
                'sample_name':
                'site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2'
            },
            {
                'sample': 2,
                'seqrep_id': 3,
                'isolate_id': 2,
                'seq_repl': '1',
                'sample_name': 'site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1'
            },
            {
                'sample': 2,
                'seqrep_id': 4,
                'isolate_id': 2,
                'seq_repl': '2',
                'sample_name': 'site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2'
            },
        ]
        for id_dict in ids:
            iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'],
                                             id_dict['isolate_id'])
            pipeline_dir = iso_dir.pipeline_dir(id_dict['seq_repl'],
                                                'mykrobe_predict',
                                                clockwork_version,
                                                reference_id=2)
            self.assertTrue(os.path.exists(pipeline_dir))
            log = os.path.join(pipeline_dir, 'log.txt')
            json_file = os.path.join(pipeline_dir, 'out.json')

            if id_dict['sample_name'].endswith('1_2'):
                self.assertFalse(os.path.exists(log))
                self.assertFalse(os.path.exists(json_file))
            else:
                self.assertTrue(os.path.exists(log))
                self.assertTrue(os.path.exists(json_file))

        shutil.rmtree(tmp_data_dir)
        nextflow_helper.clean_files()
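Each test clears a stale DAG file with a bare try/except around os.unlink. A tighter sketch using contextlib.suppress, which ignores only the missing-file case (this helper is not part of the original code):

import contextlib
import os

def remove_if_exists(path):
    """Delete path if it exists, ignoring only a missing-file error."""
    with contextlib.suppress(FileNotFoundError):
        os.unlink(path)

# Usage in the tests would then be a one-liner:
# remove_if_exists(dag_file)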
Example #19
0
    def test_nextflow_import(self):
        '''test nextflow_import'''
        nextflow_helper.write_config_file()
        pipeline_root = 'tmp.nextflow_import.pipeline_root'
        os.mkdir(pipeline_root)
        try:
            db_connection.DbConnection(db_ini_file, destroy=True)
        except:
            pass

        dbm = db_maker.DbMaker(db_ini_file)
        dbm.run()

        dropbox_dir = 'tmp.nextflow_import.dropbox'
        shutil.copytree(os.path.join(data_dir, 'dropbox'), dropbox_dir)
        xlsx_archive_dir = 'tmp.nextflow_import.xlsx_archive'
        os.mkdir(xlsx_archive_dir)
        expected_xlsx_files = [
            os.path.basename(x)
            for x in glob.glob(os.path.join(dropbox_dir, '*.xlsx'))
        ]

        nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'import.nf')
        work_dir = 'tmp.nextflow_import.work'
        dag_file = 'nextflow.import.dag.pdf'
        try:
            os.unlink(dag_file)
        except:
            pass

        command = ' '.join([
            'nextflow run', '--dropbox_dir', dropbox_dir, '--pipeline_root',
            pipeline_root, '--db_config_file', db_ini_file,
            '--xlsx_archive_dir', xlsx_archive_dir, '-with-dag', dag_file,
            '-c', nextflow_helper.config_file, '-w', work_dir, nextflow_file
        ])
        utils.syscall(command)
        os.unlink(nextflow_helper.config_file)
        shutil.rmtree(work_dir)

        # All files should be gone from the dropbox
        self.assertEqual([], os.listdir(dropbox_dir))
        shutil.rmtree(dropbox_dir)

        # The two spreadsheets should have been archived
        got_xlsx_files = [
            os.path.basename(x)
            for x in glob.glob(os.path.join(xlsx_archive_dir, '**', '*.xlsx'))
        ]
        self.assertEqual(expected_xlsx_files, got_xlsx_files)
        shutil.rmtree(xlsx_archive_dir)

        # Check database updated correctly
        database = db.Db(db_ini_file)
        expected_sample_rows = [
            {
                'subject_id': 'p1',
                'site_id': 's1',
                'sample_id_from_lab': 'l1',
                'dataset_name': 'g1',
                'ena_center_name': 'Center A',
                'ena_sample_accession': 'ERS123456',
                'ena_study_accession': None
            },
            {
                'subject_id': 'p2',
                'site_id': 's2',
                'sample_id_from_lab': 'l2',
                'dataset_name': 'g2',
                'ena_center_name': 'Center A',
                'ena_sample_accession': None,
                'ena_study_accession': None
            },
            {
                'subject_id': 'p1',
                'site_id': 's3',
                'sample_id_from_lab': 'l1',
                'dataset_name': 'g1',
                'ena_center_name': 'Center B',
                'ena_sample_accession': None,
                'ena_study_accession': None
            },
        ]
        got_sample_rows = sorted(database.get_rows_from_table('Sample'),
                                 key=itemgetter('site_id'))
        # The rows also have sample_id, which is generated by the MySQL auto increment.
        # We don't know the order in which rows are added, so we can't check sample_id.
        for row in got_sample_rows:
            del row['sample_id']

        self.assertEqual(expected_sample_rows, got_sample_rows)

        expected_rows = [
            {
                'sequence_replicate_number': 1,
                'original_reads_file_1_md5':
                'edc176f367fe8e5a014c819b9ec9b05c',
                'original_reads_file_2_md5':
                '0dd551a0d76d90059808f6f7ddbb0e02',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 25),
                'submit_to_ena': 0,
                'ena_run_accession': 'ERR123456',
                'ena_on_hold': 0,
                'isolate_number_from_lab': '1',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2000'
            },
            {
                'sequence_replicate_number': 1,
                'original_reads_file_1_md5':
                'fe5cd28cf9394be14794f0a56a2fe845',
                'original_reads_file_2_md5':
                'd026fd9a439294ed42795bd7f1e7df10',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 26),
                'submit_to_ena': 1,
                'ena_run_accession': None,
                'ena_on_hold': 1,
                'isolate_number_from_lab': '1',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2000'
            },
            {
                'sequence_replicate_number': 1,
                'original_reads_file_1_md5':
                'aa8f077673c158c4f2a19fc3c50e3fa7',
                'original_reads_file_2_md5':
                'ae6bafef67da3c26576e799c32985ac9',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 26),
                'submit_to_ena': 1,
                'ena_run_accession': None,
                'ena_on_hold': 1,
                'isolate_number_from_lab': '2',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2000'
            },
            {
                'sequence_replicate_number': 1,
                'original_reads_file_1_md5':
                '6b9a34ed492dad739ac03e084f3b2ab9',
                'original_reads_file_2_md5':
                '7ceffc5314ff7e305b4ab5bd859850c9',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 25),
                'submit_to_ena': 1,
                'ena_run_accession': None,
                'ena_on_hold': 0,
                'isolate_number_from_lab': '1',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2500'
            },
            {
                'sequence_replicate_number': 2,
                'original_reads_file_1_md5':
                'ec0377e321c59c0b1b6392a3c6dfc2dc',
                'original_reads_file_2_md5':
                'd541ffdb43a0648233ec7408c3626bfd',
                'remove_contam_reads_file_1_md5': None,
                'remove_contam_reads_file_2_md5': None,
                'pool_sequence_replicates': 1,
                'withdrawn': 0,
                'import_status': 1,
                'submission_date': datetime.date(2017, 12, 25),
                'submit_to_ena': 1,
                'ena_run_accession': None,
                'ena_on_hold': 0,
                'isolate_number_from_lab': '1',
                'ena_experiment_accession': None,
                'instrument_model': 'Illumina HiSeq 2500'
            },
        ]

        expected_rows.sort(key=itemgetter('original_reads_file_1_md5'))
        query = 'SELECT * FROM (Seqrep JOIN Isolate ON Seqrep.isolate_id = Isolate.isolate_id)'
        got_rows = database.query_to_dict(query)
        got_rows.sort(key=itemgetter('original_reads_file_1_md5'))

        # Check reads files etc. were written correctly
        for isolate_data in got_rows:
            iso_dir = isolate_dir.IsolateDir(pipeline_root,
                                             isolate_data['sample_id'],
                                             isolate_data['isolate_id'])
            self.assertTrue(os.path.exists(iso_dir.reads_dir))

            for i in [1, 2]:
                self.assertTrue(
                    os.path.exists(
                        iso_dir.reads_filename(
                            'original',
                            isolate_data['sequence_replicate_number'], i)))

        # Similar to above: we don't know the sample_id, seqrep_id, or isolate_id, which are auto-generated.
        for row in got_rows:
            del row['sample_id']
            del row['seqrep_id']
            del row['isolate_id']

        self.assertEqual(expected_rows, got_rows)

        shutil.rmtree(pipeline_root)
        nextflow_helper.clean_files()
        database.commit_and_close()
        db_connection.DbConnection(db_ini_file, destroy=True, must_exist=True)
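The import test compares rows only after deleting the auto-increment columns, since their values depend on insertion order. A sketch of that pattern as a small reusable helper (the name is invented for illustration):

def rows_without_keys(rows, keys_to_drop):
    """Return copies of row dicts with auto-generated key columns removed,
    so they can be compared against hand-written expected rows."""
    return [{k: v for k, v in row.items() if k not in keys_to_drop}
            for row in rows]

# Mirroring the checks above (names taken from this test):
# got = rows_without_keys(got_rows, {'sample_id', 'seqrep_id', 'isolate_id'})
# self.assertEqual(expected_rows, got)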