def test_nextflow_qc_using_fastq_input(self):
    '''test nextflow_qc using fastq input'''
    reads1 = os.path.join(data_dir, 'Reads', 'reads.1.1.fq.gz')
    reads2 = os.path.join(data_dir, 'Reads', 'reads.1.2.fq.gz')
    output_dir = 'tmp.test_nextflow_qc_using_fastq_input'
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'qc.nf')
    nextflow_helper.write_config_file()
    work_dir = 'tmp.nextflow_qc.work'
    dag_file = 'nextflow.qc.dag.no_db.pdf'
    try:
        os.unlink(dag_file)
    except:
        pass
    command = ' '.join([
        'nextflow run',
        '--reads_in1', reads1,
        '--reads_in2', reads2,
        '--output_dir', output_dir,
        '--ref_fasta', os.path.join(data_dir, 'Reference', 'ref.fa'),
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)
    self.assertTrue(os.path.exists(output_dir))
    for method in ['fastqc', 'samtools_qc']:
        qc_dir = os.path.join(output_dir, method)
        self.assertTrue(os.path.exists(qc_dir))
        self.assertTrue(len(os.listdir(qc_dir)) >= 1)
    shutil.rmtree(output_dir)
    nextflow_helper.clean_files()
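# Every test here shells out through utils.syscall. For reference, below is a
# minimal sketch of such a wrapper; its behaviour (run via a shell, raise on
# non-zero exit) is an assumption for illustration, not clockwork's actual
# implementation.
def _syscall_sketch(command):
    import subprocess
    completed = subprocess.run(command, shell=True, capture_output=True, text=True)
    if completed.returncode != 0:
        # include stdout/stderr so a failed nextflow run is debuggable
        raise RuntimeError('Command failed: {}\n{}\n{}'.format(
            command, completed.stdout, completed.stderr))
    return completed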
def test_nextflow_variant_call_using_fastq_input(self):
    """test nextflow_variant_call using fastq input"""
    reads1 = os.path.join(data_dir, "Reads", "reads.1.1.fq.gz")
    reads2 = os.path.join(data_dir, "Reads", "reads.1.2.fq.gz")
    outdir = os.path.abspath("tmp.test_nextflow_variant_call_fastq_input.out")
    tmp_data_dir = "tmp.nextflow_variant_call_fastq_input.data"
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "variant_call.nf")
    nextflow_helper.write_config_file()
    work_dir = "tmp.nextflow_variant_call_fastq_input.work"
    sample_name = "test_sample_name"
    dag_file = "nextflow.variant_call.dag.no_db.pdf"
    try:
        os.unlink(dag_file)
    except:
        pass
    command = " ".join([
        "nextflow run",
        "--reads_in1", reads1,
        "--reads_in2", reads2,
        "--output_dir", outdir,
        "--ref_dir", os.path.join(tmp_data_dir, "Reference"),
        "--sample_name", sample_name,
        "--cortex_mem_height 17",
        "--gvcf",
        "--testing",
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    self._files_are_present_and_correct(
        outdir, sample_name, expect_rmdup_bam=True, expect_ref_check_files=False
    )
    self.assertTrue(os.path.exists(os.path.join(outdir, "minos", "gvcf.fasta")))
    self.assertTrue(os.path.exists(os.path.join(outdir, "minos", "gvcf.vcf")))
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)
    shutil.rmtree(tmp_data_dir)
    shutil.rmtree(outdir)
    nextflow_helper.clean_files()
def test_nextflow_remove_contam_using_fastq_input(self):
    '''test nextflow_remove_contam using fastq input'''
    reads1 = os.path.join(data_dir, 'Reads', 'reads.1.1.fq.gz')
    reads2 = os.path.join(data_dir, 'Reads', 'reads.1.2.fq.gz')
    outprefix = 'tmp.test_nextflow_remove_contam_using_fastq_input'
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'remove_contam.nf')
    nextflow_helper.write_config_file()
    work_dir = 'tmp.nextflow_remove_contam.work'
    dag_file = 'nextflow.remove_contam.dag.no_db.pdf'
    try:
        os.unlink(dag_file)
    except:
        pass
    command = ' '.join([
        'nextflow run',
        '--reads_in1', reads1,
        '--reads_in2', reads2,
        '--outprefix', outprefix,
        '--ref_metadata_tsv',
        os.path.join(data_dir, 'Reference', 'remove_contam_metadata.tsv'),
        '--ref_fasta', os.path.join(data_dir, 'Reference', 'ref.fa'),
        '--testing',
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)
    for reads_type in ('contam', 'remove_contam'):
        for i in ('1', '2'):
            filename = outprefix + '.' + reads_type + '.' + i + '.fq.gz'
            self.assertTrue(os.path.exists(filename))
            os.unlink(filename)
    expected_counts_lines = [
        'Name\tIs_contam\tReads\n',
        'contam\t1\t40\n',
        'ref\t0\t132\n',
        'Unmapped\t0\t26\n',
        'Reads_kept_after_remove_contam\t0\t158\n',
    ]
    counts_tsv = outprefix + '.counts.tsv'
    with open(counts_tsv) as f:
        got_counts_lines = f.readlines()
    self.assertEqual(expected_counts_lines, got_counts_lines)
    os.unlink(counts_tsv)
    nextflow_helper.clean_files()
def test_nextflow_qc_using_fastq_input(self):
    """test nextflow_qc using fastq input"""
    reads1 = os.path.join(data_dir, "Reads", "reads.1.1.fq.gz")
    reads2 = os.path.join(data_dir, "Reads", "reads.1.2.fq.gz")
    output_dir = "tmp.test_nextflow_qc_using_fastq_input"
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "qc.nf")
    nextflow_helper.write_config_file()
    work_dir = "tmp.nextflow_qc.work"
    dag_file = "nextflow.qc.dag.no_db.pdf"
    try:
        os.unlink(dag_file)
    except:
        pass
    command = " ".join([
        "nextflow run",
        "--reads_in1", reads1,
        "--reads_in2", reads2,
        "--output_dir", output_dir,
        "--ref_fasta", os.path.join(data_dir, "Reference", "ref.fa"),
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)
    self.assertTrue(os.path.exists(output_dir))
    for method in ["fastqc", "samtools_qc"]:
        qc_dir = os.path.join(output_dir, method)
        self.assertTrue(os.path.exists(qc_dir))
        self.assertTrue(len(os.listdir(qc_dir)) >= 1)
    shutil.rmtree(output_dir)
    nextflow_helper.clean_files()
def test_nextflow_variant_call_using_fastq_input(self):
    '''test nextflow_variant_call using fastq input'''
    reads1 = os.path.join(data_dir, 'Reads', 'reads.1.1.fq.gz')
    reads2 = os.path.join(data_dir, 'Reads', 'reads.1.2.fq.gz')
    outdir = os.path.abspath('tmp.test_nextflow_variant_call_fastq_input.out')
    tmp_data_dir = 'tmp.nextflow_variant_call_fastq_input.data'
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'variant_call.nf')
    nextflow_helper.write_config_file()
    work_dir = 'tmp.nextflow_variant_call_fastq_input.work'
    sample_name = 'test_sample_name'
    dag_file = 'nextflow.variant_call.dag.no_db.pdf'
    try:
        os.unlink(dag_file)
    except:
        pass
    command = ' '.join([
        'nextflow run',
        '--reads_in1', reads1,
        '--reads_in2', reads2,
        '--output_dir', outdir,
        '--ref_dir', os.path.join(tmp_data_dir, 'Reference'),
        '--sample_name', sample_name,
        '--cortex_mem_height 17',
        '--testing',
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    self._files_are_present_and_correct(
        outdir, sample_name, expect_rmdup_bam=True, expect_ref_check_files=False)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)
    shutil.rmtree(tmp_data_dir)
    shutil.rmtree(outdir)
    nextflow_helper.clean_files()
def test_nextflow_fake_remove_contam(self):
    """test nextflow_fake_remove_contam"""
    tmp_data_dir = "tmp.nextflow_fake_remove_contam"
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, "db.cnf")
    mysql_dump = os.path.join(data_dir, "mysql.dump")
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"]
        + "; CREATE DATABASE " + db_config_data["db"] + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_config_data["db"] + " < " + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
    references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "fake_remove_contam.nf")
    work_dir = "tmp.nextflow_fake_remove_contam.work"
    dag_file = "nextflow.fake_remove_contam.dag.db.pdf"
    try:
        os.unlink(dag_file)
    except:
        pass
    command = " ".join([
        "nextflow run",
        "--dataset_name g1",  # one read pair has group g2, so should get ignored
        "--pipeline_root", os.path.abspath(pipeline_root),
        "--db_config_file", db_ini_file,
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table("Pipeline")
    got_rows.sort(key=itemgetter("seqrep_id"))
    expected_rows = [
        {
            "isolate_id": 1,
            "seqrep_id": 1,
            "seqrep_pool": None,
            "version": clockwork_version,
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 0,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 2,
            "seqrep_pool": None,
            "version": clockwork_version,
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 0,
        },
        {
            "isolate_id": 3,
            "seqrep_id": 3,
            "seqrep_pool": None,
            "version": clockwork_version,
            "pipeline_name": "remove_contam",
            "status": -1,
            "reference_id": 0,
        },
    ]
    self.assertEqual(expected_rows, got_rows)

    # check database Read_counts table updated
    got_rows = database.get_rows_from_table("Read_counts")
    got_rows.sort(key=itemgetter("seqrep_id"))
    expected_rows = [
        {
            "seqrep_id": 1,
            "original_total": 12,
            "contamination": 0,
            "not_contamination": 12,
            "unmapped": 0,
            "total_after_remove_contam": 12,
        },
        {
            "seqrep_id": 2,
            "original_total": 26,
            "contamination": 0,
            "not_contamination": 26,
            "unmapped": 0,
            "total_after_remove_contam": 26,
        },
    ]
    self.assertEqual(expected_rows, got_rows)

    # check FASTQ files got written. No need to check contents, as that is done
    # elsewhere. We're just checking nextflow runs OK here.
    ids = [
        {"sample": 1, "isolate_id": 1, "seq_repl": 1},
        {"sample": 2, "isolate_id": 2, "seq_repl": 1},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(
            pipeline_root, id_dict["sample"], id_dict["isolate_id"]
        )
        for read_type in ("original", "remove_contam"):
            for i in (1, 2):
                self.assertTrue(
                    os.path.exists(
                        iso_dir.reads_filename(read_type, id_dict["seq_repl"], i)
                    )
                )
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
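# The database-driven tests in this file all repeat the same two mysql calls
# to drop, recreate, and reload the test database. Below is a sketch of that
# setup factored into one helper; the name and signature are this sketch's
# own, not part of clockwork.
def _reset_test_db_sketch(mysql_config_file, mysql_dump, db_name):
    # drop and recreate the database, then load the fixture dump
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_name
        + "; CREATE DATABASE " + db_name + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_name + " < " + mysql_dump
    )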
def test_nextflow_mykrobe_predict(self):
    """test nextflow_mykrobe using database"""
    tmp_data_dir = "tmp.nextflow_mykrobe_db_input.data"
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, "db.cnf")
    mysql_dump = os.path.join(data_dir, "mysql.dump")
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"]
        + "; CREATE DATABASE " + db_config_data["db"] + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_config_data["db"] + " < " + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
    references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "mykrobe_predict.nf")
    work_dir = "tmp.nextflow_mykrobe_db_input.work"
    dag_file = "nextflow.mykrobe.dag.db.pdf"
    try:
        os.unlink(dag_file)
    except:
        pass
    command = " ".join([
        "nextflow run",
        "--dataset_name g1",  # one read pair is from group 2 and should get ignored
        "--ref_id 2",
        "--references_root", os.path.abspath(references_root),
        "--pipeline_root", pipeline_root,
        "--db_config_file", db_ini_file,
        "--testing",
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected.
    # The --testing option is set up so that the pooled
    # sample fails, hence it gets a status of -1.
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table("Pipeline")
    got_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
    expected_rows = [
        {
            "isolate_id": 1,
            "seqrep_id": None,
            "seqrep_pool": "1_2",
            "version": clockwork_version,
            "pipeline_name": "mykrobe_predict",
            "status": -1,
            "reference_id": 2,
        },
        {
            "isolate_id": 1,
            "seqrep_id": 1,
            "seqrep_pool": None,
            "version": "0.4.0",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 1,
            "seqrep_id": 2,
            "seqrep_pool": None,
            "version": "0.4.0",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 3,
            "seqrep_pool": None,
            "version": clockwork_version,
            "pipeline_name": "mykrobe_predict",
            "status": 1,
            "reference_id": 2,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 3,
            "seqrep_pool": None,
            "version": "0.4.0",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 4,
            "seqrep_pool": None,
            "version": "0.4.0",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 4,
            "seqrep_pool": None,
            "version": clockwork_version,
            "pipeline_name": "mykrobe_predict",
            "status": 1,
            "reference_id": 2,
        },
        {
            "isolate_id": 3,
            "seqrep_id": None,
            "seqrep_pool": "1",
            "version": clockwork_version,
            "pipeline_name": "mykrobe_predict",
            "status": 1,
            "reference_id": 2,
        },
        {
            "isolate_id": 3,
            "seqrep_id": 5,
            "seqrep_pool": None,
            "version": "0.4.0",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 4,
            "seqrep_id": 6,
            "seqrep_pool": None,
            "version": "0.4.0",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
    ]
    expected_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
    self.assertEqual(expected_rows, got_rows)

    # check mykrobe output files etc got written. No need to check contents,
    # trust the tools. We're just checking nextflow runs OK here.
    ids = [
        {
            "sample": 1,
            "seqrep_id": "1_2",
            "isolate_id": 1,
            "seq_repl": "1_2",
            "sample_name": "site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2",
        },
        {
            "sample": 2,
            "seqrep_id": 3,
            "isolate_id": 2,
            "seq_repl": "1",
            "sample_name": "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1",
        },
        {
            "sample": 2,
            "seqrep_id": 4,
            "isolate_id": 2,
            "seq_repl": "2",
            "sample_name": "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2",
        },
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(
            pipeline_root, id_dict["sample"], id_dict["isolate_id"]
        )
        pipeline_dir = iso_dir.pipeline_dir(
            id_dict["seq_repl"],
            "mykrobe_predict",
            clockwork_version,
            reference_id=2,
        )
        self.assertTrue(os.path.exists(pipeline_dir))
        log = os.path.join(pipeline_dir, "log.txt")
        json_file = os.path.join(pipeline_dir, "out.json")
        if id_dict["sample_name"].endswith("1_2"):
            self.assertFalse(os.path.exists(log))
            self.assertFalse(os.path.exists(json_file))
        else:
            self.assertTrue(os.path.exists(log))
            self.assertTrue(os.path.exists(json_file))
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_remove_contam_using_fastq_input(self):
    """test nextflow_remove_contam using fastq input"""
    reads1 = os.path.join(data_dir, "Reads", "reads.1.1.fq.gz")
    reads2 = os.path.join(data_dir, "Reads", "reads.1.2.fq.gz")
    outprefix = "tmp.test_nextflow_remove_contam_using_fastq_input"
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "remove_contam.nf")
    nextflow_helper.write_config_file()
    work_dir = "tmp.nextflow_remove_contam.work"
    dag_file = "nextflow.remove_contam.dag.no_db.pdf"
    try:
        os.unlink(dag_file)
    except:
        pass
    command = " ".join([
        "nextflow run",
        "--reads_in1", reads1,
        "--reads_in2", reads2,
        "--outprefix", outprefix,
        "--ref_metadata_tsv",
        os.path.join(data_dir, "Reference", "remove_contam_metadata.tsv"),
        "--ref_fasta", os.path.join(data_dir, "Reference", "ref.fa"),
        "--testing",
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)
    for reads_type in ("contam", "remove_contam"):
        for i in ("1", "2"):
            filename = outprefix + "." + reads_type + "." + i + ".fq.gz"
            self.assertTrue(os.path.exists(filename))
            os.unlink(filename)
    expected_counts_lines = [
        "Name\tIs_contam\tReads\n",
        "contam\t1\t40\n",
        "ref\t0\t132\n",
        "Unmapped\t0\t26\n",
        "Reads_kept_after_remove_contam\t0\t158\n",
    ]
    counts_tsv = outprefix + ".counts.tsv"
    with open(counts_tsv) as f:
        got_counts_lines = f.readlines()
    self.assertEqual(expected_counts_lines, got_counts_lines)
    os.unlink(counts_tsv)
    nextflow_helper.clean_files()
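# The counts TSV checked above carries an invariant: reads kept after
# decontamination equal the non-contaminated mapped reads plus the unmapped
# reads (132 + 26 = 158 in the fixture data). Below is a sketch of an extra
# consistency check along those lines; the helper is illustrative, not part
# of the original test.
def _check_counts_invariant_sketch(counts_tsv):
    import csv
    counts = {}
    with open(counts_tsv) as f:
        for row in csv.DictReader(f, delimiter="\t"):
            counts[row["Name"]] = int(row["Reads"])
    # ref and Unmapped rows have Is_contam=0, so both count as kept
    assert counts["Reads_kept_after_remove_contam"] == (
        counts["ref"] + counts["Unmapped"]
    )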
def test_nextflow_assemble(self):
    '''test nextflow_assemble'''
    nextflow_helper.write_config_file()
    input_dir = 'tmp.nextflow_assemble.dir'
    utils.rmtree(input_dir)
    samples = ['ERS1', 'ERS2', 'ERS3']
    samples_file = 'tmp.nextflow_assemble.samples'
    with open(samples_file, 'w') as f:
        print(*samples, sep='\n', file=f)
    sdirs = sample_dirs.SampleDirs(input_dir)
    sdirs.add_samples(samples_file)
    os.unlink(samples_file)
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'assemble.nf')
    work_dir = 'tmp.nextflow_assemble.work'
    outdir = 'tmp.nextflow_assemble.out'
    command = ' '.join([
        'nextflow run',
        '--input_dir', input_dir,
        '--testing',
        '--shovill_tempdir /foo/bar',
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    try:
        completed_process = subprocess.check_output(
            command, stderr=subprocess.STDOUT, shell=True)
    except subprocess.CalledProcessError as e:
        print('Error running nextflow\nCommand: ', command)
        print('Output:', e.stdout.decode(), sep='\n')
        print('\n____________________________________\n')
        self.assertTrue(False)
    expected_json = {
        "ERS3": {"reads": False, "asm": True, "annot": False, "ignore": False},
        "ERS1": {"reads": False, "asm": True, "annot": False, "ignore": False},
        "ERS2": {"reads": False, "asm": True, "annot": False, "ignore": False},
    }
    self.maxDiff = None
    sdirs = sample_dirs.SampleDirs(input_dir)
    self.assertEqual(expected_json, sdirs.sample_data)
    utils.rmtree(input_dir)
    utils.rmtree(work_dir)
    nextflow_helper.clean_files()
def test_nextflow_generic_pipeline(self):
    """test nextflow generic pipeline using database"""
    tmp_data_dir = "tmp.nextflow_generic_pipeline_db_input.data"
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, "db.cnf")
    mysql_dump = os.path.join(data_dir, "mysql.dump")
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"]
        + "; CREATE DATABASE " + db_config_data["db"] + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_config_data["db"] + " < " + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "generic_pipeline.nf")
    work_dir = "tmp.nextflow_generic_pipeline.work"
    dag_file = "nextflow.generic_pipeline.dag.pdf"
    pipeline_name = "generic_pipeline"
    script = os.path.join(data_dir, "script.pl")
    try:
        os.unlink(dag_file)
    except:
        pass
    command = " ".join([
        "nextflow run",
        "--dataset_name g1",  # one read pair is from group 2 and should get ignored
        "--pipeline_name", pipeline_name,
        "--pipeline_root", pipeline_root,
        "--script", script,
        "--db_config_file", db_ini_file,
        "--max_ram", "0.5",
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table("Pipeline")
    got_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
    expected_rows = [
        {
            "isolate_id": 1,
            "seqrep_id": 1,
            "seqrep_pool": None,
            "version": "0.1.2",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 1,
            "seqrep_id": 2,
            "seqrep_pool": None,
            "version": "0.1.2",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 1,
            "seqrep_id": None,
            "seqrep_pool": "1_2",
            "version": clockwork_version,
            "pipeline_name": pipeline_name,
            "status": 1,
            "reference_id": None,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 3,
            "seqrep_pool": None,
            "version": "0.1.2",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 4,
            "seqrep_pool": None,
            "version": "0.1.2",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 3,
            "seqrep_pool": None,
            "version": clockwork_version,
            "pipeline_name": pipeline_name,
            "status": 1,
            "reference_id": None,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 4,
            "seqrep_pool": None,
            "version": clockwork_version,
            "pipeline_name": pipeline_name,
            "status": 1,
            "reference_id": None,
        },
        {
            "isolate_id": 3,
            "seqrep_id": 5,
            "seqrep_pool": None,
            "version": "0.1.2",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 3,
            "seqrep_id": None,
            "seqrep_pool": "1",
            "version": clockwork_version,
            "pipeline_name": pipeline_name,
            "status": -1,
            "reference_id": None,
        },
        {
            "isolate_id": 4,
            "seqrep_id": 6,
            "seqrep_pool": None,
            "version": "0.1.2",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
    ]
    expected_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
    self.assertEqual(expected_rows, got_rows)

    # check that the expected output file from the script.pl
    # got made (except for the sample that is expected to fail)
    ids = [
        {"sample": 1, "seqrep_id": "1_2", "isolate_id": 1, "seq_repl": "1_2"},
        {"sample": 2, "seqrep_id": 3, "isolate_id": 2, "seq_repl": "1"},
        {"sample": 2, "seqrep_id": 4, "isolate_id": 2, "seq_repl": "2"},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(
            pipeline_root, id_dict["sample"], id_dict["isolate_id"]
        )
        pipeline_dir = iso_dir.pipeline_dir(
            id_dict["seq_repl"], pipeline_name, clockwork_version
        )
        counts_file = os.path.join(pipeline_dir, "count.txt")
        self.assertTrue(os.path.exists(counts_file))
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_run_callers(self):
    '''test nextflow_run_callers'''
    nextflow_helper.write_config_file()
    input_data_file = 'tmp.nextflow_run_callers.data.tsv'
    with open(input_data_file, 'w') as f:
        reads_prefix = os.path.join(data_dir, 'reads')
        print('ERR025839', reads_prefix + '.1.1.fq',
              reads_prefix + '.1.2.fq', sep='\t', file=f)
        print('sample2', reads_prefix + '.2.1.fq',
              reads_prefix + '.2.2.fq', sep='\t', file=f)
    callers_file = os.path.join(data_dir, 'callers.tsv')
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'run_callers.nf')
    work_dir = 'tmp.nextflow_run_callers.work'
    outdir = 'tmp.nextflow_run_callers.out'
    command = ' '.join([
        'nextflow run',
        '--input_data_file', input_data_file,
        '--callers_file', callers_file,
        '--output_dir', outdir,
        '--species tb',
        '--testing',
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    try:
        completed_process = subprocess.check_output(
            command, stderr=subprocess.STDOUT, shell=True)
    except subprocess.CalledProcessError as e:
        print('Error running nextflow\nCommand: ', command)
        print('Output:', e.stdout.decode(), sep='\n')
        print('\n____________________________________\n')
        self.assertTrue(False)
    os.unlink(input_data_file)
    nextflow_helper.clean_files()
    expected_json = os.path.join(data_dir, 'expected.summary.json')
    with open(expected_json) as f:
        expect_json_data = json.load(f)
    files_to_check = [
        os.path.join(outdir, 'caller_output', '0', '0', 'summary.json'),
        os.path.join(outdir, 'caller_output', '0', '1', 'summary.json'),
    ]
    tools = ['KvarQ', 'Mykrobe.tb.Fail', 'Mykrobe.tb.walker-2015', 'TB-Profiler']
    for filename in files_to_check:
        with open(filename) as f:
            got = json.load(f)
        for tool in tools:
            # Check resistance calls. Can't check memory and time because they
            # will be different each time it's run
            self.assertEqual(expect_json_data[tool]['Success'],
                             got[tool]['Success'])
            if tool == 'Mykrobe.tb.Fail':
                continue
            self.assertEqual(expect_json_data[tool]['resistance_calls'],
                             got[tool]['resistance_calls'])
            self.assertIn('time_and_memory', got[tool])
            self.assertIn('ram', got[tool]['time_and_memory'])
            self.assertIn('system_time', got[tool]['time_and_memory'])
            self.assertIn('user_time', got[tool]['time_and_memory'])
            self.assertIn('wall_clock_time', got[tool]['time_and_memory'])
    shutil.rmtree(work_dir)
    self.assertTrue(os.path.exists(os.path.join(outdir, 'summary.json')))
    shutil.rmtree(outdir)
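# test_nextflow_assemble and test_nextflow_run_callers wrap the nextflow call
# in the same try/except so that the captured log gets printed when the run
# fails. The same pattern as a shared helper is sketched below; the name is
# hypothetical, the behaviour mirrors the inline code above.
def _run_nextflow_or_fail(self, command):
    try:
        return subprocess.check_output(
            command, stderr=subprocess.STDOUT, shell=True)
    except subprocess.CalledProcessError as e:
        print('Error running nextflow\nCommand: ', command)
        print('Output:', e.stdout.decode(), sep='\n')
        self.fail('nextflow run failed')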
def test_nextflow_generic_pipeline(self):
    '''test nextflow generic pipeline using database'''
    tmp_data_dir = 'tmp.nextflow_generic_pipeline_db_input.data'
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, 'db.cnf')
    mysql_dump = os.path.join(data_dir, 'mysql.dump')
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall('mysql --defaults-file=' + mysql_config_file
                  + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db']
                  + '; CREATE DATABASE ' + db_config_data['db'] + '"')
    utils.syscall('mysql --defaults-file=' + mysql_config_file + ' '
                  + db_config_data['db'] + ' < ' + mysql_dump)
    pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'generic_pipeline.nf')
    work_dir = 'tmp.nextflow_generic_pipeline.work'
    dag_file = 'nextflow.generic_pipeline.dag.pdf'
    pipeline_name = 'generic_pipeline'
    script = os.path.join(data_dir, 'script.pl')
    try:
        os.unlink(dag_file)
    except:
        pass
    command = ' '.join([
        'nextflow run',
        '--dataset_name g1',  # one read pair is from group 2 and should get ignored
        '--pipeline_name', pipeline_name,
        '--pipeline_root', pipeline_root,
        '--script', script,
        '--db_config_file', db_ini_file,
        '--max_ram', '0.5',
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table('Pipeline')
    got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
    expected_rows = [
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None,
         'version': '0.1.2', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': 2, 'seqrep_pool': None,
         'version': '0.1.2', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': None, 'seqrep_pool': '1_2',
         'version': clockwork_version, 'pipeline_name': pipeline_name,
         'status': 1, 'reference_id': None},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None,
         'version': '0.1.2', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None,
         'version': '0.1.2', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None,
         'version': clockwork_version, 'pipeline_name': pipeline_name,
         'status': 1, 'reference_id': None},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None,
         'version': clockwork_version, 'pipeline_name': pipeline_name,
         'status': 1, 'reference_id': None},
        {'isolate_id': 3, 'seqrep_id': 5, 'seqrep_pool': None,
         'version': '0.1.2', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 3, 'seqrep_id': None, 'seqrep_pool': '1',
         'version': clockwork_version, 'pipeline_name': pipeline_name,
         'status': -1, 'reference_id': None},
        {'isolate_id': 4, 'seqrep_id': 6, 'seqrep_pool': None,
         'version': '0.1.2', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
    ]
    expected_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
    self.assertEqual(expected_rows, got_rows)

    # check that the expected output file from the script.pl
    # got made (except for the sample that is expected to fail)
    ids = [
        {'sample': 1, 'seqrep_id': '1_2', 'isolate_id': 1, 'seq_repl': '1_2'},
        {'sample': 2, 'seqrep_id': 3, 'isolate_id': 2, 'seq_repl': '1'},
        {'sample': 2, 'seqrep_id': 4, 'isolate_id': 2, 'seq_repl': '2'},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'],
                                         id_dict['isolate_id'])
        pipeline_dir = iso_dir.pipeline_dir(id_dict['seq_repl'], pipeline_name,
                                            clockwork_version)
        counts_file = os.path.join(pipeline_dir, 'count.txt')
        self.assertTrue(os.path.exists(counts_file))
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_qc_using_database(self):
    """test nextflow_qc using database"""
    tmp_data_dir = "tmp.nextflow_qc"
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, "db.cnf")
    mysql_dump = os.path.join(data_dir, "mysql.dump")
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"]
        + "; CREATE DATABASE " + db_config_data["db"] + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_config_data["db"] + " < " + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
    references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "qc.nf")
    work_dir = "tmp.nextflow_qc.work"
    dag_file = "nextflow.qc.dag.db.pdf"
    try:
        os.unlink(dag_file)
    except:
        pass
    command = " ".join([
        "nextflow run",
        "--dataset_name g1",  # one of the samples is in group2 and should get ignored
        "--ref_id 1",
        "--references_root", os.path.abspath(references_root),
        "--pipeline_root", pipeline_root,
        "--db_config_file", db_ini_file,
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_pipeline_rows = database.get_rows_from_table("Pipeline")
    got_pipeline_rows.sort(key=itemgetter("seqrep_id"))
    expected_pipeline_rows = [
        {
            "isolate_id": 1,
            "seqrep_id": 1,
            "seqrep_pool": None,
            "version": "0.0.1",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 1,
            "seqrep_id": 1,
            "seqrep_pool": None,
            "version": clockwork_version,
            "pipeline_name": "qc",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 2,
            "seqrep_pool": None,
            "version": "0.0.1",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 2,
            "seqrep_pool": None,
            "version": clockwork_version,
            "pipeline_name": "qc",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 3,
            "seqrep_id": 3,
            "seqrep_pool": None,
            "version": "0.0.1",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 3,
            "seqrep_id": 3,
            "seqrep_pool": None,
            "version": clockwork_version,
            "pipeline_name": "qc",
            "status": -1,
            "reference_id": 1,
        },
        {
            "isolate_id": 4,
            "seqrep_id": 4,
            "seqrep_pool": None,
            "version": "0.0.1",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
    ]
    self.assertEqual(expected_pipeline_rows, got_pipeline_rows)

    # check QC stats added to database
    got_qc_rows = database.get_rows_from_table("QC")
    got_qc_rows.sort(key=itemgetter("seqrep_id"))
    expected_qc_rows = [
        {
            "seqrep_id": 1,
            "pipeline_version": clockwork_version,
            "fastqc1_adapter_content": "pass",
            "fastqc1_basic_statistics": "pass",
            "fastqc1_gc": 48.0,
            "fastqc1_kmer_content": "fail",
            "fastqc1_max_sequence_length": 75,
            "fastqc1_min_sequence_length": 75,
            "fastqc1_overrepresented_sequences": "fail",
            "fastqc1_per_base_n_content": "pass",
            "fastqc1_per_base_sequence_content": "fail",
            "fastqc1_per_base_sequence_quality": "pass",
            "fastqc1_per_sequence_gc_content": "fail",
            "fastqc1_per_sequence_quality_scores": "fail",
            "fastqc1_sequence_duplication_levels": "pass",
            "fastqc1_sequence_length_distribution": "pass",
            "fastqc1_sequences_flagged_as_poor_quality": 0,
            "fastqc1_total_sequences": 72,
            "fastqc2_adapter_content": "pass",
            "fastqc2_basic_statistics": "pass",
            "fastqc2_gc": 48.0,
            "fastqc2_kmer_content": "fail",
            "fastqc2_max_sequence_length": 75,
            "fastqc2_min_sequence_length": 75,
            "fastqc2_overrepresented_sequences": "fail",
            "fastqc2_per_base_n_content": "pass",
            "fastqc2_per_base_sequence_content": "fail",
            "fastqc2_per_base_sequence_quality": "pass",
            "fastqc2_per_sequence_gc_content": "fail",
            "fastqc2_per_sequence_quality_scores": "fail",
            "fastqc2_sequence_duplication_levels": "pass",
            "fastqc2_sequence_length_distribution": "pass",
            "fastqc2_sequences_flagged_as_poor_quality": 0,
            "fastqc2_total_sequences": 72,
            "samtools_average_quality": 40.0,
            "samtools_bases_mapped_cigar": 9900,
            "samtools_bases_trimmed": 0,
            "samtools_error_rate": 0.0,
            "samtools_insert_size_average": 199.6,
            "samtools_insert_size_standard_deviation": 1.0,
            "samtools_inward_oriented_pairs": 66,
            "samtools_outward_oriented_pairs": 0,
            "samtools_pairs_with_other_orientation": 0,
            "samtools_raw_total_sequences": 144,
            "samtools_reads_duplicated": 4,
            "samtools_reads_mapped": 132,
            "het_snp_het_calls": 0,
            "het_snp_positions": 983,
            "het_snp_total_snps": 0,
        },
        {
            "seqrep_id": 2,
            "pipeline_version": clockwork_version,
            "fastqc1_adapter_content": "pass",
            "fastqc1_basic_statistics": "pass",
            "fastqc1_gc": 48.0,
            "fastqc1_kmer_content": "fail",
            "fastqc1_max_sequence_length": 75,
            "fastqc1_min_sequence_length": 75,
            "fastqc1_overrepresented_sequences": "fail",
            "fastqc1_per_base_n_content": "pass",
            "fastqc1_per_base_sequence_content": "fail",
            "fastqc1_per_base_sequence_quality": "pass",
            "fastqc1_per_sequence_gc_content": "fail",
            "fastqc1_per_sequence_quality_scores": "fail",
            "fastqc1_sequence_duplication_levels": "pass",
            "fastqc1_sequence_length_distribution": "pass",
            "fastqc1_sequences_flagged_as_poor_quality": 0,
            "fastqc1_total_sequences": 72,
            "fastqc2_adapter_content": "pass",
            "fastqc2_basic_statistics": "pass",
            "fastqc2_gc": 49.0,
            "fastqc2_kmer_content": "fail",
            "fastqc2_max_sequence_length": 75,
            "fastqc2_min_sequence_length": 75,
            "fastqc2_overrepresented_sequences": "fail",
            "fastqc2_per_base_n_content": "pass",
            "fastqc2_per_base_sequence_content": "fail",
            "fastqc2_per_base_sequence_quality": "pass",
            "fastqc2_per_sequence_gc_content": "warn",
            "fastqc2_per_sequence_quality_scores": "fail",
            "fastqc2_sequence_duplication_levels": "pass",
            "fastqc2_sequence_length_distribution": "pass",
            "fastqc2_sequences_flagged_as_poor_quality": 0,
            "fastqc2_total_sequences": 72,
            "samtools_average_quality": 40.0,
            "samtools_bases_mapped_cigar": 9900,
            "samtools_bases_trimmed": 0,
            "samtools_error_rate": 0.0,
            "samtools_insert_size_average": 199.7,
            "samtools_insert_size_standard_deviation": 1.1,
            "samtools_inward_oriented_pairs": 66,
            "samtools_outward_oriented_pairs": 0,
            "samtools_pairs_with_other_orientation": 0,
            "samtools_raw_total_sequences": 144,
            "samtools_reads_duplicated": 0,
            "samtools_reads_mapped": 132,
            "het_snp_het_calls": 0,
            "het_snp_positions": 983,
            "het_snp_total_snps": 0,
        },
    ]
    self.assertEqual(expected_qc_rows, got_qc_rows)

    # check QC files got written. No need to check contents, as that is done
    # elsewhere. We're just checking nextflow runs OK here.
    ids = [
        {"sample": 1, "isolate_id": 1, "seq_repl": 43},
        {"sample": 2, "isolate_id": 2, "seq_repl": 45},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(
            pipeline_root, id_dict["sample"], id_dict["isolate_id"]
        )
        qc_root_dir = iso_dir.pipeline_dir(
            id_dict["seq_repl"], "qc", clockwork_version
        )
        self.assertTrue(os.path.exists(qc_root_dir))
        for method in ["fastqc", "samtools_qc"]:
            this_qc_dir = os.path.join(qc_root_dir, method)
            self.assertTrue(os.path.exists(this_qc_dir))
            self.assertTrue(len(os.listdir(this_qc_dir)) >= 1)
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
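# The samtools_* values asserted above apparently derive from "samtools stats"
# output, whose summary lines start with "SN", e.g.
# "SN<tab>raw total sequences:<tab>144". A sketch of parsing those lines is
# below; clockwork's real parser may differ, and this helper is illustrative.
def _parse_samtools_stats_sn_lines(stats_file):
    stats = {}
    with open(stats_file) as f:
        for line in f:
            if not line.startswith("SN\t"):
                continue
            # SN lines are: SN <tab> description: <tab> value [<tab> # comment]
            fields = line.rstrip("\n").split("\t")
            key = fields[1].rstrip(":").replace(" ", "_")
            stats[key] = fields[2]
    return stats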
def test_nextflow_remove_contam_using_database(self):
    '''test nextflow_remove_contam using database'''
    tmp_data_dir = 'tmp.nextflow_remove_contam'
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, 'db.cnf')
    mysql_dump = os.path.join(data_dir, 'mysql.dump')
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall('mysql --defaults-file=' + mysql_config_file
                  + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db']
                  + '; CREATE DATABASE ' + db_config_data['db'] + '"')
    utils.syscall('mysql --defaults-file=' + mysql_config_file + ' '
                  + db_config_data['db'] + ' < ' + mysql_dump)
    pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
    references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'remove_contam.nf')
    work_dir = 'tmp.nextflow_remove_contam.work'
    dag_file = 'nextflow.remove_contam.dag.db.pdf'
    try:
        os.unlink(dag_file)
    except:
        pass
    command = ' '.join([
        'nextflow run',
        '--dataset_name g1',  # one read pair has group g2, so should get ignored
        '--ref_id 1',
        '--references_root', os.path.abspath(references_root),
        '--pipeline_root', os.path.abspath(pipeline_root),
        '--db_config_file', db_ini_file,
        '--testing',
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table('Pipeline')
    got_rows.sort(key=itemgetter('seqrep_id'))
    expected_rows = [
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None,
         'version': clockwork_version, 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 2, 'seqrep_pool': None,
         'version': clockwork_version, 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 3, 'seqrep_id': 3, 'seqrep_pool': None,
         'version': clockwork_version, 'pipeline_name': 'remove_contam',
         'status': -1, 'reference_id': 1},
    ]
    self.assertEqual(expected_rows, got_rows)

    # check database Read_counts table updated
    got_rows = database.get_rows_from_table('Read_counts')
    got_rows.sort(key=itemgetter('seqrep_id'))
    expected_rows = [
        {'seqrep_id': 1, 'original_total': 198, 'contamination': 40,
         'not_contamination': 132, 'unmapped': 26,
         'total_after_remove_contam': 158},
        {'seqrep_id': 2, 'original_total': 156, 'contamination': 12,
         'not_contamination': 132, 'unmapped': 12,
         'total_after_remove_contam': 144},
    ]
    self.assertEqual(expected_rows, got_rows)

    # check FASTQ files got written. No need to check contents, as that is done
    # elsewhere. We're just checking nextflow runs OK here.
    ids = [
        {'sample': 1, 'isolate_id': 1, 'seq_repl': 43},
        {'sample': 2, 'isolate_id': 2, 'seq_repl': 45},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'],
                                         id_dict['isolate_id'])
        for read_type in ('original', 'remove_contam', 'contam'):
            for i in (1, 2):
                self.assertTrue(os.path.exists(
                    iso_dir.reads_filename(read_type, id_dict['seq_repl'], i)))
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_variant_call_using_database(self):
    """test nextflow_variant_call using database"""
    tmp_data_dir = "tmp.nextflow_variant_call_db_input.data"
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, "db.cnf")
    mysql_dump = os.path.join(data_dir, "mysql.dump")
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file
        + ' -e "DROP DATABASE IF EXISTS ' + db_config_data["db"]
        + "; CREATE DATABASE " + db_config_data["db"] + '"'
    )
    utils.syscall(
        "mysql --defaults-file=" + mysql_config_file + " "
        + db_config_data["db"] + " < " + mysql_dump
    )
    pipeline_root = os.path.join(tmp_data_dir, "Pipeline_root")
    references_root = os.path.join(tmp_data_dir, "Pipeline_refs")
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, "variant_call.nf")
    work_dir = "tmp.nextflow_variant_call_db_input.work"
    dag_file = "nextflow.variant_call.dag.db.pdf"
    try:
        os.unlink(dag_file)
    except:
        pass
    command = " ".join([
        "nextflow run",
        "--dataset_name g1",  # one read pair is from group 2 and should get ignored
        "--ref_id 2",
        "--references_root", os.path.abspath(references_root),
        "--pipeline_root", pipeline_root,
        "--db_config_file", db_ini_file,
        "--cortex_mem_height 17",
        "--testing",
        # Using truth ref is broken, and we never use it anyway,
        # so disable this for now
        # "--truth_ref",
        # os.path.join(tmp_data_dir, "truth_ref.fa"),
        "-with-dag", dag_file,
        "-c", nextflow_helper.config_file,
        "-w", work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table("Pipeline")
    got_rows.sort(key=itemgetter("isolate_id", "pipeline_name"))
    expected_rows = [
        {
            "isolate_id": 1,
            "seqrep_id": 1,
            "seqrep_pool": None,
            "version": "0.3.1",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 1,
            "seqrep_id": 2,
            "seqrep_pool": None,
            "version": "0.3.1",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 1,
            "seqrep_id": None,
            "seqrep_pool": "1_2",
            "version": clockwork_version,
            "pipeline_name": "variant_call",
            "status": 1,
            "reference_id": 2,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 3,
            "seqrep_pool": None,
            "version": "0.3.1",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 4,
            "seqrep_pool": None,
            "version": "0.3.1",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 3,
            "seqrep_pool": None,
            "version": clockwork_version,
            "pipeline_name": "variant_call",
            "status": 1,
            "reference_id": 2,
        },
        {
            "isolate_id": 2,
            "seqrep_id": 4,
            "seqrep_pool": None,
            "version": clockwork_version,
            "pipeline_name": "variant_call",
            "status": 1,
            "reference_id": 2,
        },
        {
            "isolate_id": 3,
            "seqrep_id": 5,
            "seqrep_pool": None,
            "version": "0.3.1",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
        {
            "isolate_id": 3,
            "seqrep_id": None,
            "seqrep_pool": "1",
            "version": clockwork_version,
            "pipeline_name": "variant_call",
            "status": -1,
            "reference_id": 2,
        },
        {
            "isolate_id": 4,
            "seqrep_id": 6,
            "seqrep_pool": None,
            "version": "0.3.1",
            "pipeline_name": "remove_contam",
            "status": 1,
            "reference_id": 1,
        },
    ]
    self.assertEqual(expected_rows, got_rows)

    # check VCF files etc got written. No need to check contents, trust the
    # tools. We're just checking nextflow runs OK here.
    ids = [
        {
            "sample": 1,
            "seqrep_id": "1_2",
            "isolate_id": 1,
            "seq_repl": "1_2",
            "sample_name": "site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2",
        },
        {
            "sample": 2,
            "seqrep_id": 3,
            "isolate_id": 2,
            "seq_repl": "1",
            "sample_name": "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1",
        },
        {
            "sample": 2,
            "seqrep_id": 4,
            "isolate_id": 2,
            "seq_repl": "2",
            "sample_name": "site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2",
        },
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(
            pipeline_root, id_dict["sample"], id_dict["isolate_id"]
        )
        pipeline_dir = iso_dir.pipeline_dir(
            id_dict["seq_repl"], "variant_call", clockwork_version, reference_id=2
        )
        self._files_are_present_and_correct(
            pipeline_dir, id_dict["sample_name"], expect_ref_check_files=False
        )
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_variant_call_using_database(self):
    '''test nextflow_variant_call using database'''
    tmp_data_dir = 'tmp.nextflow_variant_call_db_input.data'
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, 'db.cnf')
    mysql_dump = os.path.join(data_dir, 'mysql.dump')
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall('mysql --defaults-file=' + mysql_config_file
                  + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db']
                  + '; CREATE DATABASE ' + db_config_data['db'] + '"')
    utils.syscall('mysql --defaults-file=' + mysql_config_file + ' '
                  + db_config_data['db'] + ' < ' + mysql_dump)
    pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
    references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'variant_call.nf')
    work_dir = 'tmp.nextflow_variant_call_db_input.work'
    dag_file = 'nextflow.variant_call.dag.db.pdf'
    try:
        os.unlink(dag_file)
    except:
        pass
    command = ' '.join([
        'nextflow run',
        '--dataset_name g1',  # one read pair is from group 2 and should get ignored
        '--ref_id 2',
        '--references_root', os.path.abspath(references_root),
        '--pipeline_root', pipeline_root,
        '--db_config_file', db_ini_file,
        '--cortex_mem_height 17',
        '--testing',
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_rows = database.get_rows_from_table('Pipeline')
    got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name'))
    expected_rows = [
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None,
         'version': '0.0.1', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': 2, 'seqrep_pool': None,
         'version': '0.0.1', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': None, 'seqrep_pool': '1_2',
         'version': clockwork_version, 'pipeline_name': 'variant_call',
         'status': 1, 'reference_id': 2},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None,
         'version': '0.0.1', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None,
         'version': '0.0.1', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None,
         'version': clockwork_version, 'pipeline_name': 'variant_call',
         'status': 1, 'reference_id': 2},
        {'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None,
         'version': clockwork_version, 'pipeline_name': 'variant_call',
         'status': 1, 'reference_id': 2},
        {'isolate_id': 3, 'seqrep_id': 5, 'seqrep_pool': None,
         'version': '0.0.1', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 3, 'seqrep_id': None, 'seqrep_pool': '1',
         'version': clockwork_version, 'pipeline_name': 'variant_call',
         'status': -1, 'reference_id': 2},
        {'isolate_id': 4, 'seqrep_id': 6, 'seqrep_pool': None,
         'version': '0.0.1', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
    ]
    self.assertEqual(expected_rows, got_rows)

    # check VCF files etc got written. No need to check contents, trust the
    # tools. We're just checking nextflow runs OK here.
    ids = [
        {'sample': 1, 'seqrep_id': '1_2', 'isolate_id': 1, 'seq_repl': '1_2'},
        {'sample': 2, 'seqrep_id': 3, 'isolate_id': 2, 'seq_repl': '1'},
        {'sample': 2, 'seqrep_id': 4, 'isolate_id': 2, 'seq_repl': '2'},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'],
                                         id_dict['isolate_id'])
        pipeline_dir = iso_dir.pipeline_dir(id_dict['seq_repl'], 'variant_call',
                                            clockwork_version, reference_id=2)
        expected_sample = '.'.join([
            str(id_dict[x])
            for x in ['sample', 'isolate_id', 'seqrep_id', 'seq_repl']
        ])
        self._files_are_present_and_correct(pipeline_dir, expected_sample)
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_qc_using_database(self):
    '''test nextflow_qc using database'''
    tmp_data_dir = 'tmp.nextflow_qc'
    if os.path.exists(tmp_data_dir):
        shutil.rmtree(tmp_data_dir)
    shutil.copytree(data_dir, tmp_data_dir)
    nextflow_helper.write_config_file()
    mysql_config_file = os.path.join(data_dir, 'db.cnf')
    mysql_dump = os.path.join(data_dir, 'mysql.dump')
    db_config_data = db_connection.DbConnection._parse_config_file(db_ini_file)
    utils.syscall('mysql --defaults-file=' + mysql_config_file
                  + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db']
                  + '; CREATE DATABASE ' + db_config_data['db'] + '"')
    utils.syscall('mysql --defaults-file=' + mysql_config_file + ' '
                  + db_config_data['db'] + ' < ' + mysql_dump)
    pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root')
    references_root = os.path.join(tmp_data_dir, 'Pipeline_refs')
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'qc.nf')
    work_dir = 'tmp.nextflow_qc.work'
    dag_file = 'nextflow.qc.dag.db.pdf'
    try:
        os.unlink(dag_file)
    except:
        pass
    command = ' '.join([
        'nextflow run',
        '--dataset_name g1',  # one of the samples is in group2 and should get ignored
        '--ref_id 1',
        '--references_root', os.path.abspath(references_root),
        '--pipeline_root', pipeline_root,
        '--db_config_file', db_ini_file,
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # check database Pipeline table updated as expected
    database = db.Db(db_ini_file)
    got_pipeline_rows = database.get_rows_from_table('Pipeline')
    got_pipeline_rows.sort(key=itemgetter('seqrep_id'))
    expected_pipeline_rows = [
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None,
         'version': '0.0.1', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None,
         'version': clockwork_version, 'pipeline_name': 'qc',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 2, 'seqrep_pool': None,
         'version': '0.0.1', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 2, 'seqrep_id': 2, 'seqrep_pool': None,
         'version': clockwork_version, 'pipeline_name': 'qc',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 3, 'seqrep_id': 3, 'seqrep_pool': None,
         'version': '0.0.1', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
        {'isolate_id': 3, 'seqrep_id': 3, 'seqrep_pool': None,
         'version': clockwork_version, 'pipeline_name': 'qc',
         'status': -1, 'reference_id': 1},
        {'isolate_id': 4, 'seqrep_id': 4, 'seqrep_pool': None,
         'version': '0.0.1', 'pipeline_name': 'remove_contam',
         'status': 1, 'reference_id': 1},
    ]
    self.assertEqual(expected_pipeline_rows, got_pipeline_rows)

    # check QC stats added to database
    got_qc_rows = database.get_rows_from_table('QC')
    got_qc_rows.sort(key=itemgetter('seqrep_id'))
    expected_qc_rows = [
        {
            'seqrep_id': 1,
            'pipeline_version': clockwork_version,
            'fastqc1_adapter_content': 'pass',
            'fastqc1_basic_statistics': 'pass',
            'fastqc1_gc': 48.0,
            'fastqc1_kmer_content': 'fail',
            'fastqc1_max_sequence_length': 75,
            'fastqc1_min_sequence_length': 75,
            'fastqc1_overrepresented_sequences': 'fail',
            'fastqc1_per_base_n_content': 'pass',
            'fastqc1_per_base_sequence_content': 'fail',
            'fastqc1_per_base_sequence_quality': 'pass',
            'fastqc1_per_sequence_gc_content': 'fail',
            'fastqc1_per_sequence_quality_scores': 'fail',
            'fastqc1_sequence_duplication_levels': 'pass',
            'fastqc1_sequence_length_distribution': 'pass',
            'fastqc1_sequences_flagged_as_poor_quality': 0,
            'fastqc1_total_sequences': 72,
            'fastqc2_adapter_content': 'pass',
            'fastqc2_basic_statistics': 'pass',
            'fastqc2_gc': 48.0,
            'fastqc2_kmer_content': 'fail',
            'fastqc2_max_sequence_length': 75,
            'fastqc2_min_sequence_length': 75,
            'fastqc2_overrepresented_sequences': 'fail',
            'fastqc2_per_base_n_content': 'pass',
            'fastqc2_per_base_sequence_content': 'fail',
            'fastqc2_per_base_sequence_quality': 'pass',
            'fastqc2_per_sequence_gc_content': 'fail',
            'fastqc2_per_sequence_quality_scores': 'fail',
            'fastqc2_sequence_duplication_levels': 'pass',
            'fastqc2_sequence_length_distribution': 'pass',
            'fastqc2_sequences_flagged_as_poor_quality': 0,
            'fastqc2_total_sequences': 72,
            'samtools_average_quality': 40.0,
            'samtools_bases_mapped_cigar': 9900,
            'samtools_bases_trimmed': 0,
            'samtools_error_rate': 0.0,
            'samtools_insert_size_average': 199.6,
            'samtools_insert_size_standard_deviation': 1.0,
            'samtools_inward_oriented_pairs': 66,
            'samtools_outward_oriented_pairs': 0,
            'samtools_pairs_with_other_orientation': 0,
            'samtools_raw_total_sequences': 144,
            'samtools_reads_duplicated': 4,
            'samtools_reads_mapped': 132,
            'het_snp_het_calls': 0,
            'het_snp_positions': 983,
            'het_snp_total_snps': 0,
        },
        {
            'seqrep_id': 2,
            'pipeline_version': clockwork_version,
            'fastqc1_adapter_content': 'pass',
            'fastqc1_basic_statistics': 'pass',
            'fastqc1_gc': 48.0,
            'fastqc1_kmer_content': 'fail',
            'fastqc1_max_sequence_length': 75,
            'fastqc1_min_sequence_length': 75,
            'fastqc1_overrepresented_sequences': 'fail',
            'fastqc1_per_base_n_content': 'pass',
            'fastqc1_per_base_sequence_content': 'fail',
            'fastqc1_per_base_sequence_quality': 'pass',
            'fastqc1_per_sequence_gc_content': 'fail',
            'fastqc1_per_sequence_quality_scores': 'fail',
            'fastqc1_sequence_duplication_levels': 'pass',
            'fastqc1_sequence_length_distribution': 'pass',
            'fastqc1_sequences_flagged_as_poor_quality': 0,
            'fastqc1_total_sequences': 72,
            'fastqc2_adapter_content': 'pass',
            'fastqc2_basic_statistics': 'pass',
            'fastqc2_gc': 49.0,
            'fastqc2_kmer_content': 'fail',
            'fastqc2_max_sequence_length': 75,
            'fastqc2_min_sequence_length': 75,
            'fastqc2_overrepresented_sequences': 'fail',
            'fastqc2_per_base_n_content': 'pass',
            'fastqc2_per_base_sequence_content': 'fail',
            'fastqc2_per_base_sequence_quality': 'pass',
            'fastqc2_per_sequence_gc_content': 'warn',
            'fastqc2_per_sequence_quality_scores': 'fail',
            'fastqc2_sequence_duplication_levels': 'pass',
            'fastqc2_sequence_length_distribution': 'pass',
            'fastqc2_sequences_flagged_as_poor_quality': 0,
            'fastqc2_total_sequences': 72,
            'samtools_average_quality': 40.0,
            'samtools_bases_mapped_cigar': 9900,
            'samtools_bases_trimmed': 0,
            'samtools_error_rate': 0.0,
            'samtools_insert_size_average': 199.7,
            'samtools_insert_size_standard_deviation': 1.1,
            'samtools_inward_oriented_pairs': 66,
            'samtools_outward_oriented_pairs': 0,
            'samtools_pairs_with_other_orientation': 0,
            'samtools_raw_total_sequences': 144,
            'samtools_reads_duplicated': 0,
            'samtools_reads_mapped': 132,
            'het_snp_het_calls': 0,
            'het_snp_positions': 983,
            'het_snp_total_snps': 0,
        },
    ]
    self.assertEqual(expected_qc_rows, got_qc_rows)

    # check QC files got written. No need to check contents, as that is done
    # elsewhere. We're just checking nextflow runs OK here.
    ids = [
        {'sample': 1, 'isolate_id': 1, 'seq_repl': 43},
        {'sample': 2, 'isolate_id': 2, 'seq_repl': 45},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'],
                                         id_dict['isolate_id'])
        qc_root_dir = iso_dir.pipeline_dir(id_dict['seq_repl'], 'qc',
                                           clockwork_version)
        self.assertTrue(os.path.exists(qc_root_dir))
        for method in ['fastqc', 'samtools_qc']:
            this_qc_dir = os.path.join(qc_root_dir, method)
            self.assertTrue(os.path.exists(this_qc_dir))
            self.assertTrue(len(os.listdir(this_qc_dir)) >= 1)
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
def test_nextflow_mykrobe_predict(self): '''test nextflow_mykrobe using database''' tmp_data_dir = 'tmp.nextflow_mykrobe_db_input.data' if os.path.exists(tmp_data_dir): shutil.rmtree(tmp_data_dir) shutil.copytree(data_dir, tmp_data_dir) nextflow_helper.write_config_file() mysql_config_file = os.path.join(data_dir, 'db.cnf') mysql_dump = os.path.join(data_dir, 'mysql.dump') db_config_data = db_connection.DbConnection._parse_config_file( db_ini_file) utils.syscall('mysql --defaults-file=' + mysql_config_file + ' -e "DROP DATABASE IF EXISTS ' + db_config_data['db'] + '; CREATE DATABASE ' + db_config_data['db'] + '"') utils.syscall('mysql --defaults-file=' + mysql_config_file + ' ' + db_config_data['db'] + ' < ' + mysql_dump) pipeline_root = os.path.join(tmp_data_dir, 'Pipeline_root') references_root = os.path.join(tmp_data_dir, 'Pipeline_refs') nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'mykrobe_predict.nf') work_dir = 'tmp.nextflow_mykrobe_db_input.work' dag_file = 'nextflow.mykrobe.dag.db.pdf' try: os.unlink(dag_file) except: pass command = ' '.join([ 'nextflow run', '--dataset_name g1', # one read pair is from group 2 and should get ignored '--ref_id 2', '--references_root', os.path.abspath(references_root), '--pipeline_root', pipeline_root, '--db_config_file', db_ini_file, '--testing', '-with-dag', dag_file, '-c', nextflow_helper.config_file, '-w', work_dir, nextflow_file ]) utils.syscall(command) os.unlink(nextflow_helper.config_file) shutil.rmtree(work_dir) # check database Pipeline table updated as expected. # The --testing option is set up so that the pooled # sample fails, hence it gets a status of -1. database = db.Db(db_ini_file) got_rows = database.get_rows_from_table('Pipeline') got_rows.sort(key=itemgetter('isolate_id', 'pipeline_name')) expected_rows = [ { 'isolate_id': 1, 'seqrep_id': None, 'seqrep_pool': '1_2', 'version': clockwork_version, 'pipeline_name': 'mykrobe_predict', 'status': -1, 'reference_id': 2 }, { 'isolate_id': 1, 'seqrep_id': 1, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1 }, { 'isolate_id': 1, 'seqrep_id': 2, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1 }, { 'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'mykrobe_predict', 'status': 1, 'reference_id': 2 }, { 'isolate_id': 2, 'seqrep_id': 3, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1 }, { 'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1 }, { 'isolate_id': 2, 'seqrep_id': 4, 'seqrep_pool': None, 'version': clockwork_version, 'pipeline_name': 'mykrobe_predict', 'status': 1, 'reference_id': 2 }, { 'isolate_id': 3, 'seqrep_id': None, 'seqrep_pool': '1', 'version': clockwork_version, 'pipeline_name': 'mykrobe_predict', 'status': 1, 'reference_id': 2 }, { 'isolate_id': 3, 'seqrep_id': 5, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1 }, { 'isolate_id': 4, 'seqrep_id': 6, 'seqrep_pool': None, 'version': '0.4.0', 'pipeline_name': 'remove_contam', 'status': 1, 'reference_id': 1 }, ] expected_rows.sort(key=itemgetter('isolate_id', 'pipeline_name')) self.assertEqual(expected_rows, got_rows) # check mykrobe output files etc got written. No need to check contents, trust the tools # We're just checking nextflow runs OK here. 
    ids = [
        {'sample': 1, 'seqrep_id': '1_2', 'isolate_id': 1, 'seq_repl': '1_2',
         'sample_name': 'site.s1.iso.42.subject.p1.lab_id.l1.seq_reps.1_2'},
        {'sample': 2, 'seqrep_id': 3, 'isolate_id': 2, 'seq_repl': '1',
         'sample_name': 'site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.1'},
        {'sample': 2, 'seqrep_id': 4, 'isolate_id': 2, 'seq_repl': '2',
         'sample_name': 'site.s2.iso.43.subject.p2.lab_id.l2.seq_reps.2'},
    ]
    for id_dict in ids:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, id_dict['sample'], id_dict['isolate_id'])
        pipeline_dir = iso_dir.pipeline_dir(id_dict['seq_repl'], 'mykrobe_predict', clockwork_version, reference_id=2)
        self.assertTrue(os.path.exists(pipeline_dir))
        log = os.path.join(pipeline_dir, 'log.txt')
        json_file = os.path.join(pipeline_dir, 'out.json')
        if id_dict['sample_name'].endswith('1_2'):
            self.assertFalse(os.path.exists(log))
            self.assertFalse(os.path.exists(json_file))
        else:
            self.assertTrue(os.path.exists(log))
            self.assertTrue(os.path.exists(json_file))
    shutil.rmtree(tmp_data_dir)
    nextflow_helper.clean_files()
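
# Each test removes a possibly-absent stale DAG file with a bare
# try/os.unlink/except, which also hides unrelated errors such as permission
# problems. A minimal sketch of a narrower alternative, standard library
# only; the name unlink_if_exists is hypothetical:
import contextlib  # would normally sit with the other imports at the top


def unlink_if_exists(path):
    '''Remove path, ignoring only the file-not-found case.'''
    with contextlib.suppress(FileNotFoundError):
        os.unlink(path)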
def test_nextflow_import(self):
    '''test nextflow_import'''
    nextflow_helper.write_config_file()
    pipeline_root = 'tmp.nextflow_import.pipeline_root'
    os.mkdir(pipeline_root)
    try:
        db_connection.DbConnection(db_ini_file, destroy=True)
    except:
        pass
    dbm = db_maker.DbMaker(db_ini_file)
    dbm.run()
    dropbox_dir = 'tmp.nextflow_import.dropbox'
    shutil.copytree(os.path.join(data_dir, 'dropbox'), dropbox_dir)
    xlsx_archive_dir = 'tmp.nextflow_import.xlsx_archive'
    os.mkdir(xlsx_archive_dir)
    expected_xlsx_files = [
        os.path.basename(x)
        for x in glob.glob(os.path.join(dropbox_dir, '*.xlsx'))
    ]
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'import.nf')
    work_dir = 'tmp.nextflow_import.work'
    dag_file = 'nextflow.import.dag.pdf'
    try:
        os.unlink(dag_file)
    except:
        pass
    command = ' '.join([
        'nextflow run',
        '--dropbox_dir', dropbox_dir,
        '--pipeline_root', pipeline_root,
        '--db_config_file', db_ini_file,
        '--xlsx_archive_dir', xlsx_archive_dir,
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # All files should be gone from the dropbox
    self.assertEqual([], os.listdir(dropbox_dir))
    shutil.rmtree(dropbox_dir)

    # The two spreadsheets should have been archived
    got_xlsx_files = [
        os.path.basename(x)
        for x in glob.glob(os.path.join(xlsx_archive_dir, '**', '*.xlsx'))
    ]
    self.assertEqual(expected_xlsx_files, got_xlsx_files)
    shutil.rmtree(xlsx_archive_dir)

    # Check database updated correctly
    database = db.Db(db_ini_file)
    expected_sample_rows = [
        {'subject_id': 'p1', 'site_id': 's1', 'sample_id_from_lab': 'l1', 'dataset_name': 'g1',
         'ena_center_name': 'Center A', 'ena_sample_accession': 'ERS123456', 'ena_study_accession': None},
        {'subject_id': 'p2', 'site_id': 's2', 'sample_id_from_lab': 'l2', 'dataset_name': 'g2',
         'ena_center_name': 'Center A', 'ena_sample_accession': None, 'ena_study_accession': None},
        {'subject_id': 'p1', 'site_id': 's3', 'sample_id_from_lab': 'l1', 'dataset_name': 'g1',
         'ena_center_name': 'Center B', 'ena_sample_accession': None, 'ena_study_accession': None},
    ]
    got_sample_rows = sorted(database.get_rows_from_table('Sample'), key=itemgetter('site_id'))
    # The rows also have the sample_id, which is made by mysql auto-increment.
    # We don't know the order in which things are added, so can't check the sample_id.
    for row in got_sample_rows:
        del row['sample_id']
    self.assertEqual(expected_sample_rows, got_sample_rows)

    expected_rows = [
        {
            'sequence_replicate_number': 1,
            'original_reads_file_1_md5': 'edc176f367fe8e5a014c819b9ec9b05c',
            'original_reads_file_2_md5': '0dd551a0d76d90059808f6f7ddbb0e02',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'pool_sequence_replicates': 1,
            'withdrawn': 0,
            'import_status': 1,
            'submission_date': datetime.date(2017, 12, 25),
            'submit_to_ena': 0,
            'ena_run_accession': 'ERR123456',
            'ena_on_hold': 0,
            'isolate_number_from_lab': '1',
            'ena_experiment_accession': None,
            'instrument_model': 'Illumina HiSeq 2000',
        },
        {
            'sequence_replicate_number': 1,
            'original_reads_file_1_md5': 'fe5cd28cf9394be14794f0a56a2fe845',
            'original_reads_file_2_md5': 'd026fd9a439294ed42795bd7f1e7df10',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'pool_sequence_replicates': 1,
            'withdrawn': 0,
            'import_status': 1,
            'submission_date': datetime.date(2017, 12, 26),
            'submit_to_ena': 1,
            'ena_run_accession': None,
            'ena_on_hold': 1,
            'isolate_number_from_lab': '1',
            'ena_experiment_accession': None,
            'instrument_model': 'Illumina HiSeq 2000',
        },
        {
            'sequence_replicate_number': 1,
            'original_reads_file_1_md5': 'aa8f077673c158c4f2a19fc3c50e3fa7',
            'original_reads_file_2_md5': 'ae6bafef67da3c26576e799c32985ac9',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'pool_sequence_replicates': 1,
            'withdrawn': 0,
            'import_status': 1,
            'submission_date': datetime.date(2017, 12, 26),
            'submit_to_ena': 1,
            'ena_run_accession': None,
            'ena_on_hold': 1,
            'isolate_number_from_lab': '2',
            'ena_experiment_accession': None,
            'instrument_model': 'Illumina HiSeq 2000',
        },
        {
            'sequence_replicate_number': 1,
            'original_reads_file_1_md5': '6b9a34ed492dad739ac03e084f3b2ab9',
            'original_reads_file_2_md5': '7ceffc5314ff7e305b4ab5bd859850c9',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'pool_sequence_replicates': 1,
            'withdrawn': 0,
            'import_status': 1,
            'submission_date': datetime.date(2017, 12, 25),
            'submit_to_ena': 1,
            'ena_run_accession': None,
            'ena_on_hold': 0,
            'isolate_number_from_lab': '1',
            'ena_experiment_accession': None,
            'instrument_model': 'Illumina HiSeq 2500',
        },
        {
            'sequence_replicate_number': 2,
            'original_reads_file_1_md5': 'ec0377e321c59c0b1b6392a3c6dfc2dc',
            'original_reads_file_2_md5': 'd541ffdb43a0648233ec7408c3626bfd',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'pool_sequence_replicates': 1,
            'withdrawn': 0,
            'import_status': 1,
            'submission_date': datetime.date(2017, 12, 25),
            'submit_to_ena': 1,
            'ena_run_accession': None,
            'ena_on_hold': 0,
            'isolate_number_from_lab': '1',
            'ena_experiment_accession': None,
            'instrument_model': 'Illumina HiSeq 2500',
        },
    ]
    expected_rows.sort(key=itemgetter('original_reads_file_1_md5'))
    query = 'SELECT * FROM (Seqrep JOIN Isolate ON Seqrep.isolate_id = Isolate.isolate_id)'
    got_rows = database.query_to_dict(query)
    got_rows.sort(key=itemgetter('original_reads_file_1_md5'))

    # Check reads files etc got written correctly
    for isolate_data in got_rows:
        iso_dir = isolate_dir.IsolateDir(pipeline_root, isolate_data['sample_id'], isolate_data['isolate_id'])
        self.assertTrue(os.path.exists(iso_dir.reads_dir))
        for i in [1, 2]:
            self.assertTrue(os.path.exists(iso_dir.reads_filename(
                'original', isolate_data['sequence_replicate_number'], i)))

    # Similar to above: we don't know the sample_id, seqrep_id or isolate_id,
    # which are auto-generated, so drop them before comparing (see the helper
    # sketched after this test).
    for row in got_rows:
        del row['sample_id']
        del row['seqrep_id']
        del row['isolate_id']
    self.assertEqual(expected_rows, got_rows)

    shutil.rmtree(pipeline_root)
    nextflow_helper.clean_files()
    database.commit_and_close()
    db_connection.DbConnection(db_ini_file, destroy=True, must_exist=True)
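
# Both database checks above delete auto-generated id columns in place before
# comparing rows against expected values. A minimal sketch of a helper that
# could factor out that pattern without mutating the query results; the name
# rows_without_keys is hypothetical, not part of the existing code:
def rows_without_keys(rows, keys):
    '''Return copies of rows (a list of dicts) with the given keys removed.'''
    return [{k: v for k, v in row.items() if k not in keys} for row in rows]

# Example usage, assuming got_rows as queried in test_nextflow_import:
#     got = rows_without_keys(got_rows, {'sample_id', 'seqrep_id', 'isolate_id'})
#     self.assertEqual(expected_rows, got)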