def test_run(self):
    '''test run'''
    # use the test ini file. But we don't want the db line,
    # because we're making a new db
    ini_file = os.path.join(modules_dir, 'tests', 'data', 'db.ini')

    # in case database already exists
    try:
        db_connection.DbConnection(ini_file, destroy=True)
    except:
        pass

    # throws error because database doesn't exist
    with self.assertRaises(db_connection.Error):
        db_connection.DbConnection(ini_file)

    dbm = db_maker.DbMaker(ini_file)
    dbm.run()

    # We'll just check that the database got created, and the
    # expected tables are in there. OTT to check complete
    # schema, methinks.
    dbc = db_connection.DbConnection(ini_file)
    cursor = dbc.connection.cursor()
    cursor.execute('USE ' + dbc.db)
    cursor.execute('show tables')
    got_tables = list(cursor.fetchall())
    got_tables.sort()
    expected_tables = [(x,) for x in sorted(db_schema.tables)]
    self.assertEqual(expected_tables, got_tables)

    # check version got added
    cursor.execute('SELECT * FROM Version;')
    got_rows = cursor.fetchall()
    expected_rows = [(db_schema.version,)]
    self.assertEqual(expected_rows, got_rows)
    dbc.close()
    db_connection.DbConnection(ini_file, destroy=True)
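# The destroy-then-ignore-errors dance above recurs in several of the tests
# below. A hedged refactoring sketch (the helper name _fresh_test_db is
# hypothetical, not part of the codebase), built only from calls that already
# appear in these tests:
def _fresh_test_db(ini_file):
    '''Destroy the test database if it exists, then rebuild it from the schema.'''
    try:
        db_connection.DbConnection(ini_file, destroy=True)
    except:
        pass  # database did not exist, so there was nothing to destroy
    db_maker.DbMaker(ini_file).run()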
def setUp(self):
    try:
        db_connection.DbConnection(db_ini_file, destroy=True)
    except:
        pass

    dbm = db_maker.DbMaker(db_ini_file)
    dbm.run()
    self.db = db.Db(db_ini_file)
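# A plausible companion tearDown for the fixture above -- a hedged sketch,
# since the original tearDown is not shown here. It reuses only calls that
# appear elsewhere in these tests (commit_and_close, and DbConnection with
# destroy=True, must_exist=True):
def tearDown(self):
    self.db.commit_and_close()
    db_connection.DbConnection(db_ini_file, destroy=True, must_exist=True)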
def setUp(self):
    self.pipeline_root = os.path.abspath('piperoot')
    os.mkdir(self.pipeline_root)

    # in case database already exists
    try:
        db_connection.DbConnection(ini_file, destroy=True)
    except:
        pass

    dbm = db_maker.DbMaker(ini_file)
    dbm.run()
    self.db = db.Db(ini_file)

    # Four sequence replicates: one each for subjects 1 and 2, and two for
    # subject 3.
    sample_dicts = [
        {
            'subject_id': 'subject_1',
            'site_id': '01',
            'lab_id': 'lab_id_1',
            'isolate_number': '1',
            'sequence_replicate_number': 1,
            'submission_date': datetime.date(2018, 4, 4),
            'reads_file_1': 'reads_1_1.fq',
            'reads_file_1_md5': 'md5_1_1',
            'reads_file_2_md5': 'md5_1_2',
            'reads_file_2': 'reads_1_2.fq',
            'dataset_name': 'set1',
            'submit_to_ena': '0',
            'instrument_model': 'Illumina HiSeq 2500',
            'ena_center_name': 'Centre 1',
            'ena_on_hold': '0',
            'ena_run_accession': 'ERR123456',
            'ena_sample_accession': 'ERS123456',
        },
        {
            'subject_id': 'subject_2',
            'site_id': '01',
            'lab_id': 'lab_id_2',
            'isolate_number': '1',
            'sequence_replicate_number': 1,
            'submission_date': datetime.date(2018, 4, 4),
            'reads_file_1': 'reads_2_1.fq',
            'reads_file_1_md5': 'md5_2_1',
            'reads_file_2_md5': 'md5_2_2',
            'reads_file_2': 'reads_2_2.fq',
            'dataset_name': 'set1',
            'submit_to_ena': '0',
            'instrument_model': 'Illumina HiSeq 2500',
            'ena_center_name': 'Centre 1',
            'ena_on_hold': '0',
            'ena_run_accession': 'ERR123457',
            'ena_sample_accession': 'ERS123457',
        },
        {
            'subject_id': 'subject_3',
            'site_id': '02',
            'lab_id': 'lab_id_3',
            'isolate_number': '1',
            'sequence_replicate_number': 1,
            'submission_date': datetime.date(2018, 4, 4),
            'reads_file_1': 'reads_3_1.fq',
            'reads_file_1_md5': 'md5_3_1',
            'reads_file_2_md5': 'md5_3_2',
            'reads_file_2': 'reads_3_2.fq',
            'dataset_name': 'set2',
            'submit_to_ena': '0',
            'instrument_model': 'Illumina HiSeq 2500',
            'ena_center_name': 'Centre 2',
            'ena_on_hold': '0',
            'ena_run_accession': None,
            'ena_sample_accession': None,
        },
        {
            'subject_id': 'subject_3',
            'site_id': '02',
            'lab_id': 'lab_id_3',
            'isolate_number': '1',
            'sequence_replicate_number': 2,
            'submission_date': datetime.date(2018, 4, 4),
            'reads_file_1': 'reads_4_1.fq',
            'reads_file_1_md5': 'md5_4_1',
            'reads_file_2_md5': 'md5_4_2',
            'reads_file_2': 'reads_4_2.fq',
            'dataset_name': 'set2',
            'submit_to_ena': '0',
            'instrument_model': 'Illumina HiSeq 2500',
            'ena_center_name': 'Centre 2',
            'ena_on_hold': '0',
            'ena_run_accession': None,
            'ena_sample_accession': None,
        },
    ]

    for d in sample_dicts:
        self.db.add_one_seqrep(d)
        where_dict = {'original_reads_file_1_md5': d['reads_file_1_md5']}
        update_dict = {
            'remove_contam_reads_file_1_md5': d['reads_file_1_md5'] + '.remove_contam',
            'remove_contam_reads_file_2_md5': d['reads_file_2_md5'] + '.remove_contam',
        }
        self.db.update_row('Seqrep', where_dict, update_dict)

    # Add remove_contam and qc Pipeline rows for each seqrep
    seqrep_to_isolate = {1: 1, 2: 2, 3: 3, 4: 3}

    for seqrep, isolate in seqrep_to_isolate.items():
        ref_id = 1 if seqrep in {1, 2} else 2
        version = '0.1.1' if seqrep in {1, 2} else '0.1.3'
        d = {
            'isolate_id': isolate,
            'seqrep_id': seqrep,
            'seqrep_pool': None,
            'version': version,
            'pipeline_name': 'remove_contam',
            'status': 1,
            'reference_id': ref_id,
        }
        self.db.add_row_to_table('Pipeline', d)
        d = {
            'isolate_id': isolate,
            'seqrep_id': seqrep,
            'seqrep_pool': None,
            'version': version,
            'pipeline_name': 'qc',
            'status': 1,
            'reference_id': ref_id + 2,
        }
        self.db.add_row_to_table('Pipeline', d)

    # Pooled variant_call rows; each also gets a matching mykrobe_predict row
    var_call_rows = [
        {
            'isolate_id': 1,
            'seqrep_id': None,
            'seqrep_pool': '1',
            'version': '1.2.3',
            'pipeline_name': 'variant_call',
            'status': 1,
            'reference_id': 10,
        },
        {
            'isolate_id': 2,
            'seqrep_id': None,
            'seqrep_pool': '2',
            'version': '1.2.3',
            'pipeline_name': 'variant_call',
            'status': 1,
            'reference_id': 10,
        },
        {
            'isolate_id': 3,
            'seqrep_id': None,
            'seqrep_pool': '1_2',
            'version': '1.2.3',
            'pipeline_name': 'variant_call',
            'status': 1,
            'reference_id': 10,
        },
    ]

    for d in var_call_rows:
        self.db.add_row_to_table('Pipeline', d)
        d['pipeline_name'] = 'mykrobe_predict'
        self.db.add_row_to_table('Pipeline', d)

    self.db.commit()
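# For illustration, a hedged sketch of how a test built on this fixture might
# assert on the Pipeline rows inserted above. The test name is hypothetical;
# get_rows_from_table is the same Db helper used by the import test further
# down:
def test_pipeline_rows_added(self):
    got_names = {
        row['pipeline_name']
        for row in self.db.get_rows_from_table('Pipeline')
    }
    expected_names = {'remove_contam', 'qc', 'variant_call', 'mykrobe_predict'}
    self.assertEqual(expected_names, got_names)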
def run(options):
    dbm = db_maker.DbMaker(options.db_config_file)
    dbm.run()
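# run() expects an object with a db_config_file attribute. A minimal,
# hypothetical sketch of wiring it to argparse (the description and argument
# help text below are assumptions for illustration; the real task script may
# differ):
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Make the pipeline database')
    parser.add_argument('db_config_file', help='ini file with database login details')
    run(parser.parse_args())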
def test_nextflow_import(self):
    '''test nextflow_import'''
    nextflow_helper.write_config_file()
    pipeline_root = 'tmp.nextflow_import.pipeline_root'
    os.mkdir(pipeline_root)

    # in case database already exists
    try:
        db_connection.DbConnection(db_ini_file, destroy=True)
    except:
        pass

    dbm = db_maker.DbMaker(db_ini_file)
    dbm.run()

    dropbox_dir = 'tmp.nextflow_import.dropbox'
    shutil.copytree(os.path.join(data_dir, 'dropbox'), dropbox_dir)
    xlsx_archive_dir = 'tmp.nextflow_import.xlsx_archive'
    os.mkdir(xlsx_archive_dir)
    expected_xlsx_files = [
        os.path.basename(x)
        for x in glob.glob(os.path.join(dropbox_dir, '*.xlsx'))
    ]
    nextflow_file = os.path.join(nextflow_helper.nextflow_dir, 'import.nf')
    work_dir = 'tmp.nextflow_import.work'
    dag_file = 'nextflow.import.dag.pdf'
    try:
        os.unlink(dag_file)
    except:
        pass

    command = ' '.join([
        'nextflow run',
        '--dropbox_dir', dropbox_dir,
        '--pipeline_root', pipeline_root,
        '--db_config_file', db_ini_file,
        '--xlsx_archive_dir', xlsx_archive_dir,
        '-with-dag', dag_file,
        '-c', nextflow_helper.config_file,
        '-w', work_dir,
        nextflow_file,
    ])
    utils.syscall(command)
    os.unlink(nextflow_helper.config_file)
    shutil.rmtree(work_dir)

    # All files should be gone from the dropbox
    self.assertEqual([], os.listdir(dropbox_dir))
    shutil.rmtree(dropbox_dir)

    # The two spreadsheets should have been archived
    got_xlsx_files = [
        os.path.basename(x)
        for x in glob.glob(os.path.join(xlsx_archive_dir, '**', '*.xlsx'))
    ]
    self.assertEqual(expected_xlsx_files, got_xlsx_files)
    shutil.rmtree(xlsx_archive_dir)

    # Check database updated correctly
    database = db.Db(db_ini_file)
    expected_sample_rows = [
        {
            'subject_id': 'p1',
            'site_id': 's1',
            'sample_id_from_lab': 'l1',
            'dataset_name': 'g1',
            'ena_center_name': 'Center A',
            'ena_sample_accession': 'ERS123456',
            'ena_study_accession': None,
        },
        {
            'subject_id': 'p2',
            'site_id': 's2',
            'sample_id_from_lab': 'l2',
            'dataset_name': 'g2',
            'ena_center_name': 'Center A',
            'ena_sample_accession': None,
            'ena_study_accession': None,
        },
        {
            'subject_id': 'p1',
            'site_id': 's3',
            'sample_id_from_lab': 'l1',
            'dataset_name': 'g1',
            'ena_center_name': 'Center B',
            'ena_sample_accession': None,
            'ena_study_accession': None,
        },
    ]
    got_sample_rows = sorted(
        database.get_rows_from_table('Sample'),
        key=itemgetter('site_id'),
    )
    # The rows also have the sample_id, which is made by the mysql auto
    # increment. We don't know the order in which things are added, so we
    # can't check the sample_id.
    for row in got_sample_rows:
        del row['sample_id']

    self.assertEqual(expected_sample_rows, got_sample_rows)

    expected_rows = [
        {
            'sequence_replicate_number': 1,
            'original_reads_file_1_md5': 'edc176f367fe8e5a014c819b9ec9b05c',
            'original_reads_file_2_md5': '0dd551a0d76d90059808f6f7ddbb0e02',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'pool_sequence_replicates': 1,
            'withdrawn': 0,
            'import_status': 1,
            'submission_date': datetime.date(2017, 12, 25),
            'submit_to_ena': 0,
            'ena_run_accession': 'ERR123456',
            'ena_on_hold': 0,
            'isolate_number_from_lab': '1',
            'ena_experiment_accession': None,
            'instrument_model': 'Illumina HiSeq 2000',
        },
        {
            'sequence_replicate_number': 1,
            'original_reads_file_1_md5': 'fe5cd28cf9394be14794f0a56a2fe845',
            'original_reads_file_2_md5': 'd026fd9a439294ed42795bd7f1e7df10',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'pool_sequence_replicates': 1,
            'withdrawn': 0,
            'import_status': 1,
            'submission_date': datetime.date(2017, 12, 26),
            'submit_to_ena': 1,
            'ena_run_accession': None,
            'ena_on_hold': 1,
            'isolate_number_from_lab': '1',
            'ena_experiment_accession': None,
            'instrument_model': 'Illumina HiSeq 2000',
        },
        {
            'sequence_replicate_number': 1,
            'original_reads_file_1_md5': 'aa8f077673c158c4f2a19fc3c50e3fa7',
            'original_reads_file_2_md5': 'ae6bafef67da3c26576e799c32985ac9',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'pool_sequence_replicates': 1,
            'withdrawn': 0,
            'import_status': 1,
            'submission_date': datetime.date(2017, 12, 26),
            'submit_to_ena': 1,
            'ena_run_accession': None,
            'ena_on_hold': 1,
            'isolate_number_from_lab': '2',
            'ena_experiment_accession': None,
            'instrument_model': 'Illumina HiSeq 2000',
        },
        {
            'sequence_replicate_number': 1,
            'original_reads_file_1_md5': '6b9a34ed492dad739ac03e084f3b2ab9',
            'original_reads_file_2_md5': '7ceffc5314ff7e305b4ab5bd859850c9',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'pool_sequence_replicates': 1,
            'withdrawn': 0,
            'import_status': 1,
            'submission_date': datetime.date(2017, 12, 25),
            'submit_to_ena': 1,
            'ena_run_accession': None,
            'ena_on_hold': 0,
            'isolate_number_from_lab': '1',
            'ena_experiment_accession': None,
            'instrument_model': 'Illumina HiSeq 2500',
        },
        {
            'sequence_replicate_number': 2,
            'original_reads_file_1_md5': 'ec0377e321c59c0b1b6392a3c6dfc2dc',
            'original_reads_file_2_md5': 'd541ffdb43a0648233ec7408c3626bfd',
            'remove_contam_reads_file_1_md5': None,
            'remove_contam_reads_file_2_md5': None,
            'pool_sequence_replicates': 1,
            'withdrawn': 0,
            'import_status': 1,
            'submission_date': datetime.date(2017, 12, 25),
            'submit_to_ena': 1,
            'ena_run_accession': None,
            'ena_on_hold': 0,
            'isolate_number_from_lab': '1',
            'ena_experiment_accession': None,
            'instrument_model': 'Illumina HiSeq 2500',
        },
    ]
    expected_rows.sort(key=itemgetter('original_reads_file_1_md5'))

    query = 'SELECT * FROM (Seqrep JOIN Isolate ON Seqrep.isolate_id = Isolate.isolate_id)'
    got_rows = database.query_to_dict(query)
    got_rows.sort(key=itemgetter('original_reads_file_1_md5'))

    # Check reads files etc written correctly
    for isolate_data in got_rows:
        iso_dir = isolate_dir.IsolateDir(
            pipeline_root,
            isolate_data['sample_id'],
            isolate_data['isolate_id'],
        )
        self.assertTrue(os.path.exists(iso_dir.reads_dir))
        for i in [1, 2]:
            self.assertTrue(os.path.exists(iso_dir.reads_filename(
                'original', isolate_data['sequence_replicate_number'], i)))

    # Similar to above: we don't know the sample_id, seqrep_id, or
    # isolate_id, which are auto-generated.
    for row in got_rows:
        del row['sample_id']
        del row['seqrep_id']
        del row['isolate_id']

    self.assertEqual(expected_rows, got_rows)
    shutil.rmtree(pipeline_root)
    nextflow_helper.clean_files()
    database.commit_and_close()
    db_connection.DbConnection(db_ini_file, destroy=True, must_exist=True)
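# The nextflow command assembled in the test above, extracted as a hedged
# standalone sketch. The flags are taken verbatim from the test; the function
# and its argument names are hypothetical. Passing a list to subprocess avoids
# the shell-quoting pitfalls of the ' '.join approach used in the test:
import subprocess

def run_import_pipeline(dropbox_dir, pipeline_root, db_config_file,
                        xlsx_archive_dir, nextflow_file, config_file, work_dir):
    '''Run the import.nf pipeline once, outside the test harness.'''
    subprocess.check_call([
        'nextflow', 'run',
        '--dropbox_dir', dropbox_dir,
        '--pipeline_root', pipeline_root,
        '--db_config_file', db_config_file,
        '--xlsx_archive_dir', xlsx_archive_dir,
        '-c', config_file,
        '-w', work_dir,
        nextflow_file,
    ])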