def test_import_project(self):
    """import_project: check project is imported
    """
    def assert_newproj(autoprocess, present):
        # 'NewProj' must be uniformly present (or absent) in both
        # project listings and as a directory under the analysis dir.
        # assertIn/assertNotIn report the actual names on failure.
        check = self.assertIn if present else self.assertNotIn
        check('NewProj',
              [p.name for p in autoprocess.get_analysis_projects()])
        check('NewProj',
              [p.name
               for p in autoprocess.get_analysis_projects_from_dirs()])
        self.assertEqual(
            os.path.exists(
                os.path.join(autoprocess.analysis_dir, 'NewProj')),
            present)

    # Make an auto-process directory
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_M00879_0087_000000000-AGEW9',
        'miseq',
        top_dir=self.dirn)
    mockdir.create()
    # Check that the project is not currently present
    ap = AutoProcess(mockdir.dirn)
    assert_newproj(ap, False)
    # Import the project
    import_project(ap, self.new_project_dir)
    assert_newproj(ap, True)
    # Verify via a fresh AutoProcess object, i.e. the import is
    # visible when the analysis directory is reloaded
    ap2 = AutoProcess(mockdir.dirn)
    assert_newproj(ap2, True)
def test_analysis_dir_path(self):
    """AutoProcess: analysis dir path is absolute and normalized
    """
    # Mock Illumina run directory
    MockIlluminaRun(
        '160621_M00879_0087_000000000-AGEW9',
        'miseq',
        top_dir=self.dirn).create()
    # An AutoProcess instance with no directory has no analysis dir
    self.assertEqual(AutoProcess().analysis_dir, None)
    # Mock analysis directory
    MockAnalysisDirFactory.bcl2fastq2(
        '160621_M00879_0087_000000000-AGEW9',
        'miseq',
        top_dir=self.dirn).create()
    # Build the same location as absolute/relative and
    # normalised/unnormalised paths
    rel_path = "160621_M00879_0087_000000000-AGEW9_analysis"
    abs_path = os.path.join(self.dirn, rel_path)
    rel_unnormalised = os.path.join(
        "..", os.path.basename(self.dirn), rel_path)
    abs_unnormalised = os.path.join(self.dirn, rel_unnormalised)
    # Whatever form is supplied, the stored value should be the
    # absolute normalised path
    for supplied in (abs_path,
                     rel_path,
                     abs_unnormalised,
                     rel_unnormalised):
        ap = AutoProcess(analysis_dir=supplied)
        self.assertEqual(ap.analysis_dir, abs_path)
def test_import_project_with_qc(self):
    """import_project: check project with QC outputs is imported
    """
    def assert_newproj(autoprocess, present):
        # 'NewProj' must be uniformly present (or absent) in both
        # project listings and as a directory under the analysis dir.
        # assertIn/assertNotIn report the actual names on failure.
        check = self.assertIn if present else self.assertNotIn
        check('NewProj',
              [p.name for p in autoprocess.get_analysis_projects()])
        check('NewProj',
              [p.name
               for p in autoprocess.get_analysis_projects_from_dirs()])
        self.assertEqual(
            os.path.exists(
                os.path.join(autoprocess.analysis_dir, 'NewProj')),
            present)

    # Make mock multiqc and put it on the PATH
    MockMultiQC.create(os.path.join(self.bin, "multiqc"))
    os.environ['PATH'] = "%s:%s" % (self.bin, os.environ['PATH'])
    # Make an auto-process directory
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_M00879_0087_000000000-AGEW9',
        'miseq',
        top_dir=self.dirn)
    mockdir.create()
    # Add QC outputs (without MultiQC) to the project to be imported
    UpdateAnalysisProject(
        AnalysisProject('NewProj', self.new_project_dir)
    ).add_qc_outputs(include_multiqc=False)
    print(os.listdir(os.path.join(self.dirn, 'NewProj')))
    # Check that the project is not currently present
    ap = AutoProcess(mockdir.dirn)
    assert_newproj(ap, False)
    # Import the project
    import_project(ap, self.new_project_dir)
    assert_newproj(ap, True)
    # Verify via a fresh AutoProcess object
    ap2 = AutoProcess(mockdir.dirn)
    assert_newproj(ap2, True)
    # Check for QC report, MultiQC report and ZIP file
    print(os.listdir(os.path.join(ap2.analysis_dir, 'NewProj')))
    for name in (
            "qc_report.html",
            "multiqc_report.html",
            "qc_report.NewProj.160621_M00879_0087_000000000-AGEW9.zip",
    ):
        # Keep the file name and the full path in separate variables
        # rather than rebinding the loop variable
        path = os.path.join(ap2.analysis_dir, 'NewProj', name)
        self.assertTrue(os.path.exists(path), "Missing %s" % path)
def test_make_fastqs_unknown_platform(self):
    """make_fastqs: unknown platform raises exception
    """
    # Mock source data: the instrument name in the run ID is 'UNKNOWN'
    MockIlluminaRun(
        "171020_UNKNOWN_00002_AHGXXXX",
        "miseq",
        top_dir=self.wd).create()
    # Mock bcl2fastq executable, made available via the PATH
    bcl2fastq_exe = os.path.join(self.bin, "bcl2fastq")
    MockBcl2fastq2Exe.create(bcl2fastq_exe)
    os.environ['PATH'] = "%s:%s" % (self.bin, os.environ['PATH'])
    # Set up the analysis and check the initial parameters
    ap = AutoProcess(settings=self.settings)
    ap.setup(os.path.join(self.wd, "171020_UNKNOWN_00002_AHGXXXX"))
    params = ap.params
    self.assertTrue(params.sample_sheet is not None)
    self.assertEqual(params.bases_mask, "auto")
    self.assertTrue(params.primary_data_dir is None)
    self.assertFalse(params.acquired_primary_data)
    # Fastq generation is expected to fail
    with self.assertRaises(Exception):
        make_fastqs(ap, protocol="standard")
def test_make_fastqs_handle_bcl2fastq2_failure(self):
    """make_fastqs: handle bcl2fastq2 failure
    """
    # Create mock source data
    illumina_run = MockIlluminaRun(
        "171020_M00879_00002_AHGXXXX",
        "miseq",
        top_dir=self.wd)
    illumina_run.create()
    # Create mock bcl2fastq which will fail (i.e.
    # return non-zero exit code)
    MockBcl2fastq2Exe.create(os.path.join(self.bin, "bcl2fastq"),
                             exit_code=1)
    os.environ['PATH'] = "%s:%s" % (self.bin, os.environ['PATH'])
    # Do the test
    ap = AutoProcess()
    ap.setup(os.path.join(self.wd, "171020_M00879_00002_AHGXXXX"))
    self.assertTrue(ap.params.sample_sheet is not None)
    self.assertRaises(Exception,
                      ap.make_fastqs,
                      protocol="standard")
    # Check outputs: the working subdirectories should still exist
    analysis_dir = os.path.join(
        self.wd, "171020_M00879_00002_AHGXXXX_analysis")
    for subdir in (os.path.join("primary_data",
                                "171020_M00879_00002_AHGXXXX"),
                   os.path.join("logs", "002_make_fastqs"),
                   "bcl2fastq"):
        self.assertTrue(os.path.isdir(os.path.join(analysis_dir, subdir)),
                        "Missing subdir: %s" % subdir)
    # ...but none of the statistics/report files should be present.
    # The assertion fails when a file *exists*, so the message must
    # say so (it previously read "Missing file").
    for filen in ("statistics.info",
                  "statistics_full.info",
                  "per_lane_statistics.info",
                  "per_lane_sample_stats.info",
                  "projects.info",
                  "processing_qc.html"):
        self.assertFalse(os.path.exists(os.path.join(analysis_dir, filen)),
                         "Unexpected file: %s" % filen)
def test_ignore_commented_projects(self):
    """AutoProcess.get_analysis_projects: ignore commented projects
    """
    # Make an auto-process directory
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_K00879_0087_000000000-AGEW9',
        'hiseq',
        metadata={"run_number": 87,
                  "source": "local"},
        top_dir=self.dirn)
    mockdir.create()
    # Rewrite projects.info with the 'AB' project commented out.
    # Row breaks are written as explicit '\n' so that each project
    # sits on its own line regardless of source formatting.
    projects_info = os.path.join(mockdir.dirn, "projects.info")
    with open(projects_info, "w") as fp:
        fp.write(
            "#Project\tSamples\tUser\tLibrary\tSC_Platform\tOrganism\tPI\tComments\n"
            "#AB\tAB1,AB2\tAlan Brown\tRNA-seq\t.\tHuman\tAudrey Benson\t1% PhiX\n"
            "CDE\tCDE3,CDE4\tClive David Edwards\tChIP-seq\t.\tMouse\tClaudia Divine Eccleston\t1% PhiX\n")
    # List the projects
    projects = AutoProcess(mockdir.dirn).get_analysis_projects()
    expected = ('CDE', 'undetermined')
    self.assertEqual(len(projects), len(expected))
    for p in projects:
        self.assertIsInstance(p, AnalysisProject)
        self.assertIn(p.name, expected)
    # Each expected project must appear exactly once
    for name in expected:
        matched_projects = [x for x in projects if x.name == name]
        self.assertEqual(len(matched_projects), 1)
def test_update_project_metadata_file_uncomment_existing_project(self):
    """
    AutoProcess.update_project_metadata_file: existing project is uncommented
    """
    # Make an auto-process directory
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_K00879_0087_000000000-AGEW9',
        'hiseq',
        metadata={"run_number": 87,
                  "source": "local"},
        top_dir=self.dirn)
    mockdir.create(no_project_dirs=True)
    projects_info = os.path.join(mockdir.dirn, "projects.info")
    # Create projects.info file with one project listed but
    # commented out
    with open(projects_info, 'wt') as fp:
        fp.write(
            "#Project\tSamples\tUser\tLibrary\tSC_Platform\tOrganism\tPI\tComments\n#CDE\tCDE3,CDE4\t.\t.\t.\t.\t.\tKeep me"
        )
    # Update the projects.info file
    AutoProcess(mockdir.dirn).update_project_metadata_file()
    # Check output - 'CDE' is uncommented with its comment kept, and
    # 'AB' is added. Row breaks are explicit '\n', matching the
    # escapes used for the input above.
    with open(projects_info, 'rt') as fp:
        self.assertEqual(
            fp.read(),
            "#Project\tSamples\tUser\tLibrary\tSC_Platform\tOrganism\tPI\tComments\n"
            "AB\tAB1,AB2\t.\t.\t.\t.\t.\t.\n"
            "CDE\tCDE3,CDE4\t.\t.\t.\t.\t.\tKeep me\n")
def test_update_project_metadata_file_missing_from_bcl2fastq_output(self):
    """
    AutoProcess.update_project_metadata_file: make missing file and populate from bcl2fastq output
    """
    # Make an auto-process directory
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_K00879_0087_000000000-AGEW9',
        'hiseq',
        metadata={"run_number": 87,
                  "source": "local"},
        top_dir=self.dirn)
    mockdir.create(no_project_dirs=True)
    projects_info = os.path.join(mockdir.dirn, "projects.info")
    # Remove projects.info file
    os.remove(projects_info)
    # Update the projects.info file
    AutoProcess(mockdir.dirn).update_project_metadata_file()
    # Check output: the file is recreated with one row per project.
    # Row breaks are explicit '\n' so the expected text does not
    # depend on source formatting.
    with open(projects_info, 'rt') as fp:
        self.assertEqual(
            fp.read(),
            "#Project\tSamples\tUser\tLibrary\tSC_Platform\tOrganism\tPI\tComments\n"
            "AB\tAB1,AB2\t.\t.\t.\t.\t.\t.\n"
            "CDE\tCDE3,CDE4\t.\t.\t.\t.\t.\t.\n")
def test_make_project_metadata_file_no_bcl2fastq_output(self):
    """
    AutoProcess.make_project_metadata_file: new 'projects.info' (no bcl2fastq output)
    """
    # Make an auto-process directory
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_K00879_0087_000000000-AGEW9',
        'hiseq',
        metadata={"run_number": 87,
                  "source": "local"},
        top_dir=self.dirn)
    mockdir.create(no_project_dirs=True)
    projects_info = os.path.join(mockdir.dirn, "projects.info")
    # Remove the projects.info file and the bcl2fastq output dir
    os.remove(projects_info)
    shutil.rmtree(os.path.join(mockdir.dirn, "bcl2fastq"))
    # Create a new projects.info file
    AutoProcess(mockdir.dirn).make_project_metadata_file()
    # Check outputs: only the header line is expected, terminated
    # by an explicit '\n'
    self.assertTrue(os.path.exists(projects_info))
    with open(projects_info, 'rt') as fp:
        self.assertEqual(
            fp.read(),
            "#Project\tSamples\tUser\tLibrary\tSC_Platform\tOrganism\tPI\tComments\n")
def test_update_fastq_stats(self):
    """update_fastq_stats: generates statistics files
    """
    # Auto-process directory without project dirs
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '190104_M00879_0087_000000000-AGEW9',
        'miseq',
        metadata={"instrument_datestamp": "190104"},
        top_dir=self.wd)
    mockdir.create(no_project_dirs=True)

    def stats_file_exists(name):
        # True if the named file is present in the analysis dir
        return os.path.exists(os.path.join(mockdir.dirn, name))

    # Names of the statistics files
    stats_files = ("statistics.info",
                   "statistics_full.info",
                   "per_lane_statistics.info",
                   "per_lane_sample_stats.info",)
    # None of them should exist beforehand
    for name in stats_files:
        self.assertFalse(stats_file_exists(name),
                         "%s: file exists, but shouldn't" % name)
    # Generate the stats
    update_fastq_stats(AutoProcess(mockdir.dirn))
    # All of them should exist afterwards
    for name in stats_files:
        self.assertTrue(stats_file_exists(name),
                        "%s: missing" % name)
def test_analyse_barcodes_with_stored_bases_mask(self):
    """analyse_barcodes: test with stored bases mask
    """
    # Make an auto-process directory
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_M00879_0087_000000000-AGEW9',
        'miseq',
        bases_mask='y76,I6,y76',
        metadata={"instrument_datestamp": "160621"},
        top_dir=self.wd)
    mockdir.create(no_project_dirs=True)
    # Add data to Fastq files
    self._insert_fastq_reads(mockdir.dirn)
    # Populate the samplesheet. Each CSV record is terminated by an
    # explicit '\n' so the section marker, header and sample rows
    # are on separate lines regardless of source formatting.
    sample_sheet = os.path.join(mockdir.dirn, "custom_SampleSheet.csv")
    with open(sample_sheet, 'w') as fp:
        fp.write(
            "[Data]\n"
            "Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description\n"
            "AB1,AB1,,,D701,CGTGTAGG,D501,GACCTGAA,AB,\n"
            "AB2,AB2,,,D702,CGTGTAGG,D501,ATGTAACT,AB,\n"
            "CDE3,CDE3,,,D701,GACCTGAA,D501,CGTGTAGG,CDE,\n"
            "CDE4,CDE4,,,D702,ATGTAACT,D501,CGTGTAGG,CDE,\n")
    # Analyse barcodes
    ap = AutoProcess(mockdir.dirn, settings=self.settings)
    analyse_barcodes(ap)
    # Check outputs
    analysis_dir = os.path.join(
        self.wd, "160621_M00879_0087_000000000-AGEW9_analysis")
    self.assertTrue(
        os.path.isdir(os.path.join(analysis_dir, "barcode_analysis")),
        "Missing dir: barcode_analysis")
    self.assertTrue(
        os.path.isdir(
            os.path.join(analysis_dir, "barcode_analysis", "counts")),
        "Missing dir: barcode_analysis/counts")
    # One counts file per Fastq
    for f in ("AB.AB1_S1_R1_001.fastq.gz.counts",
              "AB.AB2_S2_R1_001.fastq.gz.counts",
              "CDE.CDE3_S3_R1_001.fastq.gz.counts",
              "CDE.CDE4_S4_R1_001.fastq.gz.counts",
              "undetermined.Undetermined_S0_R1_001.fastq.gz.counts"):
        self.assertTrue(
            os.path.isfile(
                os.path.join(
                    analysis_dir, "barcode_analysis", "counts", f)),
            "Missing file: %s" % f)
    # Report files
    self.assertTrue(
        os.path.isfile(
            os.path.join(
                analysis_dir, "barcode_analysis", "barcodes.report")),
        "Missing file: barcodes.report")
    self.assertTrue(
        os.path.isfile(
            os.path.join(
                analysis_dir, "barcode_analysis", "barcodes.xls")),
        "Missing file: barcodes.xls")
    self.assertTrue(
        os.path.isfile(
            os.path.join(
                analysis_dir, "barcode_analysis", "barcodes.html")),
        "Missing file: barcodes.html")
def test_publish_qc_with_projects_with_multiple_fastq_sets(self):
    """publish_qc: projects with multiple Fastq sets
    """
    # Auto-process directory
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_K00879_0087_000000000-AGEW9',
        'hiseq',
        metadata={"run_number": 87,
                  "source": "local",
                  "instrument_datestamp": "160621"},
        top_dir=self.dirn)
    mockdir.create()
    ap = AutoProcess(mockdir.dirn)
    # Processing report plus QC outputs for every project
    UpdateAnalysisDir(ap).add_processing_report()
    for project in ap.get_analysis_projects():
        UpdateAnalysisProject(project).add_qc_outputs()
    # Give the first project a second Fastq set with its own QC
    multi_fastqs_project = ap.get_analysis_projects()[0]
    UpdateAnalysisProject(multi_fastqs_project).add_fastq_set(
        "fastqs.extra",
        ("Alt1.r1.fastq.gz", "Alt2.r1.fastq.gz"))
    UpdateAnalysisProject(multi_fastqs_project).add_qc_outputs(
        fastq_set="fastqs.extra",
        qc_dir="qc.extra")
    # Mock publication area
    publication_dir = os.path.join(self.dirn, 'QC')
    os.mkdir(publication_dir)
    # Publish
    publish_qc(ap, location=publication_dir)
    # Assemble the list of expected outputs
    analysis_name = os.path.basename(ap.analysis_dir)
    outputs = ["index.html", "processing_qc.html"]
    for project in ap.get_analysis_projects():
        # Standard QC outputs and MultiQC report
        project_qc = "qc_report.%s.%s" % (project.name, analysis_name)
        outputs.extend([
            project_qc,
            "%s.zip" % project_qc,
            os.path.join(project_qc, "qc_report.html"),
            os.path.join(project_qc, "qc"),
            "multiqc_report.%s.html" % project.name,
        ])
    # Additional QC and MultiQC report for the extra Fastq set
    extra_qc = "qc.extra_report.%s.%s" % (multi_fastqs_project.name,
                                          analysis_name)
    outputs.extend([
        extra_qc,
        "%s.zip" % extra_qc,
        os.path.join(extra_qc, "qc.extra_report.html"),
        os.path.join(extra_qc, "qc.extra"),
        "multiqc.extra_report.%s.html" % multi_fastqs_project.name,
    ])
    # Everything should exist under the published analysis dir
    for item in outputs:
        f = os.path.join(publication_dir,
                         "160621_K00879_0087_000000000-AGEW9_analysis",
                         item)
        self.assertTrue(os.path.exists(f), "Missing %s" % f)
def test_report_concise(self):
    """report: report run in 'concise' mode
    """
    # Make a mock auto-process directory. The "User" values must be
    # the names which appear in the expected report below (they were
    # previously masked as "******", contradicting the expected
    # text).
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '170901_M00879_0087_000000000-AGEW9',
        'miseq',
        metadata={"source": "testing",
                  "run_number": 87,
                  "assay": "Nextera"},
        project_metadata={
            "AB": {"User": "Alison Bell",
                   "Library type": "RNA-seq",
                   "Organism": "Human",
                   "PI": "Audrey Bower"},
            "CDE": {"User": "Charles David Edwards",
                    "Library type": "ChIP-seq",
                    "Organism": "Mouse",
                    "PI": "Colin Delaney Eccleston"}
        },
        top_dir=self.dirn)
    mockdir.create()
    # Make autoprocess instance
    ap = AutoProcess(analysis_dir=mockdir.dirn)
    # Generate concise report
    self.assertEqual(
        report_concise(ap),
        "Paired end: 'AB': Alison Bell, Human RNA-seq (PI: Audrey Bower) (2 samples); 'CDE': Charles David Edwards, Mouse ChIP-seq (PI: Colin Delaney Eccleston) (2 samples)"
    )
def test_report_info_no_projects(self):
    """report: report run with no projects in 'info' mode
    """
    # Mock auto-process directory without project dirs
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '170901_M00879_0087_000000000-AGEW9',
        'miseq',
        metadata={"source": "testing",
                  "run_number": 87,
                  "assay": "Nextera"},
        top_dir=self.dirn)
    mockdir.create(no_project_dirs=True)
    # Autoprocess instance
    ap = AutoProcess(analysis_dir=mockdir.dirn)
    # Expected 'info' report, with the analysis dir substituted in
    # NOTE(review): the comparison below is line-by-line, so the line
    # breaks and spacing inside this literal are significant - confirm
    # them against the actual report_info output
    expected_template = """Run reference: MISEQ_170901#87 Directory : %s Platform : miseq Unaligned dir: bcl2fastq Summary of data in 'bcl2fastq' dir: - AB: AB1-2 (2 paired end samples) - CDE: CDE3-4 (2 paired end samples) No analysis projects found"""
    expected = expected_template % mockdir.dirn
    # Compare the generated report against the expected text, one
    # line at a time
    actual_lines = report_info(ap).split('\n')
    expected_lines = expected.split('\n')
    for actual, wanted in zip(actual_lines, expected_lines):
        self.assertEqual(actual, wanted)
def test_make_fastqs_icell8_protocol(self):
    """make_fastqs: icell8 protocol
    """
    # Mock source data
    MockIlluminaRun(
        "171020_SN7001250_00002_AHGXXXX",
        "hiseq",
        top_dir=self.wd).create()
    # Mock bcl2fastq, which also asserts the bases mask it is given
    MockBcl2fastq2Exe.create(
        os.path.join(self.bin, "bcl2fastq"),
        assert_bases_mask="y25n76,I8,I8,y101")
    os.environ['PATH'] = "%s:%s" % (self.bin, os.environ['PATH'])
    # Set up and run Fastq generation
    ap = AutoProcess()
    ap.setup(os.path.join(self.wd, "171020_SN7001250_00002_AHGXXXX"))
    self.assertTrue(ap.params.sample_sheet is not None)
    ap.make_fastqs(protocol="icell8")
    # Expected subdirectories and files in the analysis dir
    analysis_dir = os.path.join(
        self.wd, "171020_SN7001250_00002_AHGXXXX_analysis")
    expected_subdirs = (
        os.path.join("primary_data", "171020_SN7001250_00002_AHGXXXX"),
        os.path.join("logs", "002_make_fastqs_icell8"),
        "bcl2fastq",
    )
    expected_files = (
        "statistics.info",
        "statistics_full.info",
        "per_lane_statistics.info",
        "per_lane_sample_stats.info",
        "projects.info",
        "processing_qc.html",
    )
    for subdir in expected_subdirs:
        subdir_path = os.path.join(analysis_dir, subdir)
        self.assertTrue(os.path.isdir(subdir_path),
                        "Missing subdir: %s" % subdir)
    for filen in expected_files:
        file_path = os.path.join(analysis_dir, filen)
        self.assertTrue(os.path.isfile(file_path),
                        "Missing file: %s" % filen)
def test_make_fastqs_specify_platform_via_metadata(self):
    """make_fastqs: implicitly specify the platform via metadata
    """
    # Mock source data: the instrument name in the run ID is 'UNKNOWN'
    MockIlluminaRun(
        "171020_UNKNOWN_00002_AHGXXXX",
        "miseq",
        top_dir=self.wd).create()
    # Mock bcl2fastq
    MockBcl2fastq2Exe.create(
        os.path.join(self.bin, "bcl2fastq"),
        platform="miseq")
    os.environ['PATH'] = "%s:%s" % (self.bin, os.environ['PATH'])
    # Set up; no platform is recorded initially
    ap = AutoProcess()
    ap.setup(os.path.join(self.wd, "171020_UNKNOWN_00002_AHGXXXX"))
    self.assertTrue(ap.params.sample_sheet is not None)
    self.assertTrue(ap.metadata.platform is None)
    # Supply the platform through the metadata, then make Fastqs
    ap.metadata["platform"] = "miseq"
    ap.make_fastqs(protocol="standard")
    # Expected subdirectories and files in the analysis dir
    analysis_dir = os.path.join(
        self.wd, "171020_UNKNOWN_00002_AHGXXXX_analysis")
    expected_subdirs = (
        os.path.join("primary_data", "171020_UNKNOWN_00002_AHGXXXX"),
        os.path.join("logs", "002_make_fastqs"),
        "bcl2fastq",
    )
    expected_files = (
        "statistics.info",
        "statistics_full.info",
        "per_lane_statistics.info",
        "per_lane_sample_stats.info",
        "projects.info",
        "processing_qc.html",
    )
    for subdir in expected_subdirs:
        subdir_path = os.path.join(analysis_dir, subdir)
        self.assertTrue(os.path.isdir(subdir_path),
                        "Missing subdir: %s" % subdir)
    for filen in expected_files:
        file_path = os.path.join(analysis_dir, filen)
        self.assertTrue(os.path.isfile(file_path),
                        "Missing file: %s" % filen)
def test_publish_qc_missing_destination(self):
    """publish_qc: raise exception if destination doesn't exist
    """
    # Auto-process directory
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_K00879_0087_000000000-AGEW9',
        'hiseq',
        metadata={"run_number": 87,
                  "source": "local",
                  "instrument_datestamp": "160621"},
        top_dir=self.dirn)
    mockdir.create()
    ap = AutoProcess(mockdir.dirn, settings=self.settings)
    # Processing report plus QC outputs for every project
    UpdateAnalysisDir(ap).add_processing_report()
    for project in ap.get_analysis_projects():
        UpdateAnalysisProject(project).add_qc_outputs()
    # Publication area which is deliberately not created
    publication_dir = os.path.join(self.dirn, 'QC')
    self.assertFalse(os.path.exists(publication_dir))
    # Publishing should fail, and must not create the destination
    with self.assertRaises(Exception):
        publish_qc(ap, location=publication_dir)
    self.assertFalse(os.path.exists(publication_dir))
def test_publish_qc_processing_qc(self):
    """publish_qc: processing QC report only
    """
    # Auto-process directory without project dirs
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_K00879_0087_000000000-AGEW9',
        'hiseq',
        metadata={"run_number": 87,
                  "source": "local",
                  "instrument_datestamp": "160621"},
        top_dir=self.dirn)
    mockdir.create(no_project_dirs=True)
    ap = AutoProcess(mockdir.dirn, settings=self.settings)
    # Only a processing report is added
    UpdateAnalysisDir(ap).add_processing_report()
    # Mock publication area
    publication_dir = os.path.join(self.dirn, 'QC')
    os.mkdir(publication_dir)
    # Publish QC
    publish_qc(ap, location=publication_dir)
    # The index and processing report should have been published
    for item in ("index.html", "processing_qc.html"):
        published = os.path.join(
            publication_dir,
            "160621_K00879_0087_000000000-AGEW9_analysis",
            item)
        self.assertTrue(os.path.exists(published),
                        "Missing %s" % published)
def test_publish_qc_with_project_missing_qc(self):
    """publish_qc: raises exception if project has missing QC
    """
    # Auto-process directory
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_K00879_0087_000000000-AGEW9',
        'hiseq',
        metadata={"run_number": 87,
                  "source": "local",
                  "instrument_datestamp": "160621"},
        top_dir=self.dirn)
    mockdir.create()
    ap = AutoProcess(mockdir.dirn, settings=self.settings)
    # Processing report
    UpdateAnalysisDir(ap).add_processing_report()
    # QC outputs for all projects except the first, which is
    # left without QC
    projects_with_qc = ap.get_analysis_projects()[1:]
    for project in projects_with_qc:
        UpdateAnalysisProject(project).add_qc_outputs()
    # Mock publication area
    publication_dir = os.path.join(self.dirn, 'QC')
    os.mkdir(publication_dir)
    # Publishing should fail
    with self.assertRaises(Exception):
        publish_qc(ap, location=publication_dir)
def test_publish_qc_with_cellranger_qc_multiple_lanes_subsets(self):
    """publish_qc: publish cellranger QC output (multiple subsets of lanes)
    """
    # Auto-process directory without project dirs
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_K00879_0087_000000000-AGEW9',
        'hiseq',
        metadata={"run_number": 87,
                  "source": "local",
                  "instrument_datestamp": "160621"},
        top_dir=self.dirn)
    mockdir.create(no_project_dirs=True)
    ap = AutoProcess(mockdir.dirn, settings=self.settings)
    # Processing report, plus cellranger QC output for two
    # different lane subsets
    UpdateAnalysisDir(ap).add_processing_report()
    UpdateAnalysisDir(ap).add_cellranger_qc_output(lanes="45")
    UpdateAnalysisDir(ap).add_cellranger_qc_output(lanes="78")
    # Mock publication area
    publication_dir = os.path.join(self.dirn, 'QC')
    os.mkdir(publication_dir)
    # Publish
    publish_qc(ap, location=publication_dir)
    # One cellranger summary per lane subset is expected alongside
    # the index and processing report
    published_dir = os.path.join(
        publication_dir,
        "160621_K00879_0087_000000000-AGEW9_analysis")
    for item in ["index.html",
                 "processing_qc.html",
                 "cellranger_qc_summary_45.html",
                 "cellranger_qc_summary_78.html"]:
        f = os.path.join(published_dir, item)
        self.assertTrue(os.path.exists(f), "Missing %s" % f)
def test_bcl2fastq2_can_be_loaded_after_rsync(self):
    """
    merge_fastq_dirs: rsynced bcl2fastq v2 output can be loaded
    """
    analysis_dir = self._setup_bcl2fastq2()
    # Merge the unaligned dirs into 'bcl2fastq'
    self.ap = AutoProcess(analysis_dir, settings=self.settings)
    merge_fastq_dirs(self.ap,
                     "bcl2fastq.lanes1-2",
                     output_dir="bcl2fastq")
    # The merged directory should exist and be loadable
    merged_dir = os.path.join(analysis_dir, 'bcl2fastq')
    self._assert_dir_exists(merged_dir)
    try:
        IlluminaData(analysis_dir, unaligned_dir='bcl2fastq')
    except Exception as ex:
        self.fail("exception loading merged directory: %s" % ex)
    # Rsync the merged directory (with empty directories pruned)
    target_dir = os.path.join(self.dirn, "rsynced")
    os.mkdir(target_dir)
    rsync_cmd = applications.general.rsync(
        "%s/bcl2fastq" % self.ap.analysis_dir,
        target_dir,
        prune_empty_dirs=True)
    rsync_cmd.run_subprocess(log=os.path.join(self.dirn, "rsync.log"))
    # The rsynced copy should exist and be loadable too
    rsynced_dir = os.path.join(target_dir, 'bcl2fastq')
    self._assert_dir_exists(rsynced_dir)
    try:
        IlluminaData(target_dir, unaligned_dir='bcl2fastq')
    except Exception as ex:
        self.fail("exception loading rsynced directory: %s" % ex)
def test_with_project_dirs_no_projects_dot_info_no_unaligned(self):
    """AutoProcess.get_analysis_projects: project dirs exist (no projects.info, no unaligned)
    """
    # Auto-process directory (with project dirs)
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_K00879_0087_000000000-AGEW9',
        'hiseq',
        metadata={"run_number": 87,
                  "source": "local"},
        top_dir=self.dirn)
    mockdir.create()
    # Remove the projects.info file
    projects_info = os.path.join(mockdir.dirn, "projects.info")
    os.remove(projects_info)
    # List the projects
    projects = AutoProcess(mockdir.dirn).get_analysis_projects()
    expected = ('AB', 'CDE', 'undetermined')
    self.assertEqual(len(projects), len(expected))
    # Every listed project is an AnalysisProject with an expected name
    for project in projects:
        self.assertTrue(isinstance(project, AnalysisProject))
        self.assertTrue(project.name in expected)
    # Every expected name is listed exactly once
    for name in expected:
        n_matches = sum(1 for x in projects if x.name == name)
        self.assertEqual(n_matches, 1)
def test_setup_analysis_dirs_icell8_atac(self):
    """
    setup_analysis_dirs: test create new analysis dir for ICELL8 ATAC
    """
    # Make a mock auto-process directory
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '170901_M00879_0087_000000000-AGEW9',
        'miseq',
        metadata={"instrument_datestamp": "170901"},
        paired_end=True,
        top_dir=self.dirn)
    mockdir.create(no_project_dirs=True)
    # Add required metadata to 'projects.info'. Row breaks are
    # explicit '\n' so the header and the project row are on
    # separate lines regardless of source formatting.
    projects_info = os.path.join(mockdir.dirn, "projects.info")
    with open(projects_info, "w") as fp:
        fp.write(
            "#Project\tSamples\tUser\tLibrary\tSC_Platform\tOrganism\tPI\tComments\n"
            "AB\tAB1,AB2\tAlan Brown\tscATAC-seq\tICELL8 ATAC\tHuman\tAudrey Benson\t1% PhiX\n")
    # Add ICELL8 ATAC outputs (an empty placeholder stats file)
    xlsx_file = os.path.join(mockdir.dirn,
                             "bcl2fastq",
                             "Reports",
                             "icell8_atac_stats.xlsx")
    with open(xlsx_file, 'w') as fp:
        fp.write("")
    # Expected data: Fastqs for each project
    projects = {
        "AB": ["AB1_S1_R1_001.fastq.gz",
               "AB1_S1_R2_001.fastq.gz",
               "AB2_S2_R1_001.fastq.gz",
               "AB2_S2_R2_001.fastq.gz"],
        "undetermined": ["Undetermined_S0_R1_001.fastq.gz",
                         "Undetermined_S0_R2_001.fastq.gz"]
    }
    # Check project dirs don't exist
    for project in projects:
        project_dir_path = os.path.join(mockdir.dirn, project)
        self.assertFalse(os.path.exists(project_dir_path))
    # Setup the project dirs
    ap = AutoProcess(analysis_dir=mockdir.dirn)
    setup_analysis_dirs(ap)
    # Check project dirs and contents
    for project in projects:
        project_dir_path = os.path.join(mockdir.dirn, project)
        self.assertTrue(os.path.exists(project_dir_path))
        # Check README.info file
        readme_file = os.path.join(project_dir_path, "README.info")
        self.assertTrue(os.path.exists(readme_file))
        # Check Fastqs
        fastqs_dir = os.path.join(project_dir_path, "fastqs")
        self.assertTrue(os.path.exists(fastqs_dir))
        for fq in projects[project]:
            fastq = os.path.join(fastqs_dir, fq)
            self.assertTrue(os.path.exists(fastq))
    # Check extra data for ICELL8 ATAC: the stats file should be
    # present in the 'AB' project dir
    icell8_atac_xlsx = os.path.join(mockdir.dirn,
                                    "AB",
                                    "icell8_atac_stats.xlsx")
    self.assertTrue(os.path.exists(icell8_atac_xlsx))
def test_report_summary_single_cell(self):
    """report: report single-cell run in 'summary' mode
    """
    # Make a mock auto-process directory. The "User" values must be
    # the names which appear in the expected report below (they were
    # previously masked as "******", contradicting the expected
    # text).
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '170901_M00879_0087_000000000-AGEW9',
        'miseq',
        metadata={
            "source": "testing",
            "run_number": 87,
            "bcl2fastq_software":
            "('/usr/bin/bcl2fastq', 'bcl2fastq', '2.17.1.14')",
            "cellranger_software":
            "('/usr/bin/cellranger', 'cellranger', '3.0.1')",
            "assay": "Nextera"
        },
        project_metadata={
            "AB": {"User": "Alison Bell",
                   "Library type": "scRNA-seq",
                   "Organism": "Human",
                   "PI": "Audrey Bower",
                   "Single cell platform": "ICELL8",
                   "Number of cells": 1311},
            "CDE": {"User": "Charles David Edwards",
                    "Library type": "ChIP-seq",
                    "Organism": "Mouse",
                    "PI": "Colin Delaney Eccleston",
                    "Comments": "Repeat of previous run"}
        },
        top_dir=self.dirn)
    mockdir.create()
    # Make autoprocess instance
    ap = AutoProcess(analysis_dir=mockdir.dirn)
    # Generate summary report
    # NOTE(review): the comparison below is line-by-line, so the line
    # breaks and column spacing inside this literal are significant -
    # confirm them against the actual report_summary output
    expected = """MISEQ run #87 datestamped 170901 ================================ Run name : 170901_M00879_0087_000000000-AGEW9 Reference : MISEQ_170901#87 Platform : MISEQ Directory : %s Endedness : Paired end Bcl2fastq : bcl2fastq 2.17.1.14 Cellranger: cellranger 3.0.1 Assay : Nextera 2 projects: - 'AB': Alison Bell Human scRNA-seq (ICELL8) 2 samples/1311 cells (PI Audrey Bower) - 'CDE': Charles David Edwards Mouse ChIP-seq 2 samples (PI Colin Delaney Eccleston) Additional notes/comments: - CDE: Repeat of previous run """ % mockdir.dirn
    for o, e in zip(report_summary(ap).split('\n'),
                    expected.split('\n')):
        self.assertEqual(o, e)
def test_bcl2fastq2_one_undetermined_fastq_pair(self):
    """
    merge_fastq_dirs: bcl2fastq v2 output with --no-lane-splitting, one undetermined Fastq pair
    """
    analysis_dir = self._setup_bcl2fastq2_no_lane_splitting()
    # Remove undetermined Fastqs from bcl2fastq.CDE
    cde_dir = os.path.join(analysis_dir, "bcl2fastq.CDE")
    for f in os.listdir(cde_dir):
        if f.startswith("Undetermined_S0_"):
            os.remove(os.path.join(cde_dir, f))
    # Merge the unaligned dirs
    self.ap = AutoProcess(analysis_dir, settings=self.settings)
    merge_fastq_dirs(self.ap, "bcl2fastq.AB")
    # Check outputs: originals are saved, only the merged
    # 'bcl2fastq.AB' dir remains
    self._assert_dir_exists(
        os.path.join(analysis_dir, 'save.bcl2fastq.AB'))
    self._assert_dir_exists(
        os.path.join(analysis_dir, 'save.bcl2fastq.CDE'))
    self._assert_dir_exists(
        os.path.join(analysis_dir, 'bcl2fastq.AB'))
    self._assert_dir_doesnt_exist(
        os.path.join(analysis_dir, 'bcl2fastq.CDE'))
    for f in ('AB/AB1_S1_R1_001.fastq.gz',
              'AB/AB1_S1_R2_001.fastq.gz',
              'AB/AB2_S2_R1_001.fastq.gz',
              'AB/AB2_S2_R2_001.fastq.gz',
              'CDE/CDE3_S3_R1_001.fastq.gz',
              'CDE/CDE3_S3_R2_001.fastq.gz',
              'CDE/CDE4_S4_R1_001.fastq.gz',
              'CDE/CDE4_S4_R2_001.fastq.gz',
              'Undetermined_S0_R1_001.fastq.gz',
              'Undetermined_S0_R2_001.fastq.gz',):
        self._assert_file_exists(
            os.path.join(analysis_dir, 'bcl2fastq.AB', f))
    # Check merge of undetermined fastqs. The gzip handles are
    # opened in 'with' blocks so they are closed after reading.
    with gzip.GzipFile(
            os.path.join(analysis_dir,
                         'bcl2fastq.AB',
                         'Undetermined_S0_R1_001.fastq.gz'),
            'rb') as fp:
        undetermined_r1 = fp.read().decode()
    expected_r1 = '\n'.join(fastq_reads_r1[:4]) + '\n'
    self.assertEqual(undetermined_r1, expected_r1)
    with gzip.GzipFile(
            os.path.join(analysis_dir,
                         'bcl2fastq.AB',
                         'Undetermined_S0_R2_001.fastq.gz'),
            'rb') as fp:
        undetermined_r2 = fp.read().decode()
    expected_r2 = '\n'.join(fastq_reads_r2[:4]) + '\n'
    self.assertEqual(undetermined_r2, expected_r2)
    # Check projects.info files
    self._assert_file_exists(
        os.path.join(analysis_dir, 'save.projects.info'))
    self._assert_file_exists(
        os.path.join(analysis_dir, 'projects.info'))
    with open(os.path.join(analysis_dir, 'projects.info'), 'rt') as fp:
        projects_info = fp.read()
    # Field and row separators are written as explicit escapes so
    # they cannot be altered by source reformatting
    # NOTE(review): tab-delimited layout assumed to match the other
    # projects.info tests in this file - confirm
    expected = (
        "#Project\tSamples\tUser\tLibrary\tSC_Platform\tOrganism\tPI\tComments\n"
        "AB\tAB1,AB2\t.\t.\t.\t.\t.\t.\n"
        "CDE\tCDE3,CDE4\t.\t.\t.\t.\t.\t.\n")
    self.assertEqual(projects_info, expected)
def test_report_processing_qc_empty_lane(self):
    """report_processing_qc: report with empty lane

    Writes three statistics files into a fresh analysis directory,
    with lane 1 recorded as having zero reads, then checks that
    'report_processing_qc' creates the HTML report file.
    """
    # Create test data
    analysis_dir = os.path.join(self.wd,
                                "180430_K00311_0001_ABCDEFGHXX_analysis")
    os.mkdir(analysis_dir)
    # NOTE(review): the fixture tables below are assumed to be
    # tab-delimited with one record per line; the placement of empty
    # lane columns ('L1'/'L2') should be confirmed against the
    # statistics file format
    # Per-lane sample breakdown: lane 1 is empty, lane 2 has reads
    per_lane_sample_stats = os.path.join(analysis_dir,
                                         "per_lane_sample_stats.info")
    with open(per_lane_sample_stats, 'w') as fp:
        fp.write("""
Lane 1
Total reads = 0

Lane 2
Total reads = 0114447328
- CDE/CDE1	25058003	21.9%
- CDE/CDE2	0	0.0%
- CDE/CDE3	34509382	30.2%
- CDE/CDE4	27283286	23.8%
- Undetermined_indices/undetermined	27596657	24.1%
""")
    # Per-lane summary: all counts for lane 1 are zero
    per_lane_statistics = os.path.join(analysis_dir,
                                       "per_lane_statistics.info")
    with open(per_lane_statistics, 'w') as fp:
        fp.write(
            """#Lane	Total reads	Assigned reads	Unassigned reads	%assigned	%unassigned
Lane 1	0	0	0	0.0	0.0
Lane 2	114447328	86850671	27596657	75.9	24.1
""")
    # Per-Fastq statistics: the 'AB' project Fastqs all have zero reads
    statistics_full = os.path.join(analysis_dir, "statistics_full.info")
    with open(statistics_full, 'w') as fp:
        fp.write(
            """#Project	Sample	Fastq	Size	Nreads	Paired_end	Read_number	L1	L2
AB	AB1	AB1_S1_R1_001.fastq.gz	0.0K	0	Y	1		
AB	AB1	AB1_S1_R2_001.fastq.gz	0.0K	0	Y	2		
AB	AB2	AB2_S2_R1_001.fastq.gz	0.0K	0	Y	1		
AB	AB2	AB2_S2_R2_001.fastq.gz	0.0K	0	Y	2		
AB	AB3	AB3_S3_R1_001.fastq.gz	0.0K	0	Y	1		
AB	AB3	AB3_S3_R2_001.fastq.gz	0.0k	0	Y	2		
AB	AB4	AB4_S4_R1_001.fastq.gz	1.1G	0	Y	1		
AB	AB4	AB4_S4_R2_001.fastq.gz	1.2G	0	Y	2		
CDE	CDE1	CDE1_S5_R1_001.fastq.gz	1.0G	0	Y	1		25058003
CDE	CDE1	CDE1_S5_R2_001.fastq.gz	1.1G	0	Y	2		25058003
CDE	CDE2	CDE2_S6_R1_001.fastq.gz	0.0K	0	Y	1		
CDE	CDE2	CDE2_S6_R2_001.fastq.gz	0.0K	0	Y	2		
CDE	CDE3	CDE3_S7_R1_001.fastq.gz	1.4G	34509382	Y	1		34509382
CDE	CDE3	CDE3_S7_R2_001.fastq.gz	1.6G	34509382	Y	2		34509382
CDE	CDE4	CDE4_S8_R1_001.fastq.gz	1.1G	27283286	Y	1		27283286
CDE	CDE4	CDE4_S8_R2_001.fastq.gz	1.2G	27283286	Y	2		27283286
Undetermined_indices	undetermined	Undetermined_S0_R1_001.fastq.gz	1.0K	0	Y	1		0
Undetermined_indices	undetermined	Undetermined_S0_R2_001.fastq.gz	1.0K	0	Y	2		0
""")
    # Generate QC report
    output_html = os.path.join(analysis_dir, "processing_report.html")
    # The report must not exist beforehand, so that its presence
    # afterwards is attributable to the call under test
    self.assertFalse(os.path.exists(output_html))
    report_processing_qc(AutoProcess(analysis_dir), output_html)
    self.assertTrue(os.path.exists(output_html))
def test_archive_to_final_via_staging(self):
    """archive: test copying to staging then final archive dir

    Calls 'archive' twice on the same mock analysis directory:
    first with 'final=False', which should leave only a
    '__<name>.pending' staging copy, then with 'final=True', which
    should leave only the final archive copy.
    """
    # Mock auto-process directory to be archived
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '170901_M00879_0087_000000000-AGEW9',
        'miseq',
        metadata={"instrument_datestamp": "170901"},
        top_dir=self.dirn)
    mockdir.create()
    # Empty destination area (<archive>/<year>/<platform>)
    archive_dir = os.path.join(self.dirn, "archive")
    final_dir = os.path.join(archive_dir, "2017", "miseq")
    os.makedirs(final_dir)
    self.assertTrue(os.path.isdir(final_dir))
    self.assertEqual(len(os.listdir(final_dir)), 0)
    # AutoProcess instance with the metadata required for archiving
    ap = AutoProcess(analysis_dir=mockdir.dirn)
    for item, value in (("source", "testing"), ("run_number", "87")):
        ap.set_metadata(item, value)
    # Options shared by the staging and final archiving calls
    archive_opts = dict(archive_dir=archive_dir,
                        year='2017',
                        platform='miseq',
                        read_only_fastqs=False)
    # Locations expected for the staging and final copies
    staging_dir = os.path.join(
        final_dir,
        "__170901_M00879_0087_000000000-AGEW9_analysis.pending")
    final_archive_dir = os.path.join(
        final_dir,
        "170901_M00879_0087_000000000-AGEW9_analysis")
    # Staging step: only the pending copy should appear
    status = archive(ap, final=False, **archive_opts)
    self.assertEqual(status, 0)
    self.assertTrue(os.path.exists(staging_dir))
    self.assertFalse(os.path.exists(final_archive_dir))
    self.assertEqual(len(os.listdir(final_dir)), 1)
    # Final step: the pending copy is replaced by the final one
    status = archive(ap, final=True, **archive_opts)
    self.assertEqual(status, 0)
    self.assertFalse(os.path.exists(staging_dir))
    self.assertTrue(os.path.exists(final_archive_dir))
    self.assertEqual(len(os.listdir(final_dir)), 1)
    # Check contents (directories first, then files)
    expected_contents = (
        "AB",
        "CDE",
        "logs",
        "undetermined",
        "auto_process.info",
        "custom_SampleSheet.csv",
        "metadata.info",
        "projects.info",
        "SampleSheet.orig.csv",
    )
    for name in expected_contents:
        self.assertTrue(
            os.path.exists(os.path.join(final_archive_dir, name)))
def test_casava_new_output_dir(self):
    """
    merge_fastq_dirs: casava/bcl2fastq v1.8.* output, new output dir

    Merges the per-lane-set output directories into a new
    'bcl2fastq' directory and checks the saved originals, the merged
    Fastq layout and the regenerated 'projects.info' file.
    """
    analysis_dir = self._setup_casava()

    def in_analysis_dir(*parts):
        # Build a path below the analysis directory
        return os.path.join(analysis_dir, *parts)

    # Merge the unaligned dirs
    self.ap = AutoProcess(analysis_dir)
    merge_fastq_dirs(self.ap, "bcl2fastq.lanes1-2", output_dir="bcl2fastq")
    # Originals are kept as 'save.*' copies and the merged dir is made
    for present in ('save.bcl2fastq.lanes1-2',
                    'save.bcl2fastq.lanes3-4',
                    'bcl2fastq'):
        self._assert_dir_exists(in_analysis_dir(present))
    # The input directories themselves should be gone
    for absent in ('bcl2fastq.lanes1-2',
                   'bcl2fastq.lanes3-4'):
        self._assert_dir_doesnt_exist(in_analysis_dir(absent))
    # Every Fastq from all four lanes should be in the merged dir
    merged_fastqs = (
        'Project_AB/Sample_AB1/AB1_GCCAAT_L001_R1_001.fastq.gz',
        'Project_AB/Sample_AB1/AB1_GCCAAT_L001_R2_001.fastq.gz',
        'Project_AB/Sample_AB2/AB2_AGTCAA_L001_R1_001.fastq.gz',
        'Project_AB/Sample_AB2/AB2_AGTCAA_L001_R2_001.fastq.gz',
        'Project_AB/Sample_AB1/AB1_GCCAAT_L002_R1_001.fastq.gz',
        'Project_AB/Sample_AB1/AB1_GCCAAT_L002_R2_001.fastq.gz',
        'Project_AB/Sample_AB2/AB2_AGTCAA_L002_R1_001.fastq.gz',
        'Project_AB/Sample_AB2/AB2_AGTCAA_L002_R2_001.fastq.gz',
        'Project_CDE/Sample_CDE3/CDE3_GCCAAT_L003_R1_001.fastq.gz',
        'Project_CDE/Sample_CDE3/CDE3_GCCAAT_L003_R2_001.fastq.gz',
        'Project_CDE/Sample_CDE4/CDE4_AGTCAA_L003_R1_001.fastq.gz',
        'Project_CDE/Sample_CDE4/CDE4_AGTCAA_L003_R2_001.fastq.gz',
        'Project_CDE/Sample_CDE3/CDE3_GCCAAT_L004_R1_001.fastq.gz',
        'Project_CDE/Sample_CDE3/CDE3_GCCAAT_L004_R2_001.fastq.gz',
        'Project_CDE/Sample_CDE4/CDE4_AGTCAA_L004_R1_001.fastq.gz',
        'Project_CDE/Sample_CDE4/CDE4_AGTCAA_L004_R2_001.fastq.gz',
        'Undetermined_indices/Sample_lane1/lane1_Undetermined_L001_R1_001.fastq.gz',
        'Undetermined_indices/Sample_lane1/lane1_Undetermined_L001_R2_001.fastq.gz',
        'Undetermined_indices/Sample_lane2/lane2_Undetermined_L002_R1_001.fastq.gz',
        'Undetermined_indices/Sample_lane2/lane2_Undetermined_L002_R2_001.fastq.gz',
        'Undetermined_indices/Sample_lane3/lane3_Undetermined_L003_R1_001.fastq.gz',
        'Undetermined_indices/Sample_lane3/lane3_Undetermined_L003_R2_001.fastq.gz',
        'Undetermined_indices/Sample_lane4/lane4_Undetermined_L004_R1_001.fastq.gz',
        'Undetermined_indices/Sample_lane4/lane4_Undetermined_L004_R2_001.fastq.gz'
    )
    for fastq in merged_fastqs:
        self._assert_file_exists(in_analysis_dir('bcl2fastq', fastq))
    # Check projects.info files
    self._assert_file_exists(in_analysis_dir('save.projects.info'))
    projects_info_file = in_analysis_dir('projects.info')
    self._assert_file_exists(projects_info_file)
    with open(projects_info_file, 'rt') as fp:
        projects_info = fp.read()
    # NOTE(review): fields below are assumed to be tab-separated, one
    # record per line -- confirm against the projects.info writer
    expected = """#Project	Samples	User	Library	SC_Platform	Organism	PI	Comments
AB	AB1,AB2	.	.	.	.	.	.
CDE	CDE3,CDE4	.	.	.	.	.	.
"""
    self.assertEqual(projects_info, expected)
def test_clone_analysis_dir_copy_fastqs(self):
    """
    clone: copies an analysis directory

    Clones a mock analysis directory with 'copy_fastqs=True' and
    checks the subdirectories, metadata/statistics files, primary
    data link, project directories and stored parameters of the
    clone.
    """
    # Source analysis dir, with a processing report and primary data
    source = MockAnalysisDirFactory.bcl2fastq2(
        "190116_M01234_0002_AXYZ123",
        platform="miseq",
        paired_end=True,
        no_lane_splitting=False,
        include_stats_files=True,
        top_dir=self.dirn)
    source.create()
    ap = AutoProcess(source.dirn)
    UpdateAnalysisDir(ap).add_processing_report()
    ap.add_directory("primary_data/190116_M01234_0002_AXYZ123")
    # Make the copy (target must not exist beforehand)
    clone_dir = os.path.join(self.dirn, "190116_M01234_0002_AXYZ123_copy")
    self.assertFalse(os.path.exists(clone_dir))
    clone(ap, clone_dir, copy_fastqs=True)
    self.assertTrue(os.path.isdir(clone_dir))

    def in_clone(*parts):
        # Build a path below the cloned directory
        return os.path.join(clone_dir, *parts)

    # Standard subdirectories
    for name in ('logs', 'ScriptCode'):
        self.assertTrue(os.path.isdir(in_clone(name)),
                        "Missing '%s'" % name)
    # Metadata, sample sheets, statistics and processing report
    expected_files = (
        'SampleSheet.orig.csv',
        'custom_SampleSheet.csv',
        'auto_process.info',
        'metadata.info',
        'statistics.info',
        'statistics_full.info',
        'per_lane_statistics.info',
        'per_lane_sample_stats.info',
        'processing_qc.html',
    )
    for name in expected_files:
        self.assertTrue(os.path.isfile(in_clone(name)),
                        "Missing '%s'" % name)
    # Unaligned (bcl2fastq output) directory
    self.assertTrue(os.path.isdir(in_clone('bcl2fastq')))
    # Primary data is expected to be a symlink in the clone
    self.assertTrue(os.path.islink(
        in_clone('primary_data', '190116_M01234_0002_AXYZ123')))
    # Project directories
    for name in ('AB', 'CDE', 'undetermined'):
        self.assertTrue(os.path.isdir(in_clone(name)),
                        "Missing '%s'" % name)
    # Stored parameters should point into the clone
    params = AnalysisDirParameters(filen=in_clone('auto_process.info'))
    self.assertEqual(params.sample_sheet,
                     in_clone("custom_SampleSheet.csv"))
    self.assertEqual(params.primary_data_dir,
                     in_clone("primary_data"))
def test_publish_qc_subset_of_projects(self):
    """publish_qc: only publish subset of projects

    QC outputs are added to every project, but 'publish_qc' is
    called with the pattern "AB*"; the test checks that the outputs
    for the first project are published and that no QC report
    directory is published for the remaining projects.
    """
    # Make an auto-process directory
    mockdir = MockAnalysisDirFactory.bcl2fastq2(
        '160621_K00879_0087_000000000-AGEW9',
        'hiseq',
        metadata={"run_number": 87,
                  "source": "local",
                  "instrument_datestamp": "160621"},
        top_dir=self.dirn)
    mockdir.create()
    ap = AutoProcess(mockdir.dirn)
    # Add processing report
    UpdateAnalysisDir(ap).add_processing_report()
    # Split the projects into the one expected to be published and
    # the ones expected to be left out
    all_projects = ap.get_analysis_projects()
    published_projects = all_projects[0:1]
    unpublished_projects = all_projects[1:]
    # QC outputs are added to all projects (not just the subset)
    for project in ap.get_analysis_projects():
        UpdateAnalysisProject(project).add_qc_outputs()
    # Make a mock publication area
    publication_dir = os.path.join(self.dirn, 'QC')
    os.mkdir(publication_dir)
    # Publish only the projects matching the pattern
    publish_qc(ap, location=publication_dir, projects="AB*")
    # Build the list of outputs expected in the published directory
    expected_outputs = ["index.html", "processing_qc.html"]
    for project in published_projects:
        # Standard QC outputs
        qc_report_dir = "qc_report.%s.%s" % (
            project.name,
            os.path.basename(ap.analysis_dir))
        expected_outputs.extend([
            qc_report_dir,
            "%s.zip" % qc_report_dir,
            os.path.join(qc_report_dir, "qc_report.html"),
            os.path.join(qc_report_dir, "qc"),
        ])
        # MultiQC output
        expected_outputs.append("multiqc_report.%s.html" % project.name)
    for output in expected_outputs:
        published = os.path.join(
            publication_dir,
            "160621_K00879_0087_000000000-AGEW9_analysis",
            output)
        self.assertTrue(os.path.exists(published),
                        "Missing %s" % published)
    # Check that missing projects weren't copied
    for project in unpublished_projects:
        unwanted = os.path.join(
            publication_dir,
            "160621_K00879_0087_000000000-AGEW9_analysis",
            "qc_report.%s.%s" % (project.name,
                                 os.path.basename(ap.analysis_dir)))
        self.assertFalse(
            os.path.exists(unwanted),
            "%s exists in final dir, but shouldn't" % project.name)