def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme, spadesMaxMemory, jar_path_trimmomatic,
               jar_path_pilon, jarMaxMemory, trueCoverage_config, rematch_script, species_genus, mlst_scheme_genus):
    """
    Run the INNUca modules on one sample and record the outcome of every step.

    Steps are attempted in this order: FastQ integrity, first estimated coverage, true coverage (ReMatCh),
    first FastQC, Trimmomatic, second estimated coverage, second FastQC, Pear (only when ``args.runPear`` is
    set), SPAdes, Assembly Mapping, Pilon and MLST. A step disabled through its ``args.skip*`` option is
    recorded as 'Skipped'; a step that is never reached because an earlier check stopped the sample is
    recorded as 'Not run'.

    Side effects visible in this function: an existing ``coverage_report.txt`` in ``outdir`` is deleted before
    the first coverage estimation, ``final_assembly.txt`` is written in ``outdir`` with the path of the final
    assembly, and intermediate assemblies/folders are removed according to the ``args.*Keep*`` options.

    Parameters
    ----------
    sampleName : sample identifier forwarded to the True Coverage, Trimmomatic, Pear and SPAdes modules
    outdir : directory forwarded to every module as output location
    fastq_files : reads to analyse; replaced by the Trimmomatic paired reads when trimming succeeds
    args : parsed command-line options (``threads``, ``adapters``, ``genomeSizeExpectedMb``, the ``skip*``
        switches, module thresholds, ...)
    script_path : forwarded to Trimmomatic
    scheme, species_genus, mlst_scheme_genus : forwarded to the MLST module
    spadesMaxMemory : forwarded to SPAdes
    jar_path_trimmomatic, jar_path_pilon : jar locations forwarded to Trimmomatic and Pilon
    jarMaxMemory : forwarded to Trimmomatic and Pilon
    trueCoverage_config : dict with the True Coverage settings, or None to skip that module
    rematch_script : forwarded to the True Coverage module

    Returns
    -------
    run_successfully : bool
        True when no recorded step has ``False`` as run status (``None`` is accepted)
    pass_qc : bool
        True when FastQ integrity succeeded and none of the coverage, true coverage, FastQC, SPAdes, Assembly
        Mapping and MLST verdicts, nor the Pilon run status, is ``False``
    runs : dict
        Step name -> ``[run_successfully, pass_qc, time_taken, failing]``, followed for some steps by a fifth
        field (warning, or file size for Trimmomatic)
    """
    # NOTE: print() is always called with a single string, so the output is the same under Python 2 and 3.

    threads = args.threads
    adaptersFasta = args.adapters
    if adaptersFasta is not None:
        adaptersFasta = os.path.abspath(adaptersFasta.name)
    genomeSize = args.genomeSizeExpectedMb

    # Placeholder records: [run_successfully, pass_qc, time_taken, failing]
    skipped = [None, None, 0, {'sample': 'Skipped'}]
    not_run = [None, None, 0, {'sample': 'Not run'}]

    # Steps that follow the FastQ integrity check, in pipeline order
    pipeline_steps = ('first_Coverage', 'trueCoverage_ReMatCh', 'first_FastQC', 'Trimmomatic', 'second_Coverage',
                      'second_FastQC', 'Pear', 'SPAdes', 'Assembly_Mapping', 'Pilon', 'MLST')
    # Steps whose record carries a fifth field ('NA' when there is nothing to report)
    steps_with_extra_field = ('Trimmomatic', 'first_FastQC', 'second_FastQC', 'Pear', 'SPAdes', 'Assembly_Mapping',
                              'MLST')

    runs = {}

    # Defined upfront so the clean-up code can simply test them against None
    trimmomatic_folder = None
    pear_folder = None

    def mark_not_run_from(first_step):
        """Record first_step and every later pipeline step as 'Not run'."""
        for step in pipeline_steps[pipeline_steps.index(first_step):]:
            if step in steps_with_extra_field:
                runs[step] = not_run + ['NA']
            else:
                runs[step] = not_run

    def combined_qc(first_key, second_key):
        """QA/QC verdict of the second run of a module, falling back to the first run when it is None."""
        second_pass_qc = runs[second_key][1]
        return second_pass_qc or (second_pass_qc is None and runs[first_key][1])

    # Run FastQ integrity check
    (not_corruption_found, pass_qc, time_taken, failing, fastq_encoding, min_reads_length,
     max_reads_length) = fastQintegrity.runFastQintegrity(fastq_files, threads, outdir)
    runs['FastQ_Integrity'] = [not_corruption_found, pass_qc, time_taken, failing]

    if not_corruption_found:
        # Run first Estimated Coverage
        run_successfully_estimatedCoverage = False
        estimatedCoverage = None
        run_successfully_trueCoverage = False
        pass_qc_trueCoverage = False
        if not args.skipEstimatedCoverage:
            # Check whether the Estimated Coverage output is already present
            report_file = os.path.join(outdir, 'coverage_report.txt')
            if os.path.isfile(report_file):
                os.remove(report_file)
            # Run getEstimatedCoverage
            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = \
                coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads,
                                              args.estimatedMinimumCoverage)
            runs['first_Coverage'] = [run_successfully_estimatedCoverage, pass_qc, time_taken, failing]
        else:
            print('--skipEstimatedCoverage set. Skipping First Estimated Coverage analysis')
            runs['first_Coverage'] = skipped

        trimmomatic_run_successfully = False

        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and
                                          not estimatedCoverage < args.estimatedMinimumCoverage):
            if not args.skipTrueCoverage and trueCoverage_config is not None:
                # Run True Coverage
                run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken, failing = \
                    trueCoverage.runTrueCoverage(sampleName, fastq_files, trueCoverage_config['reference_file'],
                                                 threads, outdir, trueCoverage_config['length_extra_seq'],
                                                 trueCoverage_config['minimum_depth_presence'],
                                                 trueCoverage_config['minimum_depth_call'],
                                                 trueCoverage_config['minimum_depth_frequency_dominant_allele'],
                                                 trueCoverage_config['minimum_gene_coverage'], False, False, 1,
                                                 trueCoverage_config['minimum_gene_identity'],
                                                 trueCoverage_config, rematch_script)
                runs['trueCoverage_ReMatCh'] = [run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken,
                                                failing]
            else:
                # NOTE: this branch is also taken when trueCoverage_config is None
                print('\n' + '--skipTrueCoverage set. Skipping True coverage analysis')
                runs['trueCoverage_ReMatCh'] = skipped

            if args.skipTrueCoverage or trueCoverage_config is None or (run_successfully_trueCoverage and
                                                                        pass_qc_trueCoverage):
                # Run first FastQC
                nts2clip_based_ntsContent = None
                if not args.skipFastQC:
                    (run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length,
                     nts2clip_based_ntsContent) = fastqc.runFastQCanalysis(outdir, threads, adaptersFasta,
                                                                           fastq_files, args.fastQCkeepFiles,
                                                                           'first_run')
                    runs['first_FastQC'] = [run_successfully, pass_qc, time_taken, failing, warning]
                else:
                    print('--skipFastQC set. Skipping First FastQC analysis')
                    runs['first_FastQC'] = skipped + ['NA']

                # Run Trimmomatic
                if not args.skipTrimmomatic:
                    (run_successfully, not_empty_fastq, time_taken, failing, paired_reads, trimmomatic_folder,
                     fileSize) = trimmomatic.runTrimmomatic(jar_path_trimmomatic, sampleName, outdir, threads,
                                                            adaptersFasta, script_path, args.doNotSearchAdapters,
                                                            fastq_files, max_reads_length, args.doNotTrimCrops,
                                                            args.trimCrop, args.trimHeadCrop, args.trimLeading,
                                                            args.trimTrailing, args.trimSlidingWindow,
                                                            args.trimMinLength, nts2clip_based_ntsContent,
                                                            jarMaxMemory, fastq_encoding)
                    runs['Trimmomatic'] = [run_successfully, None, time_taken, failing, fileSize]
                    trimmomatic_run_successfully = run_successfully

                    if run_successfully and not_empty_fastq:
                        # From here on, work with the cleaned reads
                        fastq_files = paired_reads
                        min_reads_length = args.trimMinLength

                        # Run second Estimated Coverage
                        if not args.skipEstimatedCoverage:
                            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = \
                                coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads,
                                                              args.estimatedMinimumCoverage)
                            runs['second_Coverage'] = [run_successfully_estimatedCoverage, pass_qc, time_taken,
                                                       failing]
                        else:
                            print('--skipEstimatedCoverage set. Skipping Second Estimated Coverage analysis')
                            runs['second_Coverage'] = skipped

                        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and
                                                          not estimatedCoverage < args.estimatedMinimumCoverage):
                            # Run second FastQC
                            if not args.skipFastQC:
                                (run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length,
                                 nts2clip_based_ntsContent) = fastqc.runFastQCanalysis(outdir, threads,
                                                                                       adaptersFasta, fastq_files,
                                                                                       args.fastQCkeepFiles,
                                                                                       'second_run')
                                runs['second_FastQC'] = [run_successfully, pass_qc, time_taken, failing, warning]
                                if run_successfully:
                                    max_reads_length = maximum_reads_length
                            else:
                                print('--skipFastQC set. Skipping Second FastQC analysis')
                                runs['second_FastQC'] = skipped + ['NA']
                        else:
                            print('\n' + 'Estimated coverage is too lower (< ' +
                                  str(args.estimatedMinimumCoverage) +
                                  'x). This sample will not proceed with INNUca pipeline')
                            mark_not_run_from('second_FastQC')
                    else:
                        print('Trimmomatic did not run successfully or return zero reads! Skipping Second Estimated'
                              ' Coverage analysis and FastQC analysis')
                        runs['second_Coverage'] = skipped
                        runs['second_FastQC'] = skipped + ['NA']
                else:
                    print('--skipTrimmomatic set. Skipping Trimmomatic, but also Second FastQC analysis and Second'
                          ' Estimated Coverage analysis')
                    runs['Trimmomatic'] = skipped + ['NA']
                    runs['second_Coverage'] = skipped
                    runs['second_FastQC'] = skipped + ['NA']

                if not args.skipFastQC and \
                        combined_qc('first_FastQC', 'second_FastQC') is False and \
                        not args.fastQCproceed:
                    print('\n' + 'This sample does not pass FastQC module QA/QC. It will not proceed with INNUca'
                                 ' pipeline')
                    mark_not_run_from('Pear')
            else:
                print('\n' + 'This sample does not pass True Coverage module QA/QC. This sample will not proceed'
                             ' with INNUca pipeline')
                mark_not_run_from('first_FastQC')
        else:
            print('\n' + 'Estimated coverage is too lower (< ' + str(args.estimatedMinimumCoverage) +
                  'x). This sample will not proceed with INNUca pipeline')
            mark_not_run_from('trueCoverage_ReMatCh')

        # Assembly part: only reached when the coverage, True Coverage and FastQC checks above allow it
        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and
                                          not estimatedCoverage < args.estimatedMinimumCoverage):
            if args.skipTrueCoverage or trueCoverage_config is None or (run_successfully_trueCoverage and
                                                                        pass_qc_trueCoverage):
                if args.skipFastQC or \
                        combined_qc('first_FastQC', 'second_FastQC') is not False or \
                        args.fastQCproceed:
                    unassembled_pe_reads = None
                    assembled_se_reads = None
                    # Run Pear
                    if args.runPear:
                        print('--runPear set. Running Pear')
                        pearMinOverlap = pear.determine_minimum_overlap(args.pearMinOverlap, min_reads_length,
                                                                        max_reads_length)
                        (run_successfully, pass_qc, time_taken, failing, unassembled_pe_reads, assembled_se_reads,
                         pear_folder, warning) = pear.runPear(fastq_files, threads, outdir, sampleName,
                                                              fastq_encoding, trimmomatic_run_successfully,
                                                              pearMinOverlap)
                        runs['Pear'] = [run_successfully, pass_qc, time_taken, failing, warning]
                    else:
                        runs['Pear'] = not_run + ['NA']

                    # Run SPAdes
                    if not args.skipSPAdes:
                        run_successfully, pass_qc, time_taken, failing, contigs_spades, warning = \
                            spades.runSpades(sampleName, outdir, threads,
                                             unassembled_pe_reads if unassembled_pe_reads is not None
                                             else fastq_files,
                                             args.spadesNotUseCareful, spadesMaxMemory,
                                             args.spadesMinCoverageAssembly, args.spadesMinContigsLength,
                                             genomeSize, args.spadesKmers, max_reads_length,
                                             args.spadesDefaultKmers, args.spadesMinKmerCovContigs,
                                             assembled_se_reads, args.saveExcludedContigs, args.maxNumberContigs)
                        runs['SPAdes'] = [run_successfully, pass_qc, time_taken, failing, warning]

                        if run_successfully:
                            contigs = contigs_spades

                            # Run Assembly Mapping check
                            bam_file = None
                            assembly_filtered = None
                            assemblyMapping_folder = None
                            if not args.skipAssemblyMapping:
                                (run_successfully, pass_qc, time_taken, failing, assembly_filtered, bam_file,
                                 assemblyMapping_folder, warning) = assembly_mapping.runAssemblyMapping(
                                    fastq_files, contigs, threads, outdir, args.assemblyMinCoverageContigs,
                                    genomeSize, args.saveExcludedContigs, args.maxNumberContigs)
                                runs['Assembly_Mapping'] = [run_successfully, pass_qc, time_taken, failing,
                                                            warning]

                                if run_successfully:
                                    contigs = assembly_filtered
                                    # The SPAdes assembly is only an intermediate when filtering produced a new file
                                    if not args.keepIntermediateAssemblies and \
                                            os.path.isfile(contigs_spades) and \
                                            contigs != contigs_spades:
                                        os.remove(contigs_spades)
                            else:
                                print('--skipAssemblyMapping set. Skipping Assembly Mapping check')
                                runs['Assembly_Mapping'] = skipped + ['NA']

                            # Run Pilon
                            if not args.skipPilon:
                                run_successfully, _, time_taken, failing, assembly_polished, pilon_folder = \
                                    pilon.runPilon(jar_path_pilon, contigs, fastq_files, threads, outdir,
                                                   jarMaxMemory, bam_file)
                                runs['Pilon'] = [run_successfully, None, time_taken, failing]

                                if run_successfully:
                                    contigs = assembly_polished
                                    if not args.keepIntermediateAssemblies and \
                                            assembly_filtered is not None and \
                                            os.path.isfile(assembly_filtered):
                                        os.remove(assembly_filtered)

                                if not args.pilonKeepFiles:
                                    utils.removeDirectory(pilon_folder)
                            else:
                                print('--skipPilon set. Skipping Pilon correction')
                                runs['Pilon'] = skipped

                            if assemblyMapping_folder is not None:
                                utils.removeDirectory(assemblyMapping_folder)

                            print('\n' + 'Final assembly: ' + contigs)
                            with open(os.path.join(outdir, 'final_assembly.txt'), 'wt') as writer:
                                writer.write(contigs + '\n')

                            # Run MLST
                            if not args.skipMLST:
                                run_successfully, pass_qc, time_taken, failing, warning = \
                                    mlst.runMlst(contigs, scheme, outdir, species_genus, mlst_scheme_genus)
                                runs['MLST'] = [run_successfully, pass_qc, time_taken, failing, warning]
                            else:
                                print('--skipMLST set. Skipping MLST analysis')
                                runs['MLST'] = skipped + ['NA']
                        else:
                            print('SPAdes did not run successfully! Skipping Pilon correction, Assembly Mapping'
                                  ' check and MLST analysis')
                            runs['Assembly_Mapping'] = skipped + ['NA']
                            runs['Pilon'] = skipped
                            runs['MLST'] = skipped + ['NA']
                    else:
                        print('--skipSPAdes set. Skipping SPAdes, Pilon correction, Assembly Mapping check and MLST'
                              ' analysis')
                        runs['SPAdes'] = skipped + ['NA']
                        runs['Assembly_Mapping'] = skipped + ['NA']
                        runs['Pilon'] = skipped
                        runs['MLST'] = skipped + ['NA']
    else:
        print('Moving to the next sample')
        mark_not_run_from(pipeline_steps[0])

    # Remove Pear directory
    if not args.pearKeepFiles and pear_folder is not None:
        utils.removeDirectory(pear_folder)

    # Remove Trimmomatic directory with cleaned reads
    if not args.trimKeepFiles and trimmomatic_folder is not None:
        utils.removeDirectory(trimmomatic_folder)

    # Check run: a step only counts as failed when its run status is explicitly False
    run_successfully = all(runs[step][0] or runs[step][0] is None for step in runs)

    # Trimmomatic and Pear verdicts are not taken into account for the sample QA/QC
    pass_fastqIntegrity = runs['FastQ_Integrity'][0]
    pass_cov = combined_qc('first_Coverage', 'second_Coverage') is not False
    pass_trueCov = runs['trueCoverage_ReMatCh'][1] is not False
    pass_fastqc = combined_qc('first_FastQC', 'second_FastQC') is not False
    pass_spades = runs['SPAdes'][1] is not False
    pass_assemblyMapping = runs['Assembly_Mapping'][1] is not False
    # Pilon reports no QA/QC verdict, so its run status is used instead
    pass_pilon = runs['Pilon'][0] is not False
    pass_mlst = runs['MLST'][1] is not False

    pass_qc = all([pass_fastqIntegrity, pass_cov, pass_trueCov, pass_fastqc, pass_spades, pass_assemblyMapping,
                   pass_pilon, pass_mlst])

    return run_successfully, pass_qc, runs
def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme, spadesMaxMemory, jar_path_trimmomatic,
               jar_path_pilon, jarMaxMemory, trueCoverage_config):
    """
    Run the INNUca modules on one sample and record the outcome of every step.

    Steps are attempted in this order: FastQ integrity, first estimated coverage, true coverage (ReMatCh),
    first FastQC, Trimmomatic, second estimated coverage, second FastQC, SPAdes, Pilon, Assembly Mapping and
    MLST. A step disabled through its ``args.skip*`` option is recorded as 'Skipped'; a step that is never
    reached because an earlier check stopped the sample is recorded as 'Not run'.

    Side effects visible in this function: an existing ``coverage_report.txt`` in ``outdir`` is deleted before
    the first coverage estimation, ``final_assembly.txt`` is written in ``outdir`` with the path of the final
    assembly, and the Pilon and Trimmomatic folders are removed unless ``args.pilonKeepFiles`` /
    ``args.trimKeepFiles`` are set.

    Parameters
    ----------
    sampleName : sample identifier forwarded to the Trimmomatic and SPAdes modules
    outdir : directory forwarded to every module as output location
    fastq_files : reads to analyse; replaced by the Trimmomatic paired reads when trimming succeeds
    args : parsed command-line options (``threads``, ``adapters``, ``genomeSizeExpectedMb``, the ``skip*``
        switches, module thresholds, ...)
    script_path : forwarded to Trimmomatic
    scheme : forwarded to the MLST module
    spadesMaxMemory : forwarded to SPAdes
    jar_path_trimmomatic, jar_path_pilon : jar locations forwarded to Trimmomatic and Pilon
    jarMaxMemory : forwarded to Trimmomatic and Pilon
    trueCoverage_config : dict with the True Coverage settings, or None to skip that module

    Returns
    -------
    run_successfully : bool
        True when no recorded step has ``False`` as run status (``None`` is accepted)
    pass_qc : bool
        True when FastQ integrity succeeded and none of the coverage, true coverage, FastQC, Trimmomatic,
        SPAdes, Assembly Mapping and MLST verdicts is ``False``
    runs : dict
        Step name -> ``[run_successfully, pass_qc, time_taken, failing]`` (Trimmomatic has a fifth field with
        the file size; the MLST record is whatever ``mlst.runMlst`` returns)
    """
    # NOTE: print() is always called with a single string, so the output is the same under Python 2 and 3.

    threads = args.threads
    adaptersFasta = args.adapters
    if adaptersFasta is not None:
        adaptersFasta = os.path.abspath(adaptersFasta.name)
    genomeSize = args.genomeSizeExpectedMb
    maximumReadsLength = None

    # Placeholder records: [run_successfully, pass_qc, time_taken, failing]
    skipped = [None, None, 0, {'sample': 'Skipped'}]
    not_run = [None, None, 0, {'sample': 'Not run'}]

    # Steps that follow the FastQ integrity check, in pipeline order
    pipeline_steps = ('first_Coverage', 'trueCoverage_ReMatCh', 'first_FastQC', 'Trimmomatic', 'second_Coverage',
                      'second_FastQC', 'SPAdes', 'Pilon', 'Assembly_Mapping', 'MLST')

    runs = {}

    # Defined upfront so the clean-up code can tell whether Trimmomatic ever ran
    trimmomatic_folder = None

    def mark_not_run_from(first_step):
        """Record first_step and every later pipeline step as 'Not run'."""
        for step in pipeline_steps[pipeline_steps.index(first_step):]:
            if step == 'Trimmomatic':
                # Trimmomatic records carry a fifth field (file size)
                runs[step] = not_run + ['NA']
            else:
                runs[step] = not_run

    def combined_qc(first_key, second_key):
        """QA/QC verdict of the second run of a module, falling back to the first run when it is None."""
        second_pass_qc = runs[second_key][1]
        return second_pass_qc or (second_pass_qc is None and runs[first_key][1])

    # Run FastQ integrity check
    not_corruption_found, _, time_taken, failing = fastQintegrity.runFastQintegrity(fastq_files, threads, outdir)
    runs['FastQ_Integrity'] = [not_corruption_found, None, time_taken, failing]

    if not_corruption_found:
        # Run first Estimated Coverage
        run_successfully_estimatedCoverage = False
        estimatedCoverage = None
        run_successfully_trueCoverage = False
        pass_qc_trueCoverage = False
        if not args.skipEstimatedCoverage:
            # Check whether the Estimated Coverage output is already present
            report_file = os.path.join(outdir, 'coverage_report.txt')
            if os.path.isfile(report_file):
                os.remove(report_file)
            # Run getEstimatedCoverage
            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = \
                coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads)
            runs['first_Coverage'] = [run_successfully_estimatedCoverage, pass_qc, time_taken, failing]
        else:
            print('--skipEstimatedCoverage set. Skipping First Estimated Coverage analysis')
            runs['first_Coverage'] = skipped

        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and
                                          not estimatedCoverage < args.estimatedMinimumCoverage):
            if not args.skipTrueCoverage and trueCoverage_config is not None:
                # Run True Coverage
                run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken, failing = \
                    trueCoverage.runTrueCoverage(fastq_files, trueCoverage_config['reference_file'], threads,
                                                 outdir, trueCoverage_config['length_extra_seq'],
                                                 trueCoverage_config['minimum_depth_presence'],
                                                 trueCoverage_config['minimum_depth_call'],
                                                 trueCoverage_config['minimum_depth_frequency_dominant_allele'],
                                                 trueCoverage_config['minimum_gene_coverage'],
                                                 trueCoverage_config['maximum_number_absent_genes'],
                                                 trueCoverage_config['maximum_number_genes_multiple_alleles'],
                                                 trueCoverage_config['minimum_read_coverage'])
                runs['trueCoverage_ReMatCh'] = [run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken,
                                                failing]
            else:
                # NOTE: this branch is also taken when trueCoverage_config is None
                print('\n' + '--skipTrueCoverage set. Skipping True coverage analysis')
                runs['trueCoverage_ReMatCh'] = skipped

            if args.skipTrueCoverage or trueCoverage_config is None or (run_successfully_trueCoverage and
                                                                        pass_qc_trueCoverage):
                # Run first FastQC
                nts2clip_based_ntsContent = None
                if not args.skipFastQC:
                    (run_successfully, pass_qc, time_taken, failing, maximumReadsLength,
                     nts2clip_based_ntsContent) = fastqc.runFastQCanalysis(outdir, threads, adaptersFasta,
                                                                           fastq_files)
                    runs['first_FastQC'] = [run_successfully, pass_qc, time_taken, failing]
                else:
                    print('--skipFastQC set. Skipping First FastQC analysis')
                    runs['first_FastQC'] = skipped

                # Run Trimmomatic
                if not args.skipTrimmomatic:
                    (run_successfully, not_empty_fastq, time_taken, failing, paired_reads, trimmomatic_folder,
                     fileSize) = trimmomatic.runTrimmomatic(jar_path_trimmomatic, sampleName, outdir, threads,
                                                            adaptersFasta, script_path, args.doNotSearchAdapters,
                                                            fastq_files, maximumReadsLength, args.doNotTrimCrops,
                                                            args.trimCrop, args.trimHeadCrop, args.trimLeading,
                                                            args.trimTrailing, args.trimSlidingWindow,
                                                            args.trimMinLength, nts2clip_based_ntsContent,
                                                            jarMaxMemory)
                    runs['Trimmomatic'] = [run_successfully, not_empty_fastq, time_taken, failing, fileSize]

                    if run_successfully and not_empty_fastq:
                        # From here on, work with the cleaned reads
                        fastq_files = paired_reads

                        # Run second Estimated Coverage
                        if not args.skipEstimatedCoverage:
                            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = \
                                coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads)
                            runs['second_Coverage'] = [run_successfully_estimatedCoverage, pass_qc, time_taken,
                                                       failing]
                        else:
                            print('--skipEstimatedCoverage set. Skipping Second Estimated Coverage analysis')
                            runs['second_Coverage'] = skipped

                        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and
                                                          not estimatedCoverage < args.estimatedMinimumCoverage):
                            # Run second FastQC
                            if not args.skipFastQC:
                                (run_successfully, pass_qc, time_taken, failing, maximumReadsLength,
                                 nts2clip_based_ntsContent) = fastqc.runFastQCanalysis(outdir, threads,
                                                                                       adaptersFasta, fastq_files)
                                runs['second_FastQC'] = [run_successfully, pass_qc, time_taken, failing]
                            else:
                                print('--skipFastQC set. Skipping Second FastQC analysis')
                                runs['second_FastQC'] = skipped
                        else:
                            print('\n' + 'Estimated coverage is too lower (< ' +
                                  str(args.estimatedMinimumCoverage) +
                                  'x). This sample will not proceed with INNUca pipeline')
                            mark_not_run_from('second_FastQC')
                    else:
                        print('Trimmomatic did not run successfully or return zero reads! Skipping Second Estimated'
                              ' Coverage analysis and FastQC analysis')
                        runs['second_Coverage'] = skipped
                        runs['second_FastQC'] = skipped
                else:
                    print('--skipTrimmomatic set. Skipping Trimmomatic, but also Second FastQC analysis and Second'
                          ' Estimated Coverage analysis')
                    runs['Trimmomatic'] = skipped + ['NA']
                    runs['second_Coverage'] = skipped
                    runs['second_FastQC'] = skipped
            else:
                print('\n' + 'This sample does not pass True Coverage module QA/QC. This sample will not proceed'
                             ' with INNUca pipeline')
                mark_not_run_from('first_FastQC')
        else:
            print('\n' + 'Estimated coverage is too lower (< ' + str(args.estimatedMinimumCoverage) +
                  'x). This sample will not proceed with INNUca pipeline')
            mark_not_run_from('trueCoverage_ReMatCh')

        # Assembly part: only reached when the coverage and True Coverage checks above allow it
        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and
                                          not estimatedCoverage < args.estimatedMinimumCoverage):
            if args.skipTrueCoverage or trueCoverage_config is None or (run_successfully_trueCoverage and
                                                                        pass_qc_trueCoverage):
                # Run SPAdes
                if not args.skipSPAdes:
                    run_successfully, pass_qc, time_taken, failing, contigs_spades = \
                        spades.runSpades(sampleName, outdir, threads, fastq_files, args.spadesNotUseCareful,
                                         spadesMaxMemory, args.spadesMinCoverageAssembly,
                                         args.spadesMinContigsLength, genomeSize, args.spadesKmers,
                                         maximumReadsLength, args.spadesDefaultKmers,
                                         args.spadesMinKmerCovContigs)
                    runs['SPAdes'] = [run_successfully, pass_qc, time_taken, failing]

                    if run_successfully:
                        # Run Pilon
                        contigs = contigs_spades
                        if not args.skipPilon:
                            (run_successfully, _, time_taken, failing, assembly_polished, bam_file,
                             pilon_folder) = pilon.runPilon(jar_path_pilon, contigs_spades, fastq_files, threads,
                                                            outdir, jarMaxMemory)
                            runs['Pilon'] = [run_successfully, None, time_taken, failing]

                            if run_successfully:
                                contigs = assembly_polished

                            # Run Assembly Mapping check (needs the bam file produced by the Pilon module)
                            if bam_file is not None:
                                if not args.skipAssemblyMapping:
                                    run_successfully, pass_qc, time_taken, failing, assembly_filtered = \
                                        assembly_mapping.runAssemblyMapping(bam_file, contigs_spades, threads,
                                                                            outdir,
                                                                            args.assemblyMinCoverageContigs,
                                                                            assembly_polished, genomeSize)
                                    runs['Assembly_Mapping'] = [run_successfully, pass_qc, time_taken, failing]

                                    if run_successfully:
                                        contigs = assembly_filtered
                                else:
                                    print('--skipAssemblyMapping set. Skipping Assembly Mapping check')
                                    runs['Assembly_Mapping'] = skipped
                            else:
                                print('Pilon did not produce the bam file! Skipping Assembly Mapping check')
                                runs['Assembly_Mapping'] = skipped

                            if not args.pilonKeepFiles:
                                utils.removeDirectory(pilon_folder)
                        else:
                            print('--skipPilon set. Skipping Pilon correction and Assembly Mapping check')
                            runs['Pilon'] = skipped
                            runs['Assembly_Mapping'] = skipped

                        print('\n' + 'Final assembly: ' + contigs)
                        with open(os.path.join(outdir, 'final_assembly.txt'), 'wt') as writer:
                            writer.write(contigs + '\n')

                        # Run MLST
                        if not args.skipMLST:
                            runs['MLST'] = mlst.runMlst(contigs, scheme, outdir)
                        else:
                            print('--skipMLST set. Skipping MLST analysis')
                            runs['MLST'] = skipped
                    else:
                        print('SPAdes did not run successfully! Skipping Pilon correction, Assembly Mapping check'
                              ' and MLST analysis')
                        runs['Pilon'] = skipped
                        runs['Assembly_Mapping'] = skipped
                        runs['MLST'] = skipped
                else:
                    print('--skipSPAdes set. Skipping SPAdes Pilon correction, Assembly Mapping check and MLST'
                          ' analysis')
                    runs['SPAdes'] = skipped
                    runs['Pilon'] = skipped
                    runs['Assembly_Mapping'] = skipped
                    runs['MLST'] = skipped
    else:
        print('Moving to the next sample')
        mark_not_run_from(pipeline_steps[0])

    # Remove Trimmomatic directory with cleaned reads
    if not args.trimKeepFiles:
        if trimmomatic_folder is None:
            print('It is not possible to remove Trimmomatic directory because Trimmomatic did not run')
        else:
            # Clean-up stays best effort, but a real removal failure is no longer reported as
            # "Trimmomatic did not run"
            try:
                utils.removeDirectory(trimmomatic_folder)
            except OSError as error:
                print('It was not possible to remove Trimmomatic directory: ' + str(error))

    # Check run: a step only counts as failed when its run status is explicitly False
    run_successfully = all(runs[step][0] or runs[step][0] is None for step in runs)

    pass_fastqIntegrity = runs['FastQ_Integrity'][0]
    pass_cov = combined_qc('first_Coverage', 'second_Coverage') is not False
    pass_trueCov = runs['trueCoverage_ReMatCh'][1] is not False
    pass_fastqc = combined_qc('first_FastQC', 'second_FastQC') is not False
    pass_trimmomatic = runs['Trimmomatic'][1] is not False
    pass_spades = runs['SPAdes'][1] is not False
    pass_assemblyMapping = runs['Assembly_Mapping'][1] is not False
    pass_mlst = runs['MLST'][1] is not False

    pass_qc = all([pass_fastqIntegrity, pass_cov, pass_trueCov, pass_fastqc, pass_trimmomatic, pass_spades,
                   pass_assemblyMapping, pass_mlst])

    return run_successfully, pass_qc, runs
def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme, spadesMaxMemory, jar_path_trimmomatic,
               jar_path_pilon, jarMaxMemory, trueCoverage_config, rematch_script, species_genus, mlst_scheme_genus):
    """
    Run the INNUca modules on one sample and record the outcome of every step.

    Steps are attempted in this order: FastQ integrity, first estimated coverage, true coverage (ReMatCh),
    first FastQC, Trimmomatic, second estimated coverage, second FastQC, Pear (only when ``args.runPear`` is
    set), SPAdes, Assembly Mapping, Pilon and MLST. A step disabled through its ``args.skip*`` option is
    recorded as 'Skipped'; a step that is never reached because an earlier check stopped the sample is
    recorded as 'Not run'.

    Side effects visible in this function: an existing ``coverage_report.txt`` in ``outdir`` is deleted before
    the first coverage estimation, ``final_assembly.txt`` is written in ``outdir`` with the path of the final
    assembly, and intermediate assemblies/folders are removed according to the ``args.*Keep*`` options.

    Parameters
    ----------
    sampleName : sample identifier forwarded to the True Coverage, Trimmomatic, Pear and SPAdes modules
    outdir : directory forwarded to every module as output location
    fastq_files : reads to analyse; replaced by the Trimmomatic paired reads when trimming succeeds
    args : parsed command-line options (``threads``, ``adapters``, ``genomeSizeExpectedMb``, the ``skip*``
        switches, module thresholds, ...)
    script_path : forwarded to Trimmomatic
    scheme, species_genus, mlst_scheme_genus : forwarded to the MLST module
    spadesMaxMemory : forwarded to SPAdes
    jar_path_trimmomatic, jar_path_pilon : jar locations forwarded to Trimmomatic and Pilon
    jarMaxMemory : forwarded to Trimmomatic and Pilon
    trueCoverage_config : dict with the True Coverage settings, or None to skip that module
    rematch_script : forwarded to the True Coverage module

    Returns
    -------
    run_successfully : bool
        True when no recorded step has ``False`` as run status (``None`` is accepted)
    pass_qc : bool
        True when FastQ integrity succeeded and none of the coverage, true coverage, FastQC, SPAdes, Assembly
        Mapping and MLST verdicts, nor the Pilon run status, is ``False``
    runs : dict
        Step name -> ``[run_successfully, pass_qc, time_taken, failing]``, followed for some steps by a fifth
        field (warning, or file size for Trimmomatic)
    """
    # NOTE: print() is always called with a single string, so the output is the same under Python 2 and 3.

    threads = args.threads
    adaptersFasta = args.adapters
    if adaptersFasta is not None:
        adaptersFasta = os.path.abspath(adaptersFasta.name)
    genomeSize = args.genomeSizeExpectedMb

    # Placeholder records: [run_successfully, pass_qc, time_taken, failing]
    skipped = [None, None, 0, {'sample': 'Skipped'}]
    not_run = [None, None, 0, {'sample': 'Not run'}]

    # Steps that follow the FastQ integrity check, in pipeline order
    pipeline_steps = ('first_Coverage', 'trueCoverage_ReMatCh', 'first_FastQC', 'Trimmomatic', 'second_Coverage',
                      'second_FastQC', 'Pear', 'SPAdes', 'Assembly_Mapping', 'Pilon', 'MLST')
    # Steps whose record carries a fifth field ('NA' when there is nothing to report)
    steps_with_extra_field = ('Trimmomatic', 'first_FastQC', 'second_FastQC', 'Pear', 'SPAdes', 'Assembly_Mapping',
                              'MLST')

    runs = {}

    # Defined upfront so the clean-up code can simply test them against None
    trimmomatic_folder = None
    pear_folder = None

    def mark_not_run_from(first_step):
        """Record first_step and every later pipeline step as 'Not run'."""
        for step in pipeline_steps[pipeline_steps.index(first_step):]:
            if step in steps_with_extra_field:
                runs[step] = not_run + ['NA']
            else:
                runs[step] = not_run

    def combined_qc(first_key, second_key):
        """QA/QC verdict of the second run of a module, falling back to the first run when it is None."""
        second_pass_qc = runs[second_key][1]
        return second_pass_qc or (second_pass_qc is None and runs[first_key][1])

    # Run FastQ integrity check
    (not_corruption_found, pass_qc, time_taken, failing, fastq_encoding, min_reads_length,
     max_reads_length) = fastQintegrity.runFastQintegrity(fastq_files, threads, outdir)
    runs['FastQ_Integrity'] = [not_corruption_found, pass_qc, time_taken, failing]

    if not_corruption_found:
        # Run first Estimated Coverage
        run_successfully_estimatedCoverage = False
        estimatedCoverage = None
        run_successfully_trueCoverage = False
        pass_qc_trueCoverage = False
        if not args.skipEstimatedCoverage:
            # Check whether the Estimated Coverage output is already present
            report_file = os.path.join(outdir, 'coverage_report.txt')
            if os.path.isfile(report_file):
                os.remove(report_file)
            # Run getEstimatedCoverage
            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = \
                coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads,
                                              args.estimatedMinimumCoverage)
            runs['first_Coverage'] = [run_successfully_estimatedCoverage, pass_qc, time_taken, failing]
        else:
            print('--skipEstimatedCoverage set. Skipping First Estimated Coverage analysis')
            runs['first_Coverage'] = skipped

        trimmomatic_run_successfully = False

        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and
                                          not estimatedCoverage < args.estimatedMinimumCoverage):
            if not args.skipTrueCoverage and trueCoverage_config is not None:
                # Run True Coverage
                run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken, failing = \
                    trueCoverage.runTrueCoverage(sampleName, fastq_files, trueCoverage_config['reference_file'],
                                                 threads, outdir, trueCoverage_config['length_extra_seq'],
                                                 trueCoverage_config['minimum_depth_presence'],
                                                 trueCoverage_config['minimum_depth_call'],
                                                 trueCoverage_config['minimum_depth_frequency_dominant_allele'],
                                                 trueCoverage_config['minimum_gene_coverage'], False, False, 1,
                                                 trueCoverage_config['minimum_gene_identity'],
                                                 trueCoverage_config, rematch_script)
                runs['trueCoverage_ReMatCh'] = [run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken,
                                                failing]
            else:
                # NOTE: this branch is also taken when trueCoverage_config is None
                print('\n' + '--skipTrueCoverage set. Skipping True coverage analysis')
                runs['trueCoverage_ReMatCh'] = skipped

            if args.skipTrueCoverage or trueCoverage_config is None or (run_successfully_trueCoverage and
                                                                        pass_qc_trueCoverage):
                # Run first FastQC
                nts2clip_based_ntsContent = None
                if not args.skipFastQC:
                    (run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length,
                     nts2clip_based_ntsContent) = fastqc.runFastQCanalysis(outdir, threads, adaptersFasta,
                                                                           fastq_files, args.fastQCkeepFiles,
                                                                           'first_run')
                    runs['first_FastQC'] = [run_successfully, pass_qc, time_taken, failing, warning]
                else:
                    print('--skipFastQC set. Skipping First FastQC analysis')
                    runs['first_FastQC'] = skipped + ['NA']

                # Run Trimmomatic
                if not args.skipTrimmomatic:
                    (run_successfully, not_empty_fastq, time_taken, failing, paired_reads, trimmomatic_folder,
                     fileSize) = trimmomatic.runTrimmomatic(jar_path_trimmomatic, sampleName, outdir, threads,
                                                            adaptersFasta, script_path, args.doNotSearchAdapters,
                                                            fastq_files, max_reads_length, args.doNotTrimCrops,
                                                            args.trimCrop, args.trimHeadCrop, args.trimLeading,
                                                            args.trimTrailing, args.trimSlidingWindow,
                                                            args.trimMinLength, nts2clip_based_ntsContent,
                                                            jarMaxMemory, fastq_encoding)
                    runs['Trimmomatic'] = [run_successfully, None, time_taken, failing, fileSize]
                    trimmomatic_run_successfully = run_successfully

                    if run_successfully and not_empty_fastq:
                        # From here on, work with the cleaned reads
                        fastq_files = paired_reads
                        min_reads_length = args.trimMinLength

                        # Run second Estimated Coverage
                        if not args.skipEstimatedCoverage:
                            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = \
                                coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads,
                                                              args.estimatedMinimumCoverage)
                            runs['second_Coverage'] = [run_successfully_estimatedCoverage, pass_qc, time_taken,
                                                       failing]
                        else:
                            print('--skipEstimatedCoverage set. Skipping Second Estimated Coverage analysis')
                            runs['second_Coverage'] = skipped

                        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and
                                                          not estimatedCoverage < args.estimatedMinimumCoverage):
                            # Run second FastQC
                            if not args.skipFastQC:
                                (run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length,
                                 nts2clip_based_ntsContent) = fastqc.runFastQCanalysis(outdir, threads,
                                                                                       adaptersFasta, fastq_files,
                                                                                       args.fastQCkeepFiles,
                                                                                       'second_run')
                                runs['second_FastQC'] = [run_successfully, pass_qc, time_taken, failing, warning]
                                if run_successfully:
                                    max_reads_length = maximum_reads_length
                            else:
                                print('--skipFastQC set. Skipping Second FastQC analysis')
                                runs['second_FastQC'] = skipped + ['NA']
                        else:
                            print('\n' + 'Estimated coverage is too lower (< ' +
                                  str(args.estimatedMinimumCoverage) +
                                  'x). This sample will not proceed with INNUca pipeline')
                            mark_not_run_from('second_FastQC')
                    else:
                        print('Trimmomatic did not run successfully or return zero reads! Skipping Second Estimated'
                              ' Coverage analysis and FastQC analysis')
                        runs['second_Coverage'] = skipped
                        runs['second_FastQC'] = skipped + ['NA']
                else:
                    print('--skipTrimmomatic set. Skipping Trimmomatic, but also Second FastQC analysis and Second'
                          ' Estimated Coverage analysis')
                    runs['Trimmomatic'] = skipped + ['NA']
                    runs['second_Coverage'] = skipped
                    runs['second_FastQC'] = skipped + ['NA']

                if not args.skipFastQC and \
                        combined_qc('first_FastQC', 'second_FastQC') is False and \
                        not args.fastQCproceed:
                    print('\n' + 'This sample does not pass FastQC module QA/QC. It will not proceed with INNUca'
                                 ' pipeline')
                    mark_not_run_from('Pear')
            else:
                print('\n' + 'This sample does not pass True Coverage module QA/QC. This sample will not proceed'
                             ' with INNUca pipeline')
                mark_not_run_from('first_FastQC')
        else:
            print('\n' + 'Estimated coverage is too lower (< ' + str(args.estimatedMinimumCoverage) +
                  'x). This sample will not proceed with INNUca pipeline')
            mark_not_run_from('trueCoverage_ReMatCh')

        # Assembly part: only reached when the coverage, True Coverage and FastQC checks above allow it
        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and
                                          not estimatedCoverage < args.estimatedMinimumCoverage):
            if args.skipTrueCoverage or trueCoverage_config is None or (run_successfully_trueCoverage and
                                                                        pass_qc_trueCoverage):
                if args.skipFastQC or \
                        combined_qc('first_FastQC', 'second_FastQC') is not False or \
                        args.fastQCproceed:
                    unassembled_pe_reads = None
                    assembled_se_reads = None
                    # Run Pear
                    if args.runPear:
                        print('--runPear set. Running Pear')
                        pearMinOverlap = pear.determine_minimum_overlap(args.pearMinOverlap, min_reads_length,
                                                                        max_reads_length)
                        (run_successfully, pass_qc, time_taken, failing, unassembled_pe_reads, assembled_se_reads,
                         pear_folder, warning) = pear.runPear(fastq_files, threads, outdir, sampleName,
                                                              fastq_encoding, trimmomatic_run_successfully,
                                                              pearMinOverlap)
                        runs['Pear'] = [run_successfully, pass_qc, time_taken, failing, warning]
                    else:
                        runs['Pear'] = not_run + ['NA']

                    # Run SPAdes
                    if not args.skipSPAdes:
                        run_successfully, pass_qc, time_taken, failing, contigs_spades, warning = \
                            spades.runSpades(sampleName, outdir, threads,
                                             unassembled_pe_reads if unassembled_pe_reads is not None
                                             else fastq_files,
                                             args.spadesNotUseCareful, spadesMaxMemory,
                                             args.spadesMinCoverageAssembly, args.spadesMinContigsLength,
                                             genomeSize, args.spadesKmers, max_reads_length,
                                             args.spadesDefaultKmers, args.spadesMinKmerCovContigs,
                                             assembled_se_reads, args.saveExcludedContigs, args.maxNumberContigs)
                        runs['SPAdes'] = [run_successfully, pass_qc, time_taken, failing, warning]

                        if run_successfully:
                            contigs = contigs_spades

                            # Run Assembly Mapping check
                            bam_file = None
                            assembly_filtered = None
                            assemblyMapping_folder = None
                            if not args.skipAssemblyMapping:
                                (run_successfully, pass_qc, time_taken, failing, assembly_filtered, bam_file,
                                 assemblyMapping_folder, warning) = assembly_mapping.runAssemblyMapping(
                                    fastq_files, contigs, threads, outdir, args.assemblyMinCoverageContigs,
                                    genomeSize, args.saveExcludedContigs, args.maxNumberContigs)
                                runs['Assembly_Mapping'] = [run_successfully, pass_qc, time_taken, failing,
                                                            warning]

                                if run_successfully:
                                    contigs = assembly_filtered
                                    # The SPAdes assembly is only an intermediate when filtering produced a new file
                                    if not args.keepIntermediateAssemblies and \
                                            os.path.isfile(contigs_spades) and \
                                            contigs != contigs_spades:
                                        os.remove(contigs_spades)
                            else:
                                print('--skipAssemblyMapping set. Skipping Assembly Mapping check')
                                runs['Assembly_Mapping'] = skipped + ['NA']

                            # Run Pilon
                            if not args.skipPilon:
                                run_successfully, _, time_taken, failing, assembly_polished, pilon_folder = \
                                    pilon.runPilon(jar_path_pilon, contigs, fastq_files, threads, outdir,
                                                   jarMaxMemory, bam_file)
                                runs['Pilon'] = [run_successfully, None, time_taken, failing]

                                if run_successfully:
                                    contigs = assembly_polished
                                    if not args.keepIntermediateAssemblies and \
                                            assembly_filtered is not None and \
                                            os.path.isfile(assembly_filtered):
                                        os.remove(assembly_filtered)

                                if not args.pilonKeepFiles:
                                    utils.removeDirectory(pilon_folder)
                            else:
                                print('--skipPilon set. Skipping Pilon correction')
                                runs['Pilon'] = skipped

                            if assemblyMapping_folder is not None:
                                utils.removeDirectory(assemblyMapping_folder)

                            print('\n' + 'Final assembly: ' + contigs)
                            with open(os.path.join(outdir, 'final_assembly.txt'), 'wt') as writer:
                                writer.write(contigs + '\n')

                            # Run MLST
                            if not args.skipMLST:
                                run_successfully, pass_qc, time_taken, failing, warning = \
                                    mlst.runMlst(contigs, scheme, outdir, species_genus, mlst_scheme_genus)
                                runs['MLST'] = [run_successfully, pass_qc, time_taken, failing, warning]
                            else:
                                print('--skipMLST set. Skipping MLST analysis')
                                runs['MLST'] = skipped + ['NA']
                        else:
                            print('SPAdes did not run successfully! Skipping Pilon correction, Assembly Mapping'
                                  ' check and MLST analysis')
                            runs['Assembly_Mapping'] = skipped + ['NA']
                            runs['Pilon'] = skipped
                            runs['MLST'] = skipped + ['NA']
                    else:
                        print('--skipSPAdes set. Skipping SPAdes, Pilon correction, Assembly Mapping check and MLST'
                              ' analysis')
                        runs['SPAdes'] = skipped + ['NA']
                        runs['Assembly_Mapping'] = skipped + ['NA']
                        runs['Pilon'] = skipped
                        runs['MLST'] = skipped + ['NA']
    else:
        print('Moving to the next sample')
        mark_not_run_from(pipeline_steps[0])

    # Remove Pear directory
    if not args.pearKeepFiles and pear_folder is not None:
        utils.removeDirectory(pear_folder)

    # Remove Trimmomatic directory with cleaned reads
    if not args.trimKeepFiles and trimmomatic_folder is not None:
        utils.removeDirectory(trimmomatic_folder)

    # Check run: a step only counts as failed when its run status is explicitly False
    run_successfully = all(runs[step][0] or runs[step][0] is None for step in runs)

    # Trimmomatic and Pear verdicts are not taken into account for the sample QA/QC
    pass_fastqIntegrity = runs['FastQ_Integrity'][0]
    pass_cov = combined_qc('first_Coverage', 'second_Coverage') is not False
    pass_trueCov = runs['trueCoverage_ReMatCh'][1] is not False
    pass_fastqc = combined_qc('first_FastQC', 'second_FastQC') is not False
    pass_spades = runs['SPAdes'][1] is not False
    pass_assemblyMapping = runs['Assembly_Mapping'][1] is not False
    # Pilon reports no QA/QC verdict, so its run status is used instead
    pass_pilon = runs['Pilon'][0] is not False
    pass_mlst = runs['MLST'][1] is not False

    pass_qc = all([pass_fastqIntegrity, pass_cov, pass_trueCov, pass_fastqc, pass_spades, pass_assemblyMapping,
                   pass_pilon, pass_mlst])

    return run_successfully, pass_qc, runs
def _placeholder_step(label):
    """Return a fresh placeholder entry for a step that produced no result.

    The entry follows the layout used for every value stored in ``runs``:
    ``[run_successfully, pass_qc, time_taken, failing, warning, file_size]``.
    A new list (and new inner dicts) is built on every call so that the entries
    of different steps never alias each other.
    """
    return [None, None, 0, {'sample': label}, {}, 'NA']


def _skipped_step():
    """Placeholder entry for a step the user asked to skip (or that was bypassed)."""
    return _placeholder_step('Skipped')


def _not_run_step():
    """Placeholder entry for a step that was never reached."""
    return _placeholder_step('Not run')


def _remove_file_if_present(path):
    """Delete ``path`` when it is not None and points to an existing file."""
    if path is not None and os.path.isfile(path):
        os.remove(path)


def _remove_bam_with_index(bam_file):
    """Delete a BAM file and its ``.bai`` companion, whichever of them exist."""
    if bam_file is not None:
        _remove_file_if_present(bam_file)
        _remove_file_if_present(bam_file + '.bai')


def _combined_qc(runs, second_step, first_step):
    """Return the QC flag of ``second_step``, falling back to ``first_step``.

    The first-run flag is only consulted when the second run has no verdict
    (its ``pass_qc`` slot is None). Callers compare the result with
    ``is False`` / ``is not False``, so the raw value is returned untouched.
    """
    second_pass_qc = runs[second_step][1]
    return second_pass_qc or (second_pass_qc is None and runs[first_step][1])


def _coverage_allows_proceeding(args, ran_successfully, estimated_coverage):
    """Tell whether the estimated coverage gate lets the pipeline go on.

    The gate is open when the estimation was skipped, or when it ran
    successfully and the estimate is not below ``args.estimatedMinimumCoverage``.
    """
    return args.skipEstimatedCoverage or (
        ran_successfully and not estimated_coverage < args.estimatedMinimumCoverage)


def run_innuca(sample_name, outdir, fastq_files, args, script_path, scheme, spades_max_memory, jar_path_trimmomatic,
               jar_path_pilon, jar_max_memory, true_coverage_config, rematch_script, species_genus,
               mlst_scheme_genus, spades_version=None):
    """Run the INNUca steps for one sample and summarise their outcome.

    The steps are executed in this order, each one gated by the options found
    in ``args`` and by the outcome of the previous ones: FastQ integrity check,
    Kraken on reads, first estimated coverage, True Coverage (ReMatCh), first
    FastQC, Trimmomatic, second estimated coverage, second FastQC, Pear, SPAdes,
    Assembly Mapping check, Pilon, MLST, Kraken on the assembly and insert size
    determination. Intermediate files are removed along the way according to
    the ``keep*`` options. When SPAdes succeeds, the path of the final assembly
    is written to ``final_assembly.txt`` inside ``outdir``.

    Parameters
    ----------
    sample_name : passed to the True Coverage, Trimmomatic, Pear, SPAdes and
        insert size steps.
    outdir : directory handed to every step as its output location.
    fastq_files : input reads; replaced internally by the Trimmomatic output
        when Trimmomatic succeeds and returns reads.
    args : object exposing the pipeline options as attributes (presumably the
        argparse namespace built by the caller - confirm against caller).
    script_path : forwarded to Trimmomatic.
    scheme, species_genus, mlst_scheme_genus : forwarded to the MLST step.
    spades_max_memory : forwarded to SPAdes.
    jar_path_trimmomatic, jar_path_pilon : forwarded to Trimmomatic / Pilon.
    jar_max_memory : forwarded to both Trimmomatic and Pilon.
    true_coverage_config : mapping with the True Coverage settings read below
        (``reference_file``, ``length_extra_seq``, ...); ``None`` disables the
        True Coverage step.
    rematch_script : forwarded to the True Coverage step.
    spades_version : optional, forwarded to SPAdes.

    Returns
    -------
    tuple
        ``(run_successfully, pass_qc, runs)`` where ``runs`` maps each step
        name to ``[run_successfully, pass_qc, time_taken, failing, warning,
        file_size]``. Steps that were skipped or never reached get their own
        independent placeholder entry.
    """
    threads = args.threads
    adapters_fasta = args.adapters
    if adapters_fasta is not None:
        adapters_fasta = os.path.abspath(adapters_fasta.name)
    genome_size = args.genomeSizeExpectedMb

    # Same text is reported after the first and after the second coverage estimation
    low_coverage_message = '\n' \
                           'Estimated coverage is too lower (< {estimatedMinimumCoverage}x). This sample will not' \
                           ' proceed with INNUca pipeline'

    runs = {}

    # Run FastQ integrity check
    not_corruption_found, pass_qc, time_taken, failing, fastq_encoding, min_reads_length, max_reads_length = \
        fastQintegrity.runFastQintegrity(fastq_files, threads, outdir)
    runs['FastQ_Integrity'] = [not_corruption_found, pass_qc, time_taken, failing, {}, 'NA']

    pear_folder = None
    trimmomatic_folder = None

    if not_corruption_found:
        run_successfully_kraken = False
        pass_qc_kraken = False
        run_successfully_estimated_coverage = False
        estimated_coverage = None
        run_successfully_true_coverage = False
        pass_qc_true_coverage = False
        trimmomatic_run_successfully = False

        # Run Kraken
        if args.runKraken:
            print('\n'
                  '--runKraken set. Running Kraken for reads')
            run_successfully_kraken, pass_qc_kraken, time_taken, failing, warning, _ = \
                kraken(species=args.speciesExpected, files_to_classify=fastq_files, kraken_db=args.krakenDB,
                       files_type='fastq', outdir=outdir, version_kraken=version_kraken_global,
                       db_mem=args.krakenMemory, quick=args.krakenQuick, min_percent_covered=args.krakenMinCov,
                       max_unclassified_frag=args.krakenMaxUnclass, min_base_quality=args.krakenMinQual,
                       threads=threads)
            runs['reads_Kraken'] = [run_successfully_kraken, pass_qc_kraken, time_taken, failing, warning, 'NA']
        else:
            runs['reads_Kraken'] = _skipped_step()

        # Only a Kraken run that completed AND failed QC stops the first part here
        kraken_stops_pipeline = args.runKraken and \
            (run_successfully_kraken and not pass_qc_kraken) and \
            not args.krakenProceed and \
            not args.krakenIgnoreQC

        if kraken_stops_pipeline:
            print('\n'
                  'This sample does not pass Kraken module QA/QC. It will not proceed with INNUca pipeline')
        else:
            # Run first Estimated Coverage
            if not args.skipEstimatedCoverage:
                # Remove a coverage report left over from a previous run
                report_file = os.path.join(outdir, 'coverage_report.txt')
                if os.path.isfile(report_file):
                    os.remove(report_file)
                # Run getEstimatedCoverage
                run_successfully_estimated_coverage, pass_qc, time_taken, failing, estimated_coverage = \
                    coverage.getEstimatedCoverage(fastq_files, genome_size, outdir, threads,
                                                  args.estimatedMinimumCoverage)
                runs['first_Coverage'] = [run_successfully_estimated_coverage, pass_qc, time_taken, failing, {},
                                          'NA']
            else:
                print('--skipEstimatedCoverage set. Skipping First Estimated Coverage analysis')
                runs['first_Coverage'] = _skipped_step()

            # NOTE: the estimated coverage is deliberately NOT corrected with the Kraken most abundant
            # taxon percentage: it did not seem to be a good idea (at least for Streptococcus agalactiae).

            if _coverage_allows_proceeding(args, run_successfully_estimated_coverage, estimated_coverage):
                if not args.skipTrueCoverage and true_coverage_config is not None:
                    # Run True Coverage
                    run_successfully_true_coverage, pass_qc_true_coverage, time_taken, failing, _ = \
                        trueCoverage.run_true_coverage(sample_name, fastq_files,
                                                       true_coverage_config['reference_file'], threads, outdir,
                                                       true_coverage_config['length_extra_seq'],
                                                       true_coverage_config['minimum_depth_presence'],
                                                       true_coverage_config['minimum_depth_call'],
                                                       true_coverage_config['minimum_depth_frequency_dominant_allele'],
                                                       true_coverage_config['minimum_gene_coverage'], False,
                                                       true_coverage_config['minimum_gene_identity'],
                                                       true_coverage_config, rematch_script, num_map_loc=1,
                                                       bowtie_algorithm=args.trueCoverageBowtieAlgo,
                                                       clean_run_rematch=True)
                    runs['trueCoverage_ReMatCh'] = [run_successfully_true_coverage, pass_qc_true_coverage,
                                                    time_taken, failing, {}, 'NA']
                else:
                    print('\n' + '--skipTrueCoverage set. Skipping True coverage analysis')
                    runs['trueCoverage_ReMatCh'] = _skipped_step()

                if args.skipTrueCoverage or true_coverage_config is None or args.trueCoverageProceed or \
                        (run_successfully_true_coverage and pass_qc_true_coverage):
                    # Run first FastQC
                    nts2clip_based_nts_content = None
                    if not args.skipFastQC:
                        fastqc_ok, pass_qc, time_taken, failing, warning, maximum_reads_length, \
                            nts2clip_based_nts_content = fastqc.runFastQCanalysis(outdir, threads, adapters_fasta,
                                                                                  fastq_files,
                                                                                  args.fastQCkeepFiles,
                                                                                  'first_run')
                        runs['first_FastQC'] = [fastqc_ok, pass_qc, time_taken, failing, warning, 'NA']
                    else:
                        print('--skipFastQC set. Skipping First FastQC analysis')
                        runs['first_FastQC'] = _skipped_step()

                    # Run Trimmomatic
                    not_empty_fastq = True
                    if not args.skipTrimmomatic:
                        trimmomatic_ok, not_empty_fastq, time_taken, failing, paired_reads, trimmomatic_folder, \
                            file_size, warning = trimmomatic.runTrimmomatic(jar_path_trimmomatic, sample_name,
                                                                            outdir, threads, adapters_fasta,
                                                                            script_path, args.doNotSearchAdapters,
                                                                            fastq_files, max_reads_length,
                                                                            args.doNotTrimCrops, args.trimCrop,
                                                                            args.trimHeadCrop, args.trimLeading,
                                                                            args.trimTrailing,
                                                                            args.trimSlidingWindow,
                                                                            args.trimMinLength,
                                                                            nts2clip_based_nts_content,
                                                                            jar_max_memory, fastq_encoding)
                        # Trimmomatic has no QC verdict of its own, hence the None in the pass_qc slot
                        runs['Trimmomatic'] = [trimmomatic_ok, None, time_taken, failing, warning, file_size]
                        trimmomatic_run_successfully = trimmomatic_ok

                        if trimmomatic_ok and not_empty_fastq:
                            # From here on the pipeline works with the cleaned reads
                            fastq_files = paired_reads
                            min_reads_length = args.trimMinLength

                            # Run second Estimated Coverage
                            if not args.skipEstimatedCoverage:
                                run_successfully_estimated_coverage, pass_qc, time_taken, failing, \
                                    estimated_coverage = coverage.getEstimatedCoverage(
                                        fastq_files, genome_size, outdir, threads, args.estimatedMinimumCoverage)
                                runs['second_Coverage'] = [run_successfully_estimated_coverage, pass_qc,
                                                           time_taken, failing, {}, 'NA']
                            else:
                                print('--skipEstimatedCoverage set. Skipping Second Estimated Coverage analysis')
                                runs['second_Coverage'] = _skipped_step()

                            if _coverage_allows_proceeding(args, run_successfully_estimated_coverage,
                                                           estimated_coverage):
                                # Run second FastQC
                                if not args.skipFastQC:
                                    fastqc_ok, pass_qc, time_taken, failing, warning, maximum_reads_length, \
                                        nts2clip_based_nts_content = fastqc.runFastQCanalysis(
                                            outdir, threads, adapters_fasta, fastq_files, args.fastQCkeepFiles,
                                            'second_run')
                                    runs['second_FastQC'] = [fastqc_ok, pass_qc, time_taken, failing, warning,
                                                             'NA']
                                    if fastqc_ok:
                                        max_reads_length = maximum_reads_length
                                else:
                                    print('--skipFastQC set. Skipping Second FastQC analysis')
                                    runs['second_FastQC'] = _skipped_step()
                            else:
                                print(low_coverage_message.format(
                                    estimatedMinimumCoverage=args.estimatedMinimumCoverage))
                                runs['second_FastQC'] = _skipped_step()
                        else:
                            print('Trimmomatic did not run successfully or return zero reads! Skipping Second'
                                  ' Estimated Coverage analysis and FastQC analysis')
                            runs['second_Coverage'] = _skipped_step()
                            runs['second_FastQC'] = _skipped_step()
                    else:
                        print('--skipTrimmomatic set. Skipping Trimmomatic, but also Second FastQC analysis and'
                              ' Second Estimated Coverage analysis')
                        runs['Trimmomatic'] = _skipped_step()
                        runs['second_Coverage'] = _skipped_step()
                        runs['second_FastQC'] = _skipped_step()

                    # NOTE(review): this message is only reported when the trimmed FastQ is empty
                    # (`not not_empty_fastq`), whereas the gate below does not look at that flag - confirm
                    # this is intended. Condition kept exactly as it was.
                    if not args.skipFastQC and \
                            _combined_qc(runs, 'second_FastQC', 'first_FastQC') is False and \
                            not not_empty_fastq and not args.fastQCproceed:
                        print('\n'
                              'This sample does not pass FastQC module QA/QC. It will not proceed with INNUca'
                              ' pipeline')
                else:
                    print('\n'
                          'This sample does not pass True Coverage module QA/QC. This sample will not proceed'
                          ' with INNUca pipeline')
            else:
                print(low_coverage_message.format(estimatedMinimumCoverage=args.estimatedMinimumCoverage))

        # Every gate must be open to go on with the assembly part. The chain short-circuits, so the
        # FastQC entries of `runs` are only read once the previous gates guarantee they were stored.
        continue_second_part = bool(
            (not args.runKraken or
             (runs['reads_Kraken'][0] is True and runs['reads_Kraken'][1] is True) or
             args.krakenProceed or
             args.krakenIgnoreQC) and
            _coverage_allows_proceeding(args, run_successfully_estimated_coverage, estimated_coverage) and
            (args.skipTrueCoverage or true_coverage_config is None or args.trueCoverageProceed or
             (run_successfully_true_coverage and pass_qc_true_coverage)) and
            (args.skipFastQC or
             _combined_qc(runs, 'second_FastQC', 'first_FastQC') is not False or
             args.fastQCproceed)
        )

        if continue_second_part:
            unassembled_pe_reads = None
            assembled_se_reads = None

            # Run Pear
            if args.runPear:
                print('--runPear set. Running Pear')
                pear_min_overlap = pear.determine_minimum_overlap(args.pearMinOverlap, min_reads_length,
                                                                  max_reads_length)
                pear_ok, pass_qc, time_taken, failing, unassembled_pe_reads, assembled_se_reads, pear_folder, \
                    warning = pear.runPear(fastq_files, threads, outdir, sample_name, fastq_encoding,
                                           trimmomatic_run_successfully, pear_min_overlap)
                runs['Pear'] = [pear_ok, pass_qc, time_taken, failing, warning, 'NA']
            else:
                runs['Pear'] = _skipped_step()

            # Run SPAdes
            if not args.skipSPAdes:
                spades_ok, pass_qc, time_taken, failing, contigs_spades, warning = \
                    spades.run_spades(sample_name, outdir, threads,
                                      unassembled_pe_reads if unassembled_pe_reads is not None else fastq_files,
                                      args.spadesNotUseCareful, spades_max_memory, args.spadesMinCoverageAssembly,
                                      args.spadesMinContigsLength, genome_size, args.spadesKmers,
                                      max_reads_length, args.spadesDefaultKmers, args.spadesMinKmerCovContigs,
                                      assembled_se_reads, args.saveExcludedContigs, args.maxNumberContigs,
                                      args.keepSPAdesScaffolds, spades_version=spades_version,
                                      estimated_coverage=estimated_coverage,
                                      spades_not_use_isolate=args.spadesNotUseIsolate)
                runs['SPAdes'] = [spades_ok, pass_qc, time_taken, failing, warning, 'NA']

                if spades_ok:
                    # `contigs` always points to the most refined assembly produced so far
                    contigs = contigs_spades

                    # Run Assembly Mapping check
                    bam_file = None
                    original_bam = None
                    assembly_mapping_folder = None
                    # Assemblies that cannot be deleted right away because a kept BAM refers to them
                    possible_assemblies_bam_remove = {}

                    if not args.skipAssemblyMapping:
                        mapping_ok, pass_qc, time_taken, failing, assembly_filtered, bam_file, \
                            assembly_mapping_folder, warning, original_bam = \
                            assembly_mapping.run_assembly_mapping(fastq_files=fastq_files,
                                                                  reference_file=contigs, outdir=outdir,
                                                                  estimated_genome_size_mb=genome_size,
                                                                  max_number_contigs=args.maxNumberContigs,
                                                                  save_excluded_contigs=args.saveExcludedContigs,
                                                                  min_coverage_assembly=args.assemblyMinCoverageContigs,
                                                                  keep_bam=args.keepBAM, threads=threads)
                        runs['Assembly_Mapping'] = [mapping_ok, pass_qc, time_taken, failing, warning, 'NA']

                        if mapping_ok:
                            has_new_assembly = assembly_filtered is not None and \
                                assembly_filtered != contigs_spades
                            # Assembly to remove
                            if not args.keepIntermediateAssemblies:
                                if os.path.isfile(contigs_spades) and has_new_assembly:
                                    if not args.keepBAM:
                                        os.remove(contigs_spades)
                                    else:
                                        possible_assemblies_bam_remove['assembly_mapping'] = contigs_spades
                            if has_new_assembly and os.path.isfile(assembly_filtered):
                                contigs = assembly_filtered
                    else:
                        print('--skipAssemblyMapping set. Skipping Assembly Mapping check')
                        runs['Assembly_Mapping'] = _skipped_step()

                    # Run Pilon
                    pilon_new_bam = False
                    pilon_bam = None

                    if not args.skipPilon:
                        pilon_ok, _, time_taken, failing, assembly_polished, pilon_folder, pilon_new_bam, \
                            pilon_bam = pilon.run_pilon(jar_path_pilon=jar_path_pilon, assembly=contigs,
                                                        fastq_files=fastq_files, outdir=outdir,
                                                        jar_max_memory=jar_max_memory, alignment_file=bam_file,
                                                        keep_bam=args.keepBAM, threads=threads)
                        # Pilon has no QC verdict of its own, hence the None in the pass_qc slot
                        runs['Pilon'] = [pilon_ok, None, time_taken, failing, {}, 'NA']

                        if pilon_ok:
                            has_polished_assembly = assembly_polished is not None and \
                                os.path.isfile(assembly_polished)
                            if not args.keepIntermediateAssemblies:
                                if os.path.isfile(contigs) and has_polished_assembly:
                                    if not args.keepBAM:
                                        os.remove(contigs)
                                    elif not pilon_new_bam:
                                        possible_assemblies_bam_remove['pilon'] = contigs
                            if has_polished_assembly:
                                contigs = assembly_polished

                        # Guard against a missing folder path before asking the filesystem about it
                        if not args.pilonKeepFiles and pilon_folder is not None and os.path.isdir(pilon_folder):
                            utils.removeDirectory(pilon_folder)
                    else:
                        print('--skipPilon set. Skipping Pilon correction')
                        runs['Pilon'] = _skipped_step()

                    # Clean alignment files (and the assemblies that were only kept because of them)
                    if not args.keepBAM:
                        _remove_bam_with_index(bam_file)
                        _remove_file_if_present(original_bam)
                        _remove_file_if_present(pilon_bam)
                        _remove_file_if_present(possible_assemblies_bam_remove.get('assembly_mapping'))
                        _remove_file_if_present(possible_assemblies_bam_remove.get('pilon'))
                    elif pilon_new_bam:
                        # Pilon produced its own BAM: the Assembly Mapping one is no longer needed
                        _remove_bam_with_index(bam_file)
                        _remove_file_if_present(original_bam)
                        _remove_file_if_present(possible_assemblies_bam_remove.get('assembly_mapping'))
                    else:
                        if original_bam is not None and os.path.isfile(original_bam) and \
                                bam_file is not None and os.path.isfile(bam_file):
                            os.remove(bam_file)
                        _remove_file_if_present(possible_assemblies_bam_remove.get('pilon'))

                    if not args.skipAssemblyMapping and assembly_mapping_folder is not None:
                        utils.removeDirectory(assembly_mapping_folder)

                    print('\n' + 'Final assembly: ' + contigs)
                    with open(os.path.join(outdir, 'final_assembly.txt'), 'wt') as writer:
                        writer.write(contigs + '\n')

                    # Run MLST
                    if not args.skipMLST:
                        mlst_ok, pass_qc, time_taken, failing, warning = \
                            mlst.runMlst(contigs, scheme, outdir, species_genus, mlst_scheme_genus)
                        runs['MLST'] = [mlst_ok, pass_qc, time_taken, failing, warning, 'NA']
                    else:
                        print('--skipMLST set. Skipping MLST analysis')
                        runs['MLST'] = _skipped_step()

                    # Run Kraken
                    if args.runKraken:
                        print('\n'
                              '--runKraken set. Running Kraken for assembly')
                        kraken_ok, pass_qc, time_taken, failing, warning, _ = \
                            kraken(species=args.speciesExpected, files_to_classify=[contigs],
                                   kraken_db=args.krakenDB, files_type='fasta', outdir=outdir,
                                   version_kraken=version_kraken_global, db_mem=args.krakenMemory,
                                   quick=args.krakenQuick, min_percent_covered=args.krakenMinCov,
                                   max_unclassified_frag=args.krakenMaxUnclass,
                                   min_base_quality=args.krakenMinQual, threads=threads)
                        runs['assembly_Kraken'] = [kraken_ok, pass_qc, time_taken, failing, warning, 'NA']
                    else:
                        runs['assembly_Kraken'] = _skipped_step()

                    # Run insert_size
                    if args.runInsertSize:
                        print('\n'
                              '--runInsertSize set. Running insert_size')
                        insert_size_ok, _, time_taken, failing = \
                            insert_size(sample_name=sample_name, reference=contigs, fastq=fastq_files,
                                        outdir=outdir, threads=threads, dist=args.insertSizeDist)
                        runs['insert_size'] = [insert_size_ok, None, time_taken, failing, {}, 'NA']
                    else:
                        runs['insert_size'] = _skipped_step()
                else:
                    # The remaining steps are left out of `runs` here and reported as 'Not run' below
                    print('SPAdes did not run successfully! Skipping Pilon correction, Assembly Mapping check,'
                          ' MLST and Kraken (assembly) analysis and insert size determination')
            else:
                print('--skipSPAdes set. Skipping SPAdes, Pilon correction, Assembly Mapping check and MLST and'
                      ' Kraken (assembly) analysis and insert size determination')
                for step in ('SPAdes', 'Assembly_Mapping', 'Pilon', 'MLST', 'assembly_Kraken', 'insert_size'):
                    runs[step] = _skipped_step()
    else:
        print('Moving to the next sample')

    # Every step that was never reached gets its own 'Not run' entry
    for step in ('reads_Kraken', 'first_Coverage', 'trueCoverage_ReMatCh', 'first_FastQC', 'Trimmomatic',
                 'second_Coverage', 'second_FastQC', 'Pear', 'SPAdes', 'Assembly_Mapping', 'Pilon', 'MLST',
                 'assembly_Kraken', 'insert_size'):
        if step not in runs:
            runs[step] = _not_run_step()

    # Remove Pear directory
    if not args.pearKeepFiles and pear_folder is not None:
        utils.removeDirectory(pear_folder)

    # Remove Trimmomatic directory with cleaned reads
    if not args.trimKeepFiles and trimmomatic_folder is not None:
        utils.removeDirectory(trimmomatic_folder)

    # Check run: a step counts as fine when it succeeded or has no outcome (skipped / not run)
    run_successfully = all(runs[step][0] or runs[step][0] is None for step in runs)

    # QC verdict. Trimmomatic and Pear do not take part in it, and SPAdes is judged on its own
    # (a passing Assembly Mapping does not rescue a failing SPAdes).
    pass_fastq_integrity = runs['FastQ_Integrity'][0]
    pass_reads_kraken = runs['reads_Kraken'][1] is not False or args.krakenIgnoreQC
    pass_cov = _combined_qc(runs, 'second_Coverage', 'first_Coverage') is not False
    pass_true_cov = runs['trueCoverage_ReMatCh'][1] is not False or args.trueCoverageIgnoreQC
    pass_fastqc = _combined_qc(runs, 'second_FastQC', 'first_FastQC') is not False
    pass_spades = runs['SPAdes'][1] is not False
    pass_assembly_mapping = runs['Assembly_Mapping'][1] is not False
    # Pilon stores no QC flag, so its success flag is used instead
    pass_pilon = runs['Pilon'][0] is not False
    pass_mlst = runs['MLST'][1] is not False or args.mlstIgnoreQC
    pass_assembly_kraken = runs['assembly_Kraken'][1] is not False or args.krakenIgnoreQC

    pass_qc = all([
        pass_fastq_integrity, pass_reads_kraken, pass_cov, pass_true_cov, pass_fastqc, pass_spades,
        pass_assembly_mapping, pass_pilon, pass_mlst, pass_assembly_kraken
    ])

    return run_successfully, pass_qc, runs