示例#1
0
def get_sample_args_fastq(fastq_files_list, outdir, pairEnd_filesSeparation_list):
    """Link the given fastq files under ``<outdir>/reads/`` and detect samples.

    ``utils.removeDirectory`` is called on ``<outdir>/reads/`` before the
    directory is created, then one symbolic link per input file is placed in
    it and the directory is handed to ``utils.checkSetInputDirectory``.

    Args:
        fastq_files_list: paths of the fastq files to link.
        outdir: existing directory in which ``reads`` is created
            (``os.mkdir`` does not create missing parents).
        pairEnd_filesSeparation_list: forwarded unchanged to
            ``utils.checkSetInputDirectory``.

    Returns:
        Tuple ``(new_indir, samples, removeCreatedSamplesDirectories,
        indir_same_outdir)``; the last three values are whatever
        ``utils.checkSetInputDirectory`` returned.
    """
    new_indir = os.path.join(outdir, 'reads', '')
    utils.removeDirectory(new_indir)
    os.mkdir(new_indir)
    for fastq in fastq_files_list:
        fastq_link = os.path.join(new_indir, os.path.basename(fastq))
        # A relative link target is resolved against the directory that holds
        # the link, not the current working directory, so store it absolute
        # to avoid creating dangling links for relative input paths.
        os.symlink(os.path.abspath(fastq), fastq_link)
    samples, removeCreatedSamplesDirectories, indir_same_outdir = utils.checkSetInputDirectory(new_indir, outdir, pairEnd_filesSeparation_list)
    return new_indir, samples, removeCreatedSamplesDirectories, indir_same_outdir
示例#2
0
def get_sample_args_fastq(fastq_files_list, outdir,
                          pairEnd_filesSeparation_list):
    """Stage fastq files as symlinks in ``<outdir>/reads/`` and list samples.

    The ``reads`` directory is passed to ``utils.removeDirectory`` and then
    created again, each input file is linked into it under its base name, and
    ``utils.checkSetInputDirectory`` is run on the result.

    Args:
        fastq_files_list: paths of the fastq files to link.
        outdir: existing directory that will contain ``reads``.
        pairEnd_filesSeparation_list: passed through to
            ``utils.checkSetInputDirectory``.

    Returns:
        ``(new_indir, samples, removeCreatedSamplesDirectories,
        indir_same_outdir)``, the last three as returned by
        ``utils.checkSetInputDirectory``.
    """
    new_indir = os.path.join(outdir, 'reads', '')
    utils.removeDirectory(new_indir)
    os.mkdir(new_indir)
    for fastq in fastq_files_list:
        # Symlink targets are interpreted relative to the link's own
        # directory; an absolute target keeps links to relative inputs valid.
        link_target = os.path.abspath(fastq)
        fastq_link = os.path.join(new_indir, os.path.basename(fastq))
        os.symlink(link_target, fastq_link)
    samples, removeCreatedSamplesDirectories, indir_same_outdir = utils.checkSetInputDirectory(
        new_indir, outdir, pairEnd_filesSeparation_list)
    return new_indir, samples, removeCreatedSamplesDirectories, indir_same_outdir
示例#3
0
def main():
    """Command-line entry point of seq_typing.py.

    Refuses to run on Python < 3, parses the command line, prepares the
    output directory and logger, runs the sub-command stored in
    ``args.func``, parses its results and, unless ``args.debug`` is set,
    removes the temporary folders collected along the way.
    """
    script_name = 'seq_typing.py'

    if sys.version_info[0] < 3:
        python_hint = 'Must be using Python 3. Try calling "python3 {}"'.format(script_name)
        sys.exit(python_hint)

    parser, _, _, _, _ = python_arguments(script_name, __version__)
    args = parser.parse_args()

    start_time = time.time()

    # Work with an absolute output path from here on
    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)

    # The returned script path is not needed here
    utils.general_information(script_name=script_name,
                              logfile=logfile,
                              version=__version__,
                              outdir=args.outdir,
                              time_str=time_str)
    print('\n')

    # Create modules pickles folder
    pickles_folder = os.path.join(args.outdir, 'pickles', '')
    if not os.path.isdir(pickles_folder):
        os.makedirs(pickles_folder)
    temporary_folders = [pickles_folder]

    # Run functions
    function_outcome = args.func(args)
    function_folders, references_results, reference, references_headers = function_outcome
    temporary_folders.extend(function_folders)

    # Parse results
    _, _, _, _, _ = parse_results.parse_results(references_results,
                                                reference,
                                                references_headers,
                                                args.outdir,
                                                args.minGeneCoverage,
                                                args.minDepthCoverage,
                                                args.typeSeparator)

    # Temporary folders are kept when debugging
    if not args.debug:
        for temporary_folder in temporary_folders:
            utils.removeDirectory(temporary_folder)

    utils.runTime(start_time)
示例#4
0
def sequence_data(sample, reference_file, bam_file, outdir, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, rematch):
    """Analyse every reference sequence in a process pool and gather the results.

    A ``sequence_data`` folder is (re)created inside ``outdir`` with one
    sub-folder per key of the sequences returned by
    ``utils.get_sequence_information``. For each of them
    ``rematch.analyse_sequence_data`` is scheduled on a pool of ``threads``
    processes; once the pool has finished, ``rematch.gather_data_together``
    collects the output.

    Returns:
        Tuple ``(run_successfully, sample_data, consensus_files,
        consensus_sequences)`` as produced by ``rematch.gather_data_together``.
    """
    data_root = os.path.join(outdir, 'sequence_data', '')
    utils.removeDirectory(data_root)
    os.mkdir(data_root)

    sequences, _ = utils.get_sequence_information(reference_file, length_extra_seq)

    workers = multiprocessing.Pool(processes=threads)
    for seq_id in sequences:
        seq_dir = os.path.join(data_root, str(seq_id), '')
        utils.removeDirectory(seq_dir)
        os.makedirs(seq_dir)
        job_arguments = (
            bam_file,
            sequences[seq_id],
            seq_dir,
            seq_id,
            reference_file,
            length_extra_seq,
            minimum_depth_presence,
            minimum_depth_call,
            minimum_depth_frequency_dominant_allele,
        )
        # The async result is not kept: worker return values are not inspected here
        workers.apply_async(rematch.analyse_sequence_data, args=job_arguments)
    workers.close()
    workers.join()

    # Drop the last two '/'-separated components of outdir
    gather_outdir = outdir.rsplit('/', 2)[0]
    gathered = rematch.gather_data_together(sample, data_root, sequences, gather_outdir, debug_mode_true, length_extra_seq, False)
    run_successfully, sample_data, consensus_files, consensus_sequences = gathered

    return run_successfully, sample_data, consensus_files, consensus_sequences
示例#5
0
def rematch_for_different_references(fastq,
                                     references_files,
                                     threads,
                                     outdir,
                                     extraSeq,
                                     minCovPresence,
                                     minCovCall,
                                     minFrequencyDominantAllele,
                                     minGeneCoverage,
                                     debug,
                                     minGeneIdentity,
                                     rematch_module,
                                     doNotRemoveConsensus,
                                     bowtie_algorithm,
                                     clean_run_rematch=False):
    """Run the ReMatCh module once per reference file and pickle each result.

    For every reference a dedicated folder ``<basename>_<index>`` is created
    in ``outdir``, the companion ``<reference>.pkl`` is loaded and
    ``rematch_module.run_rematch_module`` is executed. The per-gene data of a
    successful run is saved to ``<outdir>/<basename>_<index>.pkl``; a failed
    run terminates the program through ``sys.exit``.

    Returns:
        dict mapping each reference path to the pickle file holding its
        results.
    """
    pickles_by_reference = {}
    for index, reference in enumerate(references_files):
        run_name = '{}_{}'.format(os.path.basename(reference), index)
        ref_dir = os.path.join(outdir, run_name, '')
        os.makedirs(ref_dir)

        header_gene_list, seq_reference_dict = utils.extractVariableFromPickle(reference + '.pkl')

        rematch_outcome = rematch_module.run_rematch_module(
            'sample', fastq, reference, threads, ref_dir, extraSeq,
            minCovPresence, minCovCall, minFrequencyDominantAllele,
            minGeneCoverage, debug, 1, minGeneIdentity, 'first', 7, 'none',
            seq_reference_dict, 'X', bowtie_algorithm, None, header_gene_list,
            not doNotRemoveConsensus, clean_run=clean_run_rematch)
        # Only the success flag, the per-gene data and the consensus files are used
        _, run_successfully, data_by_gene, _, consensus_files, _ = rematch_outcome

        if not run_successfully:
            sys.exit('Something went wrong while running ReMatCh for reference {reference}'.format(reference=reference))

        pickleFile = os.path.join(outdir, run_name + '.pkl')
        utils.saveVariableToPickle(data_by_gene, pickleFile)
        pickles_by_reference[reference] = pickleFile

        clean_rematch_folder(consensus_files, reference, ref_dir, doNotRemoveConsensus, debug)
        if not (debug or doNotRemoveConsensus):
            utils.removeDirectory(ref_dir)
    return pickles_by_reference
示例#6
0
def run_rematch(rematch_script,
                outdir,
                references_files,
                fastq,
                threads,
                extraSeq,
                minCovPresence,
                minCovCall,
                minFrequencyDominantAllele,
                minGeneCoverage,
                minGeneIdentity,
                debug,
                doNotRemoveConsensus,
                bowtie_algorithm,
                clean_run_rematch=False):
    """Prepare ``<outdir>/rematch/`` and run ReMatCh against every reference.

    ``rematch_module`` is imported from the ``modules`` folder located next
    to ``rematch_script`` (that folder is appended to ``sys.path``) and the
    work is delegated to ``rematch_for_different_references``.

    Returns:
        Tuple ``(references_results, module_dir)``: the value returned by
        ``rematch_for_different_references`` and the folder created here.
    """
    rematch_workdir = os.path.join(outdir, 'rematch', '')
    utils.removeDirectory(rematch_workdir)
    os.makedirs(rematch_workdir)

    # Make the ReMatCh modules importable before importing rematch_module
    rematch_modules_path = os.path.join(os.path.dirname(rematch_script), 'modules')
    sys.path.append(rematch_modules_path)
    import rematch_module

    results_by_reference = rematch_for_different_references(
        fastq, references_files, threads, rematch_workdir, extraSeq,
        minCovPresence, minCovCall, minFrequencyDominantAllele,
        minGeneCoverage, debug, minGeneIdentity, rematch_module,
        doNotRemoveConsensus, bowtie_algorithm,
        clean_run_rematch=clean_run_rematch)

    return results_by_reference, rematch_workdir
示例#7
0
def run_rematch(rematch, outdir, reference_file, bam_file, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, minimum_gene_coverage, minimum_gene_identity, debug_mode_true, doNotRemoveConsensus):
    """Analyse alignment data with the ReMatCh module and summarise it.

    ``<outdir>/rematch/`` is (re)created as working folder, ``rematch_module``
    is imported from the ``modules`` folder next to the ``rematch`` script and
    ``sequence_data`` is run on the BAM file. When that succeeds, general
    statistics are computed with ``determine_general_statistics``.

    Args:
        rematch: path of the ReMatCh script; only its directory is used, to
            locate the ``modules`` folder.
        outdir: directory in which the ``rematch`` working folder is created.
        debug_mode_true: when true the working folder is kept.
        doNotRemoveConsensus: forwarded to ``clean_rematch_folder``.
        (the remaining arguments are forwarded to ``sequence_data`` /
        ``determine_general_statistics``)

    Returns:
        Tuple ``(run_successfully, statistics, sample_data)`` where
        ``statistics`` is a dict with the keys ``number_absent_genes``,
        ``number_genes_multiple_alleles`` and ``mean_sample_coverage``
        (rounded to two decimals). The three values are ``None`` when the
        analysis did not run successfully.
    """
    module_dir = os.path.join(outdir, 'rematch', '')
    utils.removeDirectory(module_dir)
    os.makedirs(module_dir)

    sys.path.append(os.path.join(os.path.dirname(rematch), 'modules'))
    # Imported under its own name so the ``rematch`` argument is not rebound
    import rematch_module

    print('Analysing alignment data')
    run_successfully, sample_data, consensus_files, consensus_sequences = sequence_data('sample', reference_file, bam_file, module_dir, threads, length_extra_seq, minimum_depth_presence, minimum_depth_call, minimum_depth_frequency_dominant_allele, debug_mode_true, rematch_module)

    # Explicit defaults instead of probing locals() afterwards
    number_absent_genes = None
    number_genes_multiple_alleles = None
    mean_sample_coverage = None
    if run_successfully:
        number_absent_genes, number_genes_multiple_alleles, mean_sample_coverage = \
            determine_general_statistics(outdir, sample_data=sample_data, minimum_gene_coverage=minimum_gene_coverage,
                                         minimum_gene_identity=minimum_gene_identity)

    if not debug_mode_true:
        utils.removeDirectory(module_dir)

    clean_rematch_folder(consensus_files, bam_file, reference_file, outdir, doNotRemoveConsensus, debug_mode_true)

    statistics = {
        'number_absent_genes': number_absent_genes,
        'number_genes_multiple_alleles': number_genes_multiple_alleles,
        # round() cannot handle None, so only round an actual value
        'mean_sample_coverage': round(mean_sample_coverage, 2) if mean_sample_coverage is not None else None,
    }
    return run_successfully, statistics, sample_data
示例#8
0
def main():
    # Entry point of the INNUca.py pipeline (Python 2 syntax).
    #
    # Parses the command line, prints run information, checks that the
    # external programs needed by the selected modules are available, runs
    # run_INNUca() on every sample found and writes the per-sample and
    # combined reports.
    #
    # Exits through sys.exit() when a required program is missing or when no
    # sample ran successfully.
    version = '3.1'
    args = utils.parseArguments(version)

    general_start_time = time.time()
    # Timestamp reused in the names of the report files written below
    time_str = time.strftime("%Y%m%d-%H%M%S")

    # Check if output directory exists
    outdir = os.path.abspath(os.path.join(args.outdir, ''))
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Start logger
    # sys.stdout is replaced by a utils.Logger instance, so every print below
    # goes through it
    if not args.noLog:
        sys.stdout = utils.Logger(outdir, time_str)

    print '\n' + '==========> INNUca.py <=========='
    print '\n' + 'Program start: ' + time.ctime()

    # Tells where the logfile will be stored
    if not args.noLog:
        print '\n' + 'LOGFILE:'
        print sys.stdout.getLogFile()

    # Print command
    print '\n' + 'COMMAND:'
    script_path = os.path.abspath(sys.argv[0])
    print sys.executable + ' ' + script_path + ' ' + ' '.join(sys.argv[1:])

    # Print the directory from which the program was launched
    print '\n' + 'PRESENT DIRECTORY:'
    print os.getcwd()

    # Print program version
    print '\n' + 'VERSION INNUca.py:'
    utils.scriptVersionGit(version, os.getcwd(), script_path, args.noGitInfo)

    # Get CPU information
    utils.get_cpu_information(outdir, time_str)

    # Get trueCoverage_ReMatCh settings
    trueCoverage_config = get_trueCoverage_config(args.skipTrueCoverage, args.trueConfigFile.name if args.trueConfigFile is not None else None, args.speciesExpected, script_path)

    # Check programs
    # Each entry is [version option, comparison operator, required version]
    programs_version_dictionary = {}
    programs_version_dictionary['gunzip'] = ['--version', '>=', '1.6']

    # Java check first for java dependents check next
    if not (args.skipFastQC and args.skipTrimmomatic and (args.skipPilon or args.skipSPAdes)):
        # programs_version_dictionary['java'] = ['-version', '>=', '1.8']
        programs_version_dictionary['java'] = [None, '>=', '1.8']  # For OpenJDK compatibility
    # First check: gunzip (and java when needed), before the PATH is changed below
    missingPrograms, programs_version_dictionary = utils.checkPrograms(programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # Register the programs required by each module that is not skipped
    if not args.skipTrueCoverage or trueCoverage_config is not None:
        include_rematch_dependencies_path(args.doNotUseProvidedSoftware)
        programs_version_dictionary['rematch.py'] = ['--version', '>=', '3.2']
        programs_version_dictionary['bcftools'] = ['--version', '==', '1.3.1']
    if not (args.skipTrueCoverage and ((args.skipAssemblyMapping and args.skipPilon) or args.skipSPAdes)):
        programs_version_dictionary['bowtie2'] = ['--version', '>=', '2.2.9']
        programs_version_dictionary['samtools'] = ['--version', '==', '1.3.1']
    if not args.skipFastQC:
        programs_version_dictionary['fastqc'] = ['--version', '==', '0.11.5']
    if not args.skipTrimmomatic:
        programs_version_dictionary['trimmomatic-0.36.jar'] = ['-version', '==', '0.36']
    if args.runPear:
        programs_version_dictionary['pear'] = ['--version', '>=', '0.9.10']
    if not args.skipSPAdes:
        programs_version_dictionary['spades.py'] = ['--version', '>=', '3.9.0']
    if not (args.skipPilon or args.skipSPAdes):
        programs_version_dictionary['pilon-1.18.jar'] = ['--version', '==', '1.18']
    if not (args.skipMLST or args.skipSPAdes):
        programs_version_dictionary['mlst'] = ['--version', '>=', '2.4']

    # Set and print PATH variable
    utils.setPATHvariable(args, script_path)

    # Second check: the whole dictionary, after the PATH was set
    missingPrograms, programs_version_dictionary = utils.checkPrograms(programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # .jar paths
    # Index 3 of an entry is read as the program location; presumably it is
    # appended by utils.checkPrograms - TODO confirm
    jar_path_trimmomatic = None
    if not args.skipTrimmomatic:
        jar_path_trimmomatic = programs_version_dictionary['trimmomatic-0.36.jar'][3]

    jar_path_pilon = None
    if not args.skipPilon and not args.skipSPAdes:
        jar_path_pilon = programs_version_dictionary['pilon-1.18.jar'][3]

    rematch_script = None
    # ReMatCh path
    if not args.skipTrueCoverage:
        rematch_script = programs_version_dictionary['rematch.py'][3]

    # The pair-end file separation option is currently disabled (always None)
    # pairEnd_filesSeparation_list = args.pairEnd_filesSeparation
    pairEnd_filesSeparation_list = None
    samples, inputDirectory, removeCreatedSamplesDirectories, indir_same_outdir = get_samples(args.inputDirectory, args.fastq, outdir, pairEnd_filesSeparation_list)

    # Start running the analysis
    print '\n' + 'RUNNING INNUca.py'

    # Prepare run report file
    samples_report_path = os.path.join(outdir, 'samples_report.' + time_str + '.tab')
    utils.start_sample_report_file(samples_report_path)

    # Counters for the summary printed at the end
    number_samples_successfully = 0
    number_samples_pass = 0
    number_samples_warning = 0

    # Get MLST scheme to use
    scheme = 'unknown'
    species_genus, mlst_scheme_genus = None, None
    if not args.skipMLST and not args.skipSPAdes:
        scheme, species_genus, mlst_scheme_genus = mlst.getScheme(args.speciesExpected)
        # Print path to blastn
        mlst.getBlastPath()

    # Memory
    # NOTE(review): the division by 1024 ** 2 assumes utils.get_free_memory()
    # returns kB - confirm
    available_memory_GB = utils.get_free_memory() / (1024.0 ** 2)
    # Determine SPAdes maximum memory
    spadesMaxMemory = None
    if not args.skipSPAdes:
        print ''
        spadesMaxMemory = spades.define_memory(args.spadesMaxMemory, args.threads, available_memory_GB)
    # Determine .jar maximum memory
    jarMaxMemory = 'off'
    if not (args.skipTrimmomatic and (args.skipSPAdes or args.skipPilon)):
        print ''
        jarMaxMemory = utils.define_jar_max_memory(args.jarMaxMemory, args.threads, available_memory_GB)

    # Run INNUca for each sample
    sample_report_json = {}
    for sample in samples:
        sample_start_time = time.time()

        print '\n' + 'Sample: ' + sample + '\n'

        # Create sample outdir
        sample_outdir = os.path.abspath(os.path.join(outdir, sample, ''))
        if not os.path.isdir(sample_outdir):
            os.makedirs(sample_outdir)

        # Get fastq files
        # Samples skipped with "continue" below get no report entry, but they
        # are still included in len(samples) used for the final summary
        fastq_files = utils.searchFastqFiles(os.path.join(inputDirectory, sample, ''), pairEnd_filesSeparation_list, False)
        if len(fastq_files) == 1:
            print 'Only one fastq file was found: ' + str(fastq_files)
            print 'Pair-End sequencing is required. Moving to the next sample'
            continue
        elif len(fastq_files) == 0:
            print 'No compressed fastq files were found. Continue to the next sample'
            continue

        print 'The following files will be used:'
        print str(fastq_files) + '\n'

        # Run INNUca.py analysis
        run_successfully, pass_qc, run_report = run_INNUca(sample, sample_outdir, fastq_files, args, script_path, scheme, spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon, jarMaxMemory, trueCoverage_config, rematch_script, species_genus, mlst_scheme_genus)

        # Save sample fail report
        utils.write_fail_report(os.path.join(sample_outdir, 'fail_report.txt'), run_report)

        # Save warning report
        write_warning_report(os.path.join(sample_outdir, 'warning_report.txt'), run_report)

        # Get raw reads files size
        # Must be computed before the sample directory may be removed below
        fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files)

        # Remove sample directory if it was created during the process
        if removeCreatedSamplesDirectories and not indir_same_outdir:
            utils.removeDirectory(os.path.join(inputDirectory, sample, ''))

        print 'END ' + sample + ' analysis'
        time_taken = utils.runTime(sample_start_time)

        # Save run report
        warning, json_pass_qc = utils.write_sample_report(samples_report_path, sample, run_successfully, pass_qc, time_taken, fileSize, run_report)

        # Save runs statistics
        if run_successfully:
            number_samples_successfully += 1
        # A sample passing QC is counted either as WARNING or as PASS, never both
        if pass_qc:
            if warning:
                number_samples_warning += 1
            else:
                number_samples_pass += 1

        sample_report_json[sample] = {'run_successfully': run_successfully, 'pass_qc': json_pass_qc, 'modules_run_report': run_report}

    # Save combine_samples_reports
    combine_reports.combine_reports(outdir, outdir, args.json, time_str, len(samples))

    # Save sample_report in json
    if args.json:
        import json
        with open(os.path.join(outdir, 'samples_report.' + time_str + '.json'), 'wt') as writer:
            json.dump(sample_report_json, writer)

    # Remove temporary folder with symlink to fastq files in case of --fastq use
    if args.inputDirectory is None and args.fastq is not None:
        utils.removeDirectory(os.path.join(inputDirectory, ''))

    # Run report
    # FAIL is derived: every sample that is neither PASS nor WARNING
    print '\n' + 'END INNUca.py'
    print '\n' + 'Pipeline problems: {not_run_successfully} samples'.format(not_run_successfully=(len(samples) - number_samples_successfully))
    print '\n' + 'FAIL: {number_samples_fail} samples'.format(number_samples_fail=(len(samples) - number_samples_pass - number_samples_warning))
    print '\n' + 'WARNING: {number_samples_warning} samples'.format(number_samples_warning=number_samples_warning)
    print '\n' + 'PASS: {number_samples_pass} samples'.format(number_samples_pass=number_samples_pass)
    time_taken = utils.runTime(general_start_time)
    del time_taken

    # Check whether INNUca.py ran at least one sample successfully
    if number_samples_successfully == 0:
        sys.exit('No samples run successfully!')
示例#9
0
def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme, spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon, jarMaxMemory, trueCoverage_config, rematch_script, species_genus, mlst_scheme_genus):
    # Runs the INNUca modules on one sample, in this order: FastQ integrity,
    # first estimated coverage, trueCoverage_ReMatCh, first FastQC,
    # Trimmomatic, second estimated coverage, second FastQC, Pear, SPAdes,
    # Assembly Mapping, Pilon and MLST (Python 2 syntax).
    #
    # Each module is gated by the command-line "skip" options in args and by
    # the outcome of the previous modules.
    #
    # Returns (run_successfully, pass_qc, runs). runs maps each module name to
    # a list whose first four elements are
    # [run_successfully, pass_qc, time_taken, failing]; several modules carry
    # a fifth element (a warning, a file size or 'NA').
    threads = args.threads
    adaptersFasta = args.adapters
    if adaptersFasta is not None:
        adaptersFasta = os.path.abspath(adaptersFasta.name)
    genomeSize = args.genomeSizeExpectedMb
    # Template entries for modules that were skipped / not run. They are
    # stored directly (same list object) for 4-element modules, while
    # "+ ['NA']" builds a new 5-element list for the others.
    skipped = [None, None, 0, {'sample': 'Skipped'}]
    not_run = [None, None, 0, {'sample': 'Not run'}]

    runs = {}

    # Run FastQ integrity check
    not_corruption_found, pass_qc, time_taken, failing, fastq_encoding, min_reads_length, max_reads_length = fastQintegrity.runFastQintegrity(fastq_files, threads, outdir)
    runs['FastQ_Integrity'] = [not_corruption_found, pass_qc, time_taken, failing]

    if not_corruption_found:
        # Run first Estimated Coverage
        # Defaults used by the gate conditions below when a module is skipped
        run_successfully_estimatedCoverage = False
        estimatedCoverage = None
        run_successfully_trueCoverage = False
        pass_qc_trueCoverage = False
        if not args.skipEstimatedCoverage:
            # Check whether the Estimated Coverage output is already present
            report_file = os.path.join(outdir, 'coverage_report.txt')
            if os.path.isfile(report_file):
                os.remove(report_file)
            # Run getEstimatedCoverage
            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads, args.estimatedMinimumCoverage)
            runs['first_Coverage'] = [run_successfully_estimatedCoverage, pass_qc, time_taken, failing]
        else:
            print '--skipEstimatedCoverage set. Skipping First Estimated Coverage analysis'
            runs['first_Coverage'] = skipped

        trimmomatic_run_successfully = False

        # Coverage gate: proceed when the estimation was skipped, or when it ran
        # and the coverage is not below the minimum
        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and not estimatedCoverage < args.estimatedMinimumCoverage):
            if not args.skipTrueCoverage and trueCoverage_config is not None:
                # Run True Coverage
                run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken, failing = trueCoverage.runTrueCoverage(sampleName, fastq_files, trueCoverage_config['reference_file'], threads, outdir, trueCoverage_config['length_extra_seq'], trueCoverage_config['minimum_depth_presence'], trueCoverage_config['minimum_depth_call'], trueCoverage_config['minimum_depth_frequency_dominant_allele'], trueCoverage_config['minimum_gene_coverage'], False, False, 1, trueCoverage_config['minimum_gene_identity'], trueCoverage_config, rematch_script)
                runs['trueCoverage_ReMatCh'] = [run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken, failing]
            else:
                # NOTE(review): this branch is also taken when trueCoverage_config is
                # None without --skipTrueCoverage, yet the message only mentions the option
                print '\n' + '--skipTrueCoverage set. Skipping True coverage analysis'
                runs['trueCoverage_ReMatCh'] = skipped

            # True Coverage gate: proceed when it was not run, or when it ran and passed QC
            if args.skipTrueCoverage or trueCoverage_config is None or (run_successfully_trueCoverage and pass_qc_trueCoverage):
                # Run first FastQC
                nts2clip_based_ntsContent = None
                if not args.skipFastQC:
                    run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length, nts2clip_based_ntsContent = fastqc.runFastQCanalysis(outdir, threads, adaptersFasta, fastq_files, args.fastQCkeepFiles, 'first_run')
                    runs['first_FastQC'] = [run_successfully, pass_qc, time_taken, failing, warning]
                else:
                    print '--skipFastQC set. Skipping First FastQC analysis'
                    runs['first_FastQC'] = skipped + ['NA']

                # Run Trimmomatic
                if not args.skipTrimmomatic:
                    # max_reads_length used here is the value returned by the FastQ integrity check
                    run_successfully, not_empty_fastq, time_taken, failing, paired_reads, trimmomatic_folder, fileSize = trimmomatic.runTrimmomatic(jar_path_trimmomatic, sampleName, outdir, threads, adaptersFasta, script_path, args.doNotSearchAdapters, fastq_files, max_reads_length, args.doNotTrimCrops, args.trimCrop, args.trimHeadCrop, args.trimLeading, args.trimTrailing, args.trimSlidingWindow, args.trimMinLength, nts2clip_based_ntsContent, jarMaxMemory, fastq_encoding)
                    runs['Trimmomatic'] = [run_successfully, None, time_taken, failing, fileSize]
                    trimmomatic_run_successfully = run_successfully

                    if run_successfully and not_empty_fastq:
                        # From here on the trimmed reads replace the raw reads
                        fastq_files = paired_reads
                        min_reads_length = args.trimMinLength

                        # Run second Estimated Coverage
                        if not args.skipEstimatedCoverage:
                            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = coverage.getEstimatedCoverage(fastq_files, genomeSize, outdir, threads, args.estimatedMinimumCoverage)
                            runs['second_Coverage'] = [run_successfully_estimatedCoverage, pass_qc, time_taken, failing]
                        else:
                            print '--skipEstimatedCoverage set. Skipping Second Estimated Coverage analysis'
                            runs['second_Coverage'] = skipped

                        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and not estimatedCoverage < args.estimatedMinimumCoverage):
                            # Run second FastQC
                            if not args.skipFastQC:
                                run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length, nts2clip_based_ntsContent = fastqc.runFastQCanalysis(outdir, threads, adaptersFasta, fastq_files, args.fastQCkeepFiles, 'second_run')
                                runs['second_FastQC'] = [run_successfully, pass_qc, time_taken, failing, warning]
                                # Only the second FastQC run updates the maximum reads length
                                if run_successfully:
                                    max_reads_length = maximum_reads_length
                            else:
                                print '--skipFastQC set. Skipping Second FastQC analysis'
                                runs['second_FastQC'] = skipped + ['NA']
                        else:
                            print '\n' + 'Estimated coverage is too lower (< ' + str(args.estimatedMinimumCoverage) + 'x). This sample will not proceed with INNUca pipeline'
                            runs['second_FastQC'] = not_run + ['NA']
                            runs['Pear'] = not_run + ['NA']
                            runs['SPAdes'] = not_run + ['NA']
                            runs['Assembly_Mapping'] = not_run + ['NA']
                            runs['Pilon'] = not_run
                            runs['MLST'] = not_run + ['NA']
                    else:
                        print 'Trimmomatic did not run successfully or return zero reads! Skipping Second Estimated Coverage analysis and FastQC analysis'
                        runs['second_Coverage'] = skipped
                        runs['second_FastQC'] = skipped + ['NA']

                else:
                    print '--skipTrimmomatic set. Skipping Trimmomatic, but also Second FastQC analysis and Second Estimated Coverage analysis'
                    runs['Trimmomatic'] = skipped + ['NA']
                    runs['second_Coverage'] = skipped
                    runs['second_FastQC'] = skipped + ['NA']

                # FastQC verdict: the second run's pass_qc is used; when it is None the
                # first run's value is used instead. Only an explicit False stops the
                # sample (None, i.e. no verdict, does not).
                if not args.skipFastQC and (runs['second_FastQC'][1] or (runs['second_FastQC'][1] is None and runs['first_FastQC'][1])) is False and not args.fastQCproceed:
                    print '\n' + 'This sample does not pass FastQC module QA/QC. It will not proceed with INNUca pipeline'
                    runs['Pear'] = not_run + ['NA']
                    runs['SPAdes'] = not_run + ['NA']
                    runs['Assembly_Mapping'] = not_run + ['NA']
                    runs['Pilon'] = not_run
                    runs['MLST'] = not_run + ['NA']
            else:
                print '\n' + 'This sample does not pass True Coverage module QA/QC. This sample will not proceed with INNUca pipeline'
                runs['first_FastQC'] = not_run + ['NA']
                runs['Trimmomatic'] = not_run + ['NA']
                runs['second_Coverage'] = not_run
                runs['second_FastQC'] = not_run + ['NA']
                runs['Pear'] = not_run + ['NA']
                runs['SPAdes'] = not_run + ['NA']
                runs['Assembly_Mapping'] = not_run + ['NA']
                runs['Pilon'] = not_run
                runs['MLST'] = not_run + ['NA']

        else:
            print '\n' + 'Estimated coverage is too lower (< ' + str(args.estimatedMinimumCoverage) + 'x). This sample will not proceed with INNUca pipeline'
            runs['trueCoverage_ReMatCh'] = not_run
            runs['first_FastQC'] = not_run + ['NA']
            runs['Trimmomatic'] = not_run + ['NA']
            runs['second_Coverage'] = not_run
            runs['second_FastQC'] = not_run + ['NA']
            runs['Pear'] = not_run + ['NA']
            runs['SPAdes'] = not_run + ['NA']
            runs['Assembly_Mapping'] = not_run + ['NA']
            runs['Pilon'] = not_run
            runs['MLST'] = not_run + ['NA']

        # Assembly stage: the coverage, True Coverage and FastQC gates are evaluated
        # again (the coverage one now with the latest estimation available)
        if args.skipEstimatedCoverage or (run_successfully_estimatedCoverage and not estimatedCoverage < args.estimatedMinimumCoverage):
            if args.skipTrueCoverage or trueCoverage_config is None or (run_successfully_trueCoverage and pass_qc_trueCoverage):
                if args.skipFastQC or (runs['second_FastQC'][1] or (runs['second_FastQC'][1] is None and runs['first_FastQC'][1])) is not False or args.fastQCproceed:
                    unassembled_pe_reads = None
                    assembled_se_reads = None
                    # Run Pear
                    if args.runPear:
                        print '--runPear set. Running Pear'
                        pearMinOverlap = pear.determine_minimum_overlap(args.pearMinOverlap, min_reads_length, max_reads_length)
                        run_successfully, pass_qc, time_taken, failing, unassembled_pe_reads, assembled_se_reads, pear_folder, warning = pear.runPear(fastq_files, threads, outdir, sampleName, fastq_encoding, trimmomatic_run_successfully, pearMinOverlap)
                        runs['Pear'] = [run_successfully, pass_qc, time_taken, failing, warning]
                    else:
                        runs['Pear'] = not_run + ['NA']

                    # Run SPAdes
                    if not args.skipSPAdes:
                        # Pear's unassembled pair-end reads are used when available, otherwise the current fastq files
                        run_successfully, pass_qc, time_taken, failing, contigs_spades, warning = spades.runSpades(sampleName, outdir, threads, unassembled_pe_reads if unassembled_pe_reads is not None else fastq_files, args.spadesNotUseCareful, spadesMaxMemory, args.spadesMinCoverageAssembly, args.spadesMinContigsLength, genomeSize, args.spadesKmers, max_reads_length, args.spadesDefaultKmers, args.spadesMinKmerCovContigs, assembled_se_reads, args.saveExcludedContigs, args.maxNumberContigs)
                        runs['SPAdes'] = [run_successfully, pass_qc, time_taken, failing, warning]

                        if run_successfully:
                            # contigs always points to the most recent assembly file
                            contigs = contigs_spades

                            # Run Assembly Mapping check
                            bam_file = None
                            if not args.skipAssemblyMapping:
                                run_successfully, pass_qc, time_taken, failing, assembly_filtered, bam_file, assemblyMapping_folder, warning = assembly_mapping.runAssemblyMapping(fastq_files, contigs, threads, outdir, args.assemblyMinCoverageContigs, genomeSize, args.saveExcludedContigs, args.maxNumberContigs)
                                runs['Assembly_Mapping'] = [run_successfully, pass_qc, time_taken, failing, warning]

                                if run_successfully:
                                    contigs = assembly_filtered
                                    # Remove the SPAdes assembly only when a different filtered file replaced it
                                    if not args.keepIntermediateAssemblies and os.path.isfile(contigs_spades) and contigs != contigs_spades:
                                        os.remove(contigs_spades)
                            else:
                                print '--skipAssemblyMapping set. Skipping Assembly Mapping check'
                                runs['Assembly_Mapping'] = skipped + ['NA']

                            # Run Pilon
                            if not args.skipPilon:
                                run_successfully, _, time_taken, failing, assembly_polished, pilon_folder = pilon.runPilon(jar_path_pilon, contigs, fastq_files, threads, outdir, jarMaxMemory, bam_file)
                                runs['Pilon'] = [run_successfully, None, time_taken, failing]

                                if run_successfully:
                                    contigs = assembly_polished
                                    # assembly_filtered only exists as a local when Assembly Mapping ran
                                    if not args.keepIntermediateAssemblies and 'assembly_filtered' in locals() and os.path.isfile(assembly_filtered):
                                        os.remove(assembly_filtered)

                                if not args.pilonKeepFiles:
                                    utils.removeDirectory(pilon_folder)

                            else:
                                print '--skipPilon set. Skipping Pilon correction'
                                runs['Pilon'] = skipped

                            # assemblyMapping_folder only exists as a local when Assembly Mapping ran
                            if 'assemblyMapping_folder' in locals():
                                utils.removeDirectory(assemblyMapping_folder)

                            print '\n' + 'Final assembly: ' + contigs
                            with open(os.path.join(outdir, 'final_assembly.txt'), 'wt') as writer:
                                writer.write(contigs + '\n')

                            # Run MLST
                            if not args.skipMLST:
                                run_successfully, pass_qc, time_taken, failing, warning = mlst.runMlst(contigs, scheme, outdir, species_genus, mlst_scheme_genus)
                                runs['MLST'] = [run_successfully, pass_qc, time_taken, failing, warning]
                            else:
                                print '--skipMLST set. Skipping MLST analysis'
                                runs['MLST'] = skipped + ['NA']
                        else:
                            print 'SPAdes did not run successfully! Skipping Pilon correction, Assembly Mapping check and MLST analysis'
                            runs['Assembly_Mapping'] = skipped + ['NA']
                            runs['Pilon'] = skipped
                            runs['MLST'] = skipped + ['NA']

                    else:
                        print '--skipSPAdes set. Skipping SPAdes, Pilon correction, Assembly Mapping check and MLST analysis'
                        runs['SPAdes'] = skipped + ['NA']
                        runs['Assembly_Mapping'] = skipped + ['NA']
                        runs['Pilon'] = skipped
                        runs['MLST'] = skipped + ['NA']
    else:
        # Corrupted fastq files: mark every remaining module as not run
        print 'Moving to the next sample'
        for step in ('first_Coverage', 'trueCoverage_ReMatCh', 'first_FastQC', 'Trimmomatic', 'second_Coverage', 'second_FastQC', 'Pear', 'SPAdes', 'Assembly_Mapping', 'Pilon', 'MLST'):
            if step in ('Trimmomatic', 'first_FastQC', 'second_FastQC', 'Pear', 'SPAdes', 'Assembly_Mapping', 'MLST'):
                runs[step] = not_run + ['NA']
            else:
                runs[step] = not_run

    # pear_folder / trimmomatic_folder only exist as locals when the
    # corresponding module was actually run
    # Remove Pear directory
    if not args.pearKeepFiles and 'pear_folder' in locals():
        utils.removeDirectory(pear_folder)
    # Remove Trimmomatic directory with cleaned reads
    if not args.trimKeepFiles and 'trimmomatic_folder' in locals():
        utils.removeDirectory(trimmomatic_folder)

    # Check run
    # A step counts as fine when its run status is truthy or None (skipped / not run)
    run_successfully = all(runs[step][0] or runs[step][0] is None for step in runs)

    # QC verdicts: only an explicit False fails a module; None (no verdict) passes.
    # For coverage and FastQC the second run is used, falling back to the first
    # run when the second one has no verdict.
    pass_fastqIntegrity = runs['FastQ_Integrity'][0]
    pass_cov = (runs['second_Coverage'][1] or (runs['second_Coverage'][1] is None and runs['first_Coverage'][1])) is not False
    pass_trueCov = runs['trueCoverage_ReMatCh'][1] is not False
    pass_fastqc = (runs['second_FastQC'][1] or (runs['second_FastQC'][1] is None and runs['first_FastQC'][1])) is not False
    # pass_trimmomatic = runs['Trimmomatic'][1] is not False
    # pass_pear = runs['Pear'][1] is not False
    # pass_spades = runs['SPAdes'][1] is not False or runs['Assembly_Mapping'][1] is True
    pass_spades = runs['SPAdes'][1] is not False
    pass_assemblyMapping = runs['Assembly_Mapping'][1] is not False
    # Pilon's pass_qc slot is always None, so its run status (index 0) is used instead
    pass_pilon = runs['Pilon'][0] is not False
    pass_mlst = runs['MLST'][1] is not False
    pass_qc = all([pass_fastqIntegrity, pass_cov, pass_trueCov, pass_fastqc, pass_spades, pass_assemblyMapping, pass_pilon, pass_mlst])

    return run_successfully, pass_qc, runs
示例#10
0
def main():
    # Command-line entry point of INNUca.py (version 3.1).
    #
    # Steps, in order: parse the arguments, create the output directory,
    # optionally redirect stdout to a logger, print run information, check
    # that the external programs required by the selected modules are
    # available, run run_INNUca() on each sample and write the per-sample
    # and combined reports.
    #
    # Exits through sys.exit() when utils.checkPrograms() reports missing
    # programs, or at the very end when no sample ran successfully.
    version = '3.1'
    args = utils.parseArguments(version)

    general_start_time = time.time()
    # Timestamp reused in the names of the report files written below
    time_str = time.strftime("%Y%m%d-%H%M%S")

    # Check if output directory exists
    outdir = os.path.abspath(os.path.join(args.outdir, ''))
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Start logger: sys.stdout is replaced by the Logger object, so every
    # print statement below goes through it
    if not args.noLog:
        sys.stdout = utils.Logger(outdir, time_str)

    print '\n' + '==========> INNUca.py <=========='
    print '\n' + 'Program start: ' + time.ctime()

    # Tells where the logfile will be stored (getLogFile() is only available
    # when sys.stdout was replaced by the Logger above, hence the same guard)
    if not args.noLog:
        print '\n' + 'LOGFILE:'
        print sys.stdout.getLogFile()

    # Print command
    print '\n' + 'COMMAND:'
    script_path = os.path.abspath(sys.argv[0])
    print sys.executable + ' ' + script_path + ' ' + ' '.join(sys.argv[1:])

    # Print directory where the program was launched
    print '\n' + 'PRESENT DIRECTORY:'
    print os.getcwd()

    # Print program version
    print '\n' + 'VERSION INNUca.py:'
    utils.scriptVersionGit(version, os.getcwd(), script_path, args.noGitInfo)

    # Get CPU information
    utils.get_cpu_information(outdir, time_str)

    # Get trueCoverage_ReMatCh settings
    trueCoverage_config = get_trueCoverage_config(
        args.skipTrueCoverage,
        args.trueConfigFile.name if args.trueConfigFile is not None else None,
        args.speciesExpected, script_path)

    # Check programs
    # Each entry is [version flag, comparison operator, version]
    programs_version_dictionary = {}
    programs_version_dictionary['gunzip'] = ['--version', '>=', '1.6']

    # Java check first for java dependents check next
    # Java is required unless FastQC and Trimmomatic are skipped and Pilon
    # will not run (Pilon skipped, or SPAdes skipped)
    if not (args.skipFastQC and args.skipTrimmomatic and
            (args.skipPilon or args.skipSPAdes)):
        # programs_version_dictionary['java'] = ['-version', '>=', '1.8']
        programs_version_dictionary['java'] = [None, '>=', '1.8'
                                               ]  # For OpenJDK compatibility
    # First check: only gunzip (and java, when needed) are in the dictionary
    # at this point, and the PATH has not been modified yet
    missingPrograms, programs_version_dictionary = utils.checkPrograms(
        programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # Register the remaining programs according to the modules that will run
    if not args.skipTrueCoverage or trueCoverage_config is not None:
        include_rematch_dependencies_path(args.doNotUseProvidedSoftware)
        programs_version_dictionary['rematch.py'] = ['--version', '>=', '3.2']
        programs_version_dictionary['bcftools'] = ['--version', '==', '1.3.1']
    # bowtie2/samtools are registered unless True Coverage is skipped and
    # neither Assembly Mapping nor Pilon will run
    if not (args.skipTrueCoverage and (
        (args.skipAssemblyMapping and args.skipPilon) or args.skipSPAdes)):
        programs_version_dictionary['bowtie2'] = ['--version', '>=', '2.2.9']
        programs_version_dictionary['samtools'] = ['--version', '==', '1.3.1']
    if not args.skipFastQC:
        programs_version_dictionary['fastqc'] = ['--version', '==', '0.11.5']
    if not args.skipTrimmomatic:
        programs_version_dictionary['trimmomatic-0.36.jar'] = [
            '-version', '==', '0.36'
        ]
    if args.runPear:
        programs_version_dictionary['pear'] = ['--version', '>=', '0.9.10']
    if not args.skipSPAdes:
        programs_version_dictionary['spades.py'] = ['--version', '>=', '3.9.0']
    # Pilon and MLST work on the SPAdes assembly, so they are only registered
    # when SPAdes is not skipped
    if not (args.skipPilon or args.skipSPAdes):
        programs_version_dictionary['pilon-1.18.jar'] = [
            '--version', '==', '1.18'
        ]
    if not (args.skipMLST or args.skipSPAdes):
        programs_version_dictionary['mlst'] = ['--version', '>=', '2.4']

    # Set and print PATH variable
    utils.setPATHvariable(args, script_path)

    # Second check: the whole dictionary is passed again (including the
    # entries already checked above), now with the PATH set
    missingPrograms, programs_version_dictionary = utils.checkPrograms(
        programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # .jar paths
    # NOTE(review): index 3 of each entry is used as the program location;
    # presumably it is appended by utils.checkPrograms() - confirm
    jar_path_trimmomatic = None
    if not args.skipTrimmomatic:
        jar_path_trimmomatic = programs_version_dictionary[
            'trimmomatic-0.36.jar'][3]

    jar_path_pilon = None
    if not args.skipPilon and not args.skipSPAdes:
        jar_path_pilon = programs_version_dictionary['pilon-1.18.jar'][3]

    rematch_script = None
    # ReMatCh path
    if not args.skipTrueCoverage:
        rematch_script = programs_version_dictionary['rematch.py'][3]

    # The pair-end files separation option is currently disabled: None is
    # always passed instead of the command-line value
    # pairEnd_filesSeparation_list = args.pairEnd_filesSeparation
    pairEnd_filesSeparation_list = None
    samples, inputDirectory, removeCreatedSamplesDirectories, indir_same_outdir = get_samples(
        args.inputDirectory, args.fastq, outdir, pairEnd_filesSeparation_list)

    # Start running the analysis
    print '\n' + 'RUNNING INNUca.py'

    # Prepare run report file
    samples_report_path = os.path.join(outdir,
                                       'samples_report.' + time_str + '.tab')
    utils.start_sample_report_file(samples_report_path)

    # Counters used for the final summary
    number_samples_successfully = 0
    number_samples_pass = 0
    number_samples_warning = 0

    # Get MLST scheme to use
    scheme = 'unknown'
    species_genus, mlst_scheme_genus = None, None
    if not args.skipMLST and not args.skipSPAdes:
        scheme, species_genus, mlst_scheme_genus = mlst.getScheme(
            args.speciesExpected)
        # Print path to blastn
        mlst.getBlastPath()

    # Memory
    # NOTE(review): dividing by 1024**2 to get GB implies that
    # utils.get_free_memory() returns KB - confirm
    available_memory_GB = utils.get_free_memory() / (1024.0**2)
    # Determine SPAdes maximum memory
    spadesMaxMemory = None
    if not args.skipSPAdes:
        print ''
        spadesMaxMemory = spades.define_memory(args.spadesMaxMemory,
                                               args.threads,
                                               available_memory_GB)
    # Determine .jar maximum memory
    # 'off' is kept when neither Trimmomatic nor Pilon will run
    jarMaxMemory = 'off'
    if not (args.skipTrimmomatic and (args.skipSPAdes or args.skipPilon)):
        print ''
        jarMaxMemory = utils.define_jar_max_memory(args.jarMaxMemory,
                                                   args.threads,
                                                   available_memory_GB)

    # Run INNUca for each sample
    # sample_report_json maps each processed sample name to its results
    sample_report_json = {}
    for sample in samples:
        sample_start_time = time.time()

        print '\n' + 'Sample: ' + sample + '\n'

        # Create sample outdir
        sample_outdir = os.path.abspath(os.path.join(outdir, sample, ''))
        if not os.path.isdir(sample_outdir):
            os.makedirs(sample_outdir)

        # Get fastq files
        fastq_files = utils.searchFastqFiles(
            os.path.join(inputDirectory, sample, ''),
            pairEnd_filesSeparation_list, False)
        # Samples skipped here never reach the report writing or the counters
        # below: they get no line in the samples report, no entry in
        # sample_report_json, and are counted in both the 'Pipeline problems'
        # and 'FAIL' totals printed at the end (computed from len(samples))
        if len(fastq_files) == 1:
            print 'Only one fastq file was found: ' + str(fastq_files)
            print 'Pair-End sequencing is required. Moving to the next sample'
            continue
        elif len(fastq_files) == 0:
            print 'No compressed fastq files were found. Continue to the next sample'
            continue

        print 'The following files will be used:'
        print str(fastq_files) + '\n'

        # Run INNUca.py analysis
        run_successfully, pass_qc, run_report = run_INNUca(
            sample, sample_outdir, fastq_files, args, script_path, scheme,
            spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon,
            jarMaxMemory, trueCoverage_config, rematch_script, species_genus,
            mlst_scheme_genus)

        # Save sample fail report
        utils.write_fail_report(os.path.join(sample_outdir, 'fail_report.txt'),
                                run_report)

        # Save warning report
        write_warning_report(os.path.join(sample_outdir, 'warning_report.txt'),
                             run_report)

        # Get raw reads files size (must be done before the sample input
        # directory is possibly removed just below)
        fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files)

        # Remove sample directory if it was created during the process
        if removeCreatedSamplesDirectories and not indir_same_outdir:
            utils.removeDirectory(os.path.join(inputDirectory, sample, ''))

        print 'END ' + sample + ' analysis'
        time_taken = utils.runTime(sample_start_time)

        # Save run report
        warning, json_pass_qc = utils.write_sample_report(
            samples_report_path, sample, run_successfully, pass_qc, time_taken,
            fileSize, run_report)

        # Save runs statistics
        # A sample that passes QC is counted either as WARNING or as PASS,
        # never as both
        if run_successfully:
            number_samples_successfully += 1
        if pass_qc:
            if warning:
                number_samples_warning += 1
            else:
                number_samples_pass += 1

        sample_report_json[sample] = {
            'run_successfully': run_successfully,
            'pass_qc': json_pass_qc,
            'modules_run_report': run_report
        }

    # Save combine_samples_reports
    combine_reports.combine_reports(outdir, outdir, args.json, time_str,
                                    len(samples))

    # Save sample_report in json
    if args.json:
        # json is imported here because it is only needed with this option
        import json
        with open(os.path.join(outdir, 'samples_report.' + time_str + '.json'),
                  'wt') as writer:
            json.dump(sample_report_json, writer)

    # Remove temporary folder with symlink to fastq files in case of --fastq use
    if args.inputDirectory is None and args.fastq is not None:
        utils.removeDirectory(os.path.join(inputDirectory, ''))

    # Run report
    # The 'Pipeline problems' and 'FAIL' totals are derived from
    # len(samples), so they also include the samples skipped in the loop
    print '\n' + 'END INNUca.py'
    print '\n' + 'Pipeline problems: {not_run_successfully} samples'.format(
        not_run_successfully=(len(samples) - number_samples_successfully))
    print '\n' + 'FAIL: {number_samples_fail} samples'.format(
        number_samples_fail=(len(samples) - number_samples_pass -
                             number_samples_warning))
    print '\n' + 'WARNING: {number_samples_warning} samples'.format(
        number_samples_warning=number_samples_warning)
    print '\n' + 'PASS: {number_samples_pass} samples'.format(
        number_samples_pass=number_samples_pass)
    # The returned value is discarded on purpose; presumably
    # utils.runTime() prints the elapsed time itself - confirm
    time_taken = utils.runTime(general_start_time)
    del time_taken

    # Check whether INNUca.py run at least one sample successfully
    if number_samples_successfully == 0:
        sys.exit('No samples run successfully!')
示例#11
0
def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme,
               spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon,
               jarMaxMemory, trueCoverage_config, rematch_script,
               species_genus, mlst_scheme_genus):
    # Run the INNUca modules on one sample and collect their results.
    #
    # Modules are run in this order, each one gated by the outcome of the
    # previous ones and by the corresponding skip/run options in args:
    # FastQ integrity, first estimated coverage, True Coverage (ReMatCh),
    # first FastQC, Trimmomatic, second estimated coverage, second FastQC,
    # Pear, SPAdes, Assembly Mapping, Pilon and MLST.
    #
    # Parameters:
    #   sampleName            sample name passed on to the modules
    #   outdir                output directory of this sample
    #   fastq_files           input fastq files (replaced by the Trimmomatic
    #                         output for the downstream modules when
    #                         Trimmomatic succeeds)
    #   args                  parsed command-line arguments
    #   script_path           passed on to trimmomatic.runTrimmomatic()
    #   scheme, species_genus, mlst_scheme_genus
    #                         passed on to mlst.runMlst()
    #   spadesMaxMemory       passed on to spades.runSpades()
    #   jar_path_trimmomatic, jar_path_pilon, jarMaxMemory
    #                         passed on to the Trimmomatic and Pilon modules
    #   trueCoverage_config   True Coverage settings (dict), or None when
    #                         True Coverage cannot be run
    #   rematch_script        passed on to trueCoverage.runTrueCoverage()
    #
    # Returns a tuple (run_successfully, pass_qc, runs):
    #   run_successfully      True when no module reported a failed run
    #   pass_qc               True when no considered QC check failed
    #   runs                  dict mapping each module name to a list
    #                         [run_successfully, pass_qc, time_taken,
    #                         failing], followed for some modules by a fifth
    #                         element (warning, the Trimmomatic fileSize, or
    #                         'NA' in placeholders)
    threads = args.threads
    adaptersFasta = args.adapters
    if adaptersFasta is not None:
        adaptersFasta = os.path.abspath(adaptersFasta.name)
    genomeSize = args.genomeSizeExpectedMb
    # Placeholder entries for modules that were skipped by option or not run
    # because an earlier step stopped the pipeline. "placeholder + ['NA']"
    # builds a new five-element list, whereas assigning the bare placeholder
    # stores the very same list object under several keys
    skipped = [None, None, 0, {'sample': 'Skipped'}]
    not_run = [None, None, 0, {'sample': 'Not run'}]

    runs = {}

    # Run FastQ integrity check
    # fastq_encoding, min_reads_length and max_reads_length obtained here
    # are reused by Trimmomatic, Pear and SPAdes below
    not_corruption_found, pass_qc, time_taken, failing, fastq_encoding, min_reads_length, max_reads_length = fastQintegrity.runFastQintegrity(
        fastq_files, threads, outdir)
    runs['FastQ_Integrity'] = [
        not_corruption_found, pass_qc, time_taken, failing
    ]

    if not_corruption_found:
        # Run first Estimated Coverage
        # Defaults used by the gates below when a module is skipped
        run_successfully_estimatedCoverage = False
        estimatedCoverage = None
        run_successfully_trueCoverage = False
        pass_qc_trueCoverage = False
        if not args.skipEstimatedCoverage:
            # Check whether the Estimated Coverage output is already present
            report_file = os.path.join(outdir, 'coverage_report.txt')
            if os.path.isfile(report_file):
                os.remove(report_file)
            # Run getEstimatedCoverage
            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = coverage.getEstimatedCoverage(
                fastq_files, genomeSize, outdir, threads,
                args.estimatedMinimumCoverage)
            runs['first_Coverage'] = [
                run_successfully_estimatedCoverage, pass_qc, time_taken,
                failing
            ]
        else:
            print '--skipEstimatedCoverage set. Skipping First Estimated Coverage analysis'
            runs['first_Coverage'] = skipped

        # Later passed to pear.runPear(); only becomes True when Trimmomatic
        # runs successfully
        trimmomatic_run_successfully = False

        # Coverage gate: continue when the estimated coverage was skipped,
        # or when it ran successfully and is not below the minimum
        if args.skipEstimatedCoverage or (
                run_successfully_estimatedCoverage
                and not estimatedCoverage < args.estimatedMinimumCoverage):
            if not args.skipTrueCoverage and trueCoverage_config is not None:
                # Run True Coverage
                run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken, failing = trueCoverage.runTrueCoverage(
                    sampleName, fastq_files,
                    trueCoverage_config['reference_file'], threads, outdir,
                    trueCoverage_config['length_extra_seq'],
                    trueCoverage_config['minimum_depth_presence'],
                    trueCoverage_config['minimum_depth_call'],
                    trueCoverage_config[
                        'minimum_depth_frequency_dominant_allele'],
                    trueCoverage_config['minimum_gene_coverage'], False, False,
                    1, trueCoverage_config['minimum_gene_identity'],
                    trueCoverage_config, rematch_script)
                runs['trueCoverage_ReMatCh'] = [
                    run_successfully_trueCoverage, pass_qc_trueCoverage,
                    time_taken, failing
                ]
            else:
                # Also reached when trueCoverage_config is None, even if
                # --skipTrueCoverage was not set
                print '\n' + '--skipTrueCoverage set. Skipping True coverage analysis'
                runs['trueCoverage_ReMatCh'] = skipped

            # True Coverage gate: continue when it was not run, or when it
            # ran successfully and passed QC
            if args.skipTrueCoverage or trueCoverage_config is None or (
                    run_successfully_trueCoverage and pass_qc_trueCoverage):
                # Run first FastQC
                # nts2clip_based_ntsContent is passed on to Trimmomatic; the
                # maximum_reads_length returned by this first run is never
                # read
                nts2clip_based_ntsContent = None
                if not args.skipFastQC:
                    run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length, nts2clip_based_ntsContent = fastqc.runFastQCanalysis(
                        outdir, threads, adaptersFasta, fastq_files,
                        args.fastQCkeepFiles, 'first_run')
                    runs['first_FastQC'] = [
                        run_successfully, pass_qc, time_taken, failing, warning
                    ]
                else:
                    print '--skipFastQC set. Skipping First FastQC analysis'
                    runs['first_FastQC'] = skipped + ['NA']

                # Run Trimmomatic
                if not args.skipTrimmomatic:
                    run_successfully, not_empty_fastq, time_taken, failing, paired_reads, trimmomatic_folder, fileSize = trimmomatic.runTrimmomatic(
                        jar_path_trimmomatic, sampleName, outdir, threads,
                        adaptersFasta, script_path, args.doNotSearchAdapters,
                        fastq_files, max_reads_length, args.doNotTrimCrops,
                        args.trimCrop, args.trimHeadCrop, args.trimLeading,
                        args.trimTrailing, args.trimSlidingWindow,
                        args.trimMinLength, nts2clip_based_ntsContent,
                        jarMaxMemory, fastq_encoding)
                    # Trimmomatic has no QC verdict (None); its fifth element
                    # is the fileSize returned by the module
                    runs['Trimmomatic'] = [
                        run_successfully, None, time_taken, failing, fileSize
                    ]
                    trimmomatic_run_successfully = run_successfully

                    if run_successfully and not_empty_fastq:
                        # From here on the trimmed reads replace the input
                        # reads
                        fastq_files = paired_reads
                        min_reads_length = args.trimMinLength

                        # Run second Estimated Coverage
                        # This overwrites the coverage variables, so the
                        # coverage gate repeated before the assembly steps
                        # evaluates the trimmed reads
                        if not args.skipEstimatedCoverage:
                            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = coverage.getEstimatedCoverage(
                                fastq_files, genomeSize, outdir, threads,
                                args.estimatedMinimumCoverage)
                            runs['second_Coverage'] = [
                                run_successfully_estimatedCoverage, pass_qc,
                                time_taken, failing
                            ]
                        else:
                            print '--skipEstimatedCoverage set. Skipping Second Estimated Coverage analysis'
                            runs['second_Coverage'] = skipped

                        if args.skipEstimatedCoverage or (
                                run_successfully_estimatedCoverage
                                and not estimatedCoverage <
                                args.estimatedMinimumCoverage):
                            # Run second FastQC
                            if not args.skipFastQC:
                                run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length, nts2clip_based_ntsContent = fastqc.runFastQCanalysis(
                                    outdir, threads, adaptersFasta,
                                    fastq_files, args.fastQCkeepFiles,
                                    'second_run')
                                runs['second_FastQC'] = [
                                    run_successfully, pass_qc, time_taken,
                                    failing, warning
                                ]
                                # The maximum reads length is only updated
                                # from a successful second FastQC run
                                if run_successfully:
                                    max_reads_length = maximum_reads_length
                            else:
                                print '--skipFastQC set. Skipping Second FastQC analysis'
                                runs['second_FastQC'] = skipped + ['NA']
                        else:
                            print '\n' + 'Estimated coverage is too lower (< ' + str(
                                args.estimatedMinimumCoverage
                            ) + 'x). This sample will not proceed with INNUca pipeline'
                            runs['second_FastQC'] = not_run + ['NA']
                            runs['Pear'] = not_run + ['NA']
                            runs['SPAdes'] = not_run + ['NA']
                            runs['Assembly_Mapping'] = not_run + ['NA']
                            runs['Pilon'] = not_run
                            runs['MLST'] = not_run + ['NA']
                    else:
                        # The pipeline continues with the original reads
                        print 'Trimmomatic did not run successfully or return zero reads! Skipping Second Estimated Coverage analysis and FastQC analysis'
                        runs['second_Coverage'] = skipped
                        runs['second_FastQC'] = skipped + ['NA']

                else:
                    print '--skipTrimmomatic set. Skipping Trimmomatic, but also Second FastQC analysis and Second Estimated Coverage analysis'
                    runs['Trimmomatic'] = skipped + ['NA']
                    runs['second_Coverage'] = skipped
                    runs['second_FastQC'] = skipped + ['NA']

                # FastQC gate: the effective verdict is the one of the second
                # run when it is truthy, otherwise (second verdict is None)
                # the one of the first run. Only an explicit False stops the
                # sample here, and only when --fastQCproceed is not set
                if not args.skipFastQC and (
                        runs['second_FastQC'][1] or
                    (runs['second_FastQC'][1] is None and runs['first_FastQC']
                     [1])) is False and not args.fastQCproceed:
                    print '\n' + 'This sample does not pass FastQC module QA/QC. It will not proceed with INNUca pipeline'
                    runs['Pear'] = not_run + ['NA']
                    runs['SPAdes'] = not_run + ['NA']
                    runs['Assembly_Mapping'] = not_run + ['NA']
                    runs['Pilon'] = not_run
                    runs['MLST'] = not_run + ['NA']
            else:
                print '\n' + 'This sample does not pass True Coverage module QA/QC. This sample will not proceed with INNUca pipeline'
                runs['first_FastQC'] = not_run + ['NA']
                runs['Trimmomatic'] = not_run + ['NA']
                runs['second_Coverage'] = not_run
                runs['second_FastQC'] = not_run + ['NA']
                runs['Pear'] = not_run + ['NA']
                runs['SPAdes'] = not_run + ['NA']
                runs['Assembly_Mapping'] = not_run + ['NA']
                runs['Pilon'] = not_run
                runs['MLST'] = not_run + ['NA']

        else:
            # Also reached when the coverage estimation itself did not run
            # successfully, not only when the coverage is below the minimum
            print '\n' + 'Estimated coverage is too lower (< ' + str(
                args.estimatedMinimumCoverage
            ) + 'x). This sample will not proceed with INNUca pipeline'
            runs['trueCoverage_ReMatCh'] = not_run
            runs['first_FastQC'] = not_run + ['NA']
            runs['Trimmomatic'] = not_run + ['NA']
            runs['second_Coverage'] = not_run
            runs['second_FastQC'] = not_run + ['NA']
            runs['Pear'] = not_run + ['NA']
            runs['SPAdes'] = not_run + ['NA']
            runs['Assembly_Mapping'] = not_run + ['NA']
            runs['Pilon'] = not_run
            runs['MLST'] = not_run + ['NA']

        # Assembly part: the coverage, True Coverage and FastQC gates are
        # evaluated again (the coverage one with the values of the second
        # estimation, when it ran). When any of them fails nothing is done
        # here, as the entries of the remaining modules were already filled
        # with placeholders above
        if args.skipEstimatedCoverage or (
                run_successfully_estimatedCoverage
                and not estimatedCoverage < args.estimatedMinimumCoverage):
            if args.skipTrueCoverage or trueCoverage_config is None or (
                    run_successfully_trueCoverage and pass_qc_trueCoverage):
                if args.skipFastQC or (runs['second_FastQC'][1] or
                                       (runs['second_FastQC'][1] is None
                                        and runs['first_FastQC'][1])
                                       ) is not False or args.fastQCproceed:
                    unassembled_pe_reads = None
                    assembled_se_reads = None
                    # Run Pear
                    if args.runPear:
                        print '--runPear set. Running Pear'
                        pearMinOverlap = pear.determine_minimum_overlap(
                            args.pearMinOverlap, min_reads_length,
                            max_reads_length)
                        run_successfully, pass_qc, time_taken, failing, unassembled_pe_reads, assembled_se_reads, pear_folder, warning = pear.runPear(
                            fastq_files, threads, outdir, sampleName,
                            fastq_encoding, trimmomatic_run_successfully,
                            pearMinOverlap)
                        runs['Pear'] = [
                            run_successfully, pass_qc, time_taken, failing,
                            warning
                        ]
                    else:
                        # Pear is opt-in (--runPear), so it is reported as
                        # not run rather than skipped
                        runs['Pear'] = not_run + ['NA']

                    # Run SPAdes
                    # Uses the Pear unassembled pair-end reads when
                    # available, otherwise the current fastq files
                    if not args.skipSPAdes:
                        run_successfully, pass_qc, time_taken, failing, contigs_spades, warning = spades.runSpades(
                            sampleName, outdir, threads,
                            unassembled_pe_reads if unassembled_pe_reads
                            is not None else fastq_files,
                            args.spadesNotUseCareful, spadesMaxMemory,
                            args.spadesMinCoverageAssembly,
                            args.spadesMinContigsLength, genomeSize,
                            args.spadesKmers, max_reads_length,
                            args.spadesDefaultKmers,
                            args.spadesMinKmerCovContigs, assembled_se_reads,
                            args.saveExcludedContigs, args.maxNumberContigs)
                        runs['SPAdes'] = [
                            run_successfully, pass_qc, time_taken, failing,
                            warning
                        ]

                        if run_successfully:
                            # contigs always holds the most recent assembly:
                            # SPAdes output, then the Assembly Mapping
                            # filtered one, then the Pilon polished one
                            contigs = contigs_spades

                            # Run Assembly Mapping check
                            bam_file = None
                            if not args.skipAssemblyMapping:
                                run_successfully, pass_qc, time_taken, failing, assembly_filtered, bam_file, assemblyMapping_folder, warning = assembly_mapping.runAssemblyMapping(
                                    fastq_files, contigs, threads, outdir,
                                    args.assemblyMinCoverageContigs,
                                    genomeSize, args.saveExcludedContigs,
                                    args.maxNumberContigs)
                                runs['Assembly_Mapping'] = [
                                    run_successfully, pass_qc, time_taken,
                                    failing, warning
                                ]

                                if run_successfully:
                                    contigs = assembly_filtered
                                    # Delete the SPAdes assembly only when a
                                    # different filtered file replaced it
                                    if not args.keepIntermediateAssemblies and os.path.isfile(
                                            contigs_spades
                                    ) and contigs != contigs_spades:
                                        os.remove(contigs_spades)
                            else:
                                print '--skipAssemblyMapping set. Skipping Assembly Mapping check'
                                runs['Assembly_Mapping'] = skipped + ['NA']

                            # Run Pilon
                            if not args.skipPilon:
                                run_successfully, _, time_taken, failing, assembly_polished, pilon_folder = pilon.runPilon(
                                    jar_path_pilon, contigs, fastq_files,
                                    threads, outdir, jarMaxMemory, bam_file)
                                runs['Pilon'] = [
                                    run_successfully, None, time_taken, failing
                                ]

                                if run_successfully:
                                    contigs = assembly_polished
                                    # assembly_filtered only exists as a local
                                    # when Assembly Mapping was run, hence the
                                    # locals() check
                                    if not args.keepIntermediateAssemblies and 'assembly_filtered' in locals(
                                    ) and os.path.isfile(assembly_filtered):
                                        os.remove(assembly_filtered)

                                if not args.pilonKeepFiles:
                                    utils.removeDirectory(pilon_folder)

                            else:
                                print '--skipPilon set. Skipping Pilon correction'
                                runs['Pilon'] = skipped

                            # The Assembly Mapping folder is removed only
                            # after Pilon, which receives bam_file from the
                            # Assembly Mapping step
                            if 'assemblyMapping_folder' in locals():
                                utils.removeDirectory(assemblyMapping_folder)

                            # Record the final assembly path
                            print '\n' + 'Final assembly: ' + contigs
                            with open(
                                    os.path.join(outdir, 'final_assembly.txt'),
                                    'wt') as writer:
                                writer.write(contigs + '\n')

                            # Run MLST
                            if not args.skipMLST:
                                run_successfully, pass_qc, time_taken, failing, warning = mlst.runMlst(
                                    contigs, scheme, outdir, species_genus,
                                    mlst_scheme_genus)
                                runs['MLST'] = [
                                    run_successfully, pass_qc, time_taken,
                                    failing, warning
                                ]
                            else:
                                print '--skipMLST set. Skipping MLST analysis'
                                runs['MLST'] = skipped + ['NA']
                        else:
                            print 'SPAdes did not run successfully! Skipping Pilon correction, Assembly Mapping check and MLST analysis'
                            runs['Assembly_Mapping'] = skipped + ['NA']
                            runs['Pilon'] = skipped
                            runs['MLST'] = skipped + ['NA']

                    else:
                        print '--skipSPAdes set. Skipping SPAdes, Pilon correction, Assembly Mapping check and MLST analysis'
                        runs['SPAdes'] = skipped + ['NA']
                        runs['Assembly_Mapping'] = skipped + ['NA']
                        runs['Pilon'] = skipped
                        runs['MLST'] = skipped + ['NA']
    else:
        # FastQ integrity check failed: every other module is reported as
        # not run, with the same list lengths the real results would have
        print 'Moving to the next sample'
        for step in ('first_Coverage', 'trueCoverage_ReMatCh', 'first_FastQC',
                     'Trimmomatic', 'second_Coverage', 'second_FastQC', 'Pear',
                     'SPAdes', 'Assembly_Mapping', 'Pilon', 'MLST'):
            if step in ('Trimmomatic', 'first_FastQC', 'second_FastQC', 'Pear',
                        'SPAdes', 'Assembly_Mapping', 'MLST'):
                runs[step] = not_run + ['NA']
            else:
                runs[step] = not_run

    # pear_folder and trimmomatic_folder only exist as locals when the
    # corresponding module was run, hence the locals() checks
    # Remove Pear directory
    if not args.pearKeepFiles and 'pear_folder' in locals():
        utils.removeDirectory(pear_folder)
    # Remove Trimmomatic directory with cleaned reads
    if not args.trimKeepFiles and 'trimmomatic_folder' in locals():
        utils.removeDirectory(trimmomatic_folder)

    # Check run
    # A module counts as fine when it ran successfully or has None as run
    # status (skipped / not run placeholders)
    run_successfully = all(runs[step][0] or runs[step][0] is None
                           for step in runs)

    # QC flags: the "is not False" checks only fail on an explicit False, so
    # a None verdict (skipped / not run) does not fail them. For coverage
    # and FastQC the second run verdict is used when truthy, otherwise (when
    # it is None) the first run verdict
    # FastQ integrity is judged on its run status (index 0)
    pass_fastqIntegrity = runs['FastQ_Integrity'][0]
    pass_cov = (runs['second_Coverage'][1]
                or (runs['second_Coverage'][1] is None
                    and runs['first_Coverage'][1])) is not False
    pass_trueCov = runs['trueCoverage_ReMatCh'][1] is not False
    pass_fastqc = (runs['second_FastQC'][1]
                   or (runs['second_FastQC'][1] is None
                       and runs['first_FastQC'][1])) is not False
    # Trimmomatic and Pear are not considered in the final QC
    # pass_trimmomatic = runs['Trimmomatic'][1] is not False
    # pass_pear = runs['Pear'][1] is not False
    # pass_spades = runs['SPAdes'][1] is not False or runs['Assembly_Mapping'][1] is True
    pass_spades = runs['SPAdes'][1] is not False
    pass_assemblyMapping = runs['Assembly_Mapping'][1] is not False
    # Pilon has no QC verdict (always None), so its run status is used
    pass_pilon = runs['Pilon'][0] is not False
    pass_mlst = runs['MLST'][1] is not False
    pass_qc = all([
        pass_fastqIntegrity, pass_cov, pass_trueCov, pass_fastqc, pass_spades,
        pass_assemblyMapping, pass_pilon, pass_mlst
    ])

    return run_successfully, pass_qc, runs
示例#12
0
文件: INNUca.py 项目: abremges/INNUca
def main():
    """Command-line entry point of INNUca.py.

    Parses the command-line arguments, prepares the output directory and
    the logger, asks ``utils.checkPrograms`` for the external programs
    required by the modules that are not skipped, and then calls
    ``run_INNUca`` once per sample found in the input directory. A fail
    report is written per sample and one line per sample is appended to the
    global samples report.

    Exits through ``sys.exit`` with a message when required programs are
    reported missing, or when no sample ran successfully.
    """
    version = '2.0'
    args = utils.parseArguments(version)

    general_start_time = time.time()
    time_str = time.strftime("%Y%m%d-%H%M%S")

    # Make sure the output directory exists
    outdir = os.path.abspath(os.path.join(args.outdir, ''))
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Route standard output through the project logger
    sys.stdout = utils.Logger(outdir, time_str)

    print('\n==========> INNUca.py <==========')
    print('\nProgram start: ' + time.ctime())

    # Report where the logfile is stored
    print('\nLOGFILE:')
    print(sys.stdout.getLogFile())

    # Report the command used to launch the program
    print('\nCOMMAND:')
    script_path = os.path.abspath(sys.argv[0])
    print(' '.join([sys.executable, script_path, ' '.join(sys.argv[1:])]))

    # Report the directory from which the program was launched
    print('\nPRESENT DIRECTORY :')
    print(os.getcwd())

    # Report the program version
    print('\nVERSION INNUca.py:')
    utils.scriptVersionGit(version, os.getcwd(), script_path)

    # Get CPU information
    utils.get_cpu_information(outdir, time_str)

    # Set and print PATH variable
    utils.setPATHvariable(args.doNotUseProvidedSoftware, script_path)

    # Pilon and MLST both work on the SPAdes assembly, so each of them is
    # only effectively enabled when SPAdes is enabled too
    polish_with_pilon = not args.skipPilon and not args.skipSPAdes
    type_with_mlst = not args.skipMLST and not args.skipSPAdes

    # Collect the external programs (and version constraints) to check
    required_programs = {'gunzip': ['--version', '>=', '1.6']}
    if not args.skipTrueCoverage or polish_with_pilon:
        required_programs['bowtie2'] = ['--version', '>=', '2.2.9']
        required_programs['samtools'] = ['--version', '==', '1.3.1']
    if (not args.skipFastQC or not args.skipTrimmomatic
            or polish_with_pilon):
        required_programs['java'] = ['-version', '>=', '1.8']
    if not args.skipFastQC:
        required_programs['fastqc'] = ['--version', '==', '0.11.5']
    if not args.skipTrimmomatic:
        required_programs['trimmomatic-0.36.jar'] = ['-version', '==', '0.36']
    if not args.skipSPAdes:
        required_programs['spades.py'] = ['--version', '>=', '3.9.0']
    if polish_with_pilon:
        required_programs['pilon-1.18.jar'] = ['--version', '==', '1.18']
    if type_with_mlst:
        required_programs['mlst'] = ['--version', '>=', '2.4']

    missing_programs, required_programs = utils.checkPrograms(
        required_programs)
    if len(missing_programs) > 0:
        sys.exit('\nErrors:\n' + '\n'.join(missing_programs))

    # .jar paths (fourth field of the entries returned by checkPrograms)
    trimmomatic_jar = (None if args.skipTrimmomatic else
                       required_programs['trimmomatic-0.36.jar'][3])
    pilon_jar = (required_programs['pilon-1.18.jar'][3]
                 if polish_with_pilon else None)

    # Check the input directory and get the names of the samples in it
    input_dir = os.path.abspath(os.path.join(args.inputDirectory, ''))
    # pairEnd_filesSeparation_list = args.pairEnd_filesSeparation
    pair_end_separation = None
    print('')
    samples, remove_created_sample_dirs, indir_same_outdir = \
        utils.checkSetInputDirectory(input_dir, outdir, pair_end_separation)

    # Start running the analysis
    print('\nRUNNING INNUca.py')

    # Prepare run report file
    samples_report_path = os.path.join(
        outdir, 'samples_report.' + time_str + '.tab')
    utils.start_sample_report_file(samples_report_path)

    samples_run_ok = 0
    samples_passing_qc = 0

    # Get MLST scheme to use
    scheme = 'unknown'
    if type_with_mlst:
        scheme = mlst.getScheme(args.speciesExpected)

    # Get path to blastn
    mlst.getBlastPath()

    # Get trueCoverage_ReMatCh settings
    true_coverage_settings = None
    if not args.skipTrueCoverage:
        reference_file = None
        config_file = None

        if args.trueConfigFile is not None:
            config_file = args.trueConfigFile.name
        else:
            print('No trueCoverage_ReMatCh config file was provided. Search for default files')
            config_file, reference_file = \
                trueCoverage.check_existing_default_config(
                    args.speciesExpected, script_path)

        if config_file is not None:
            true_coverage_settings = trueCoverage.parse_config(config_file)
        # With a default config, the reference comes from the default files
        if args.trueConfigFile is None and true_coverage_settings is not None:
            true_coverage_settings['reference_file'] = reference_file

        if true_coverage_settings is None:
            print('No trueCoverage_ReMatCh config file was found')
        else:
            print('The following trueCoverage_ReMatCh config file will be used: ' + config_file)
            print('The following trueCoverage_ReMatCh reference file will be used: ' +
                  true_coverage_settings['reference_file'] + '\n')

    # Memory
    available_memory_gb = utils.get_free_memory() / (1024.0**2)

    # Determine SPAdes maximum memory
    spades_max_memory = None
    if not args.skipSPAdes:
        print('')
        spades_max_memory = spades.define_memory(
            args.spadesMaxMemory, args.threads, available_memory_gb)

    # Determine .jar maximum memory (needed by Trimmomatic and/or Pilon)
    jar_max_memory = 'off'
    if not args.skipTrimmomatic or polish_with_pilon:
        print('')
        jar_max_memory = utils.define_jar_max_memory(
            args.jarMaxMemory, args.threads, available_memory_gb)

    # Run INNUca for each sample
    for sample in samples:
        sample_start_time = time.time()

        print('\nSample: ' + sample + '\n')

        # Create sample outdir
        sample_outdir = os.path.abspath(os.path.join(outdir, sample, ''))
        if not os.path.isdir(sample_outdir):
            os.makedirs(sample_outdir)

        # Get fastq files
        sample_indir = os.path.join(input_dir, sample, '')
        fastq_files = utils.searchFastqFiles(
            sample_indir, pair_end_separation, False)
        if len(fastq_files) == 1:
            print('Only one fastq file was found: ' + str(fastq_files))
            print('Pair-End sequencing is required. Moving to the next sample')
            continue

        print('The following files will be used:')
        print(str(fastq_files) + '\n')

        # Run INNUca.py analysis
        run_successfully, pass_qc, run_report = run_INNUca(
            sample, sample_outdir, fastq_files, args, script_path, scheme,
            spades_max_memory, trimmomatic_jar, pilon_jar, jar_max_memory,
            true_coverage_settings)

        # Save sample fail report
        utils.write_fail_report(
            os.path.join(sample_outdir, 'fail_report.txt'), run_report)

        # Save runs statistics
        if run_successfully:
            samples_run_ok += 1
        if pass_qc:
            samples_passing_qc += 1

        # Get raw reads files size
        reads_size = sum(map(os.path.getsize, fastq_files))

        # Remove sample directory if it was created during the process
        if remove_created_sample_dirs and not indir_same_outdir:
            utils.removeDirectory(os.path.join(input_dir, sample, ''))

        print('END ' + sample + ' analysis')
        time_taken = utils.runTime(sample_start_time)

        # Save run report
        utils.write_sample_report(samples_report_path, sample,
                                  run_successfully, pass_qc, time_taken,
                                  reads_size, run_report)

    # Run report
    print('\nEND INNUca.py')
    print('\n{0} samples out of {1} run successfully'.format(
        samples_run_ok, len(samples)))
    print('\n{0} samples out of {1} (run successfully) PASS INNUca.py analysis'.format(
        samples_passing_qc, samples_run_ok))
    # Called for its side effect only; the returned value is not needed
    utils.runTime(general_start_time)

    # Check whether INNUca.py run at least one sample successfully
    if samples_run_ok == 0:
        sys.exit('No samples run successfully!')
示例#13
0
文件: INNUca.py 项目: abremges/INNUca
def run_INNUca(sampleName, outdir, fastq_files, args, script_path, scheme,
               spadesMaxMemory, jar_path_trimmomatic, jar_path_pilon,
               jarMaxMemory, trueCoverage_config):
    """Run the INNUca pipeline modules on a single sample.

    The modules are run in this order, each one gated by the command-line
    skip flags and by the outcome of the previous ones: FastQ integrity
    check, first estimated coverage, trueCoverage_ReMatCh, first FastQC,
    Trimmomatic, second estimated coverage, second FastQC, SPAdes, Pilon,
    Assembly Mapping check and MLST.

    Parameters
    ----------
    sampleName : sample name, forwarded to Trimmomatic and SPAdes.
    outdir : sample output directory.
    fastq_files : reads files of the sample. They are replaced by the
        Trimmomatic paired reads for the downstream modules when
        Trimmomatic runs successfully and returns reads.
    args : parsed command-line arguments (skip flags and module settings).
    script_path : path of the running script, forwarded to Trimmomatic.
    scheme : MLST scheme, forwarded to ``mlst.runMlst``.
    spadesMaxMemory : maximum memory setting forwarded to SPAdes.
    jar_path_trimmomatic : path to the Trimmomatic .jar.
    jar_path_pilon : path to the Pilon .jar.
    jarMaxMemory : maximum memory setting forwarded to Trimmomatic/Pilon.
    trueCoverage_config : dictionary with the trueCoverage_ReMatCh settings,
        or None when no configuration is available.

    Returns
    -------
    tuple ``(run_successfully, pass_qc, runs)`` where ``runs`` maps each
    step name to a list ``[run_successfully, pass_qc, time_taken, failing]``
    (the 'Trimmomatic' entry carries an extra fifth field), and the two
    booleans summarise those entries: a step counts as successful/passing
    when its flag is true or None (skipped / not run).
    """
    threads = args.threads
    adaptersFasta = args.adapters
    if adaptersFasta is not None:
        adaptersFasta = os.path.abspath(adaptersFasta.name)
    genomeSize = args.genomeSizeExpectedMb
    maximumReadsLength = None
    # Only set when Trimmomatic actually runs; checked at cleanup time
    trimmomatic_folder = None
    skipped = [None, None, 0, {'sample': 'Skipped'}]
    not_run = [None, None, 0, {'sample': 'Not run'}]

    runs = {}

    # Run FastQ integrity check
    not_corruption_found, _, time_taken, failing = fastQintegrity.runFastQintegrity(
        fastq_files, threads, outdir)
    runs['FastQ_Integrity'] = [not_corruption_found, None, time_taken, failing]

    if not_corruption_found:
        # Run first Estimated Coverage
        run_successfully_estimatedCoverage = False
        estimatedCoverage = None
        run_successfully_trueCoverage = False
        pass_qc_trueCoverage = False
        if not args.skipEstimatedCoverage:
            # Check whether the Estimated Coverage output is already present
            report_file = os.path.join(outdir, 'coverage_report.txt')
            if os.path.isfile(report_file):
                os.remove(report_file)
            # Run getEstimatedCoverage
            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = coverage.getEstimatedCoverage(
                fastq_files, genomeSize, outdir, threads)
            runs['first_Coverage'] = [
                run_successfully_estimatedCoverage, pass_qc, time_taken,
                failing
            ]
        else:
            print('--skipEstimatedCoverage set. Skipping First Estimated Coverage analysis')
            runs['first_Coverage'] = skipped

        if args.skipEstimatedCoverage or (
                run_successfully_estimatedCoverage
                and not estimatedCoverage < args.estimatedMinimumCoverage):
            if not args.skipTrueCoverage and trueCoverage_config is not None:
                # Run True Coverage
                run_successfully_trueCoverage, pass_qc_trueCoverage, time_taken, failing = trueCoverage.runTrueCoverage(
                    fastq_files, trueCoverage_config['reference_file'],
                    threads, outdir, trueCoverage_config['length_extra_seq'],
                    trueCoverage_config['minimum_depth_presence'],
                    trueCoverage_config['minimum_depth_call'],
                    trueCoverage_config[
                        'minimum_depth_frequency_dominant_allele'],
                    trueCoverage_config['minimum_gene_coverage'],
                    trueCoverage_config['maximum_number_absent_genes'],
                    trueCoverage_config[
                        'maximum_number_genes_multiple_alleles'],
                    trueCoverage_config['minimum_read_coverage'])
                runs['trueCoverage_ReMatCh'] = [
                    run_successfully_trueCoverage, pass_qc_trueCoverage,
                    time_taken, failing
                ]
            else:
                # Also reached when no trueCoverage configuration is available
                print('\n' + '--skipTrueCoverage set. Skipping True coverage analysis')
                runs['trueCoverage_ReMatCh'] = skipped

            if args.skipTrueCoverage or trueCoverage_config is None or (
                    run_successfully_trueCoverage and pass_qc_trueCoverage):
                # Run first FastQC
                nts2clip_based_ntsContent = None
                if not args.skipFastQC:
                    run_successfully, pass_qc, time_taken, failing, maximumReadsLength, nts2clip_based_ntsContent = fastqc.runFastQCanalysis(
                        outdir, threads, adaptersFasta, fastq_files)
                    runs['first_FastQC'] = [
                        run_successfully, pass_qc, time_taken, failing
                    ]
                else:
                    print('--skipFastQC set. Skipping First FastQC analysis')
                    runs['first_FastQC'] = skipped

                # Run Trimmomatic
                if not args.skipTrimmomatic:
                    run_successfully, not_empty_fastq, time_taken, failing, paired_reads, trimmomatic_folder, fileSize = trimmomatic.runTrimmomatic(
                        jar_path_trimmomatic, sampleName, outdir, threads,
                        adaptersFasta, script_path, args.doNotSearchAdapters,
                        fastq_files, maximumReadsLength, args.doNotTrimCrops,
                        args.trimCrop, args.trimHeadCrop, args.trimLeading,
                        args.trimTrailing, args.trimSlidingWindow,
                        args.trimMinLength, nts2clip_based_ntsContent,
                        jarMaxMemory)
                    runs['Trimmomatic'] = [
                        run_successfully, not_empty_fastq, time_taken, failing,
                        fileSize
                    ]

                    if run_successfully and not_empty_fastq:
                        # Downstream modules work on the trimmed reads
                        fastq_files = paired_reads

                        # Run second Estimated Coverage
                        if not args.skipEstimatedCoverage:
                            run_successfully_estimatedCoverage, pass_qc, time_taken, failing, estimatedCoverage = coverage.getEstimatedCoverage(
                                fastq_files, genomeSize, outdir, threads)
                            runs['second_Coverage'] = [
                                run_successfully_estimatedCoverage, pass_qc,
                                time_taken, failing
                            ]
                        else:
                            print('--skipEstimatedCoverage set. Skipping Second Estimated Coverage analysis')
                            runs['second_Coverage'] = skipped

                        if args.skipEstimatedCoverage or (
                                run_successfully_estimatedCoverage
                                and not estimatedCoverage <
                                args.estimatedMinimumCoverage):
                            # Run second FastQC
                            if not args.skipFastQC:
                                run_successfully, pass_qc, time_taken, failing, maximumReadsLength, nts2clip_based_ntsContent = fastqc.runFastQCanalysis(
                                    outdir, threads, adaptersFasta,
                                    fastq_files)
                                runs['second_FastQC'] = [
                                    run_successfully, pass_qc, time_taken,
                                    failing
                                ]
                            else:
                                print('--skipFastQC set. Skipping Second FastQC analysis')
                                runs['second_FastQC'] = skipped
                        else:
                            print('\n' + 'Estimated coverage is too lower (< ' + str(
                                args.estimatedMinimumCoverage
                            ) + 'x). This sample will not proceed with INNUca pipeline')
                            runs['second_FastQC'] = not_run
                            runs['SPAdes'] = not_run
                            runs['Pilon'] = not_run
                            runs['Assembly_Mapping'] = not_run
                            runs['MLST'] = not_run
                    else:
                        print('Trimmomatic did not run successfully or return zero reads! Skipping Second Estimated Coverage analysis and FastQC analysis')
                        runs['second_Coverage'] = skipped
                        runs['second_FastQC'] = skipped

                else:
                    print('--skipTrimmomatic set. Skipping Trimmomatic, but also Second FastQC analysis and Second Estimated Coverage analysis')
                    runs['Trimmomatic'] = skipped + ['NA']
                    runs['second_Coverage'] = skipped
                    runs['second_FastQC'] = skipped
            else:
                print('\n' + 'This sample does not pass True Coverage module QA/QC. This sample will not proceed with INNUca pipeline')
                runs['first_FastQC'] = not_run
                runs['Trimmomatic'] = not_run + ['NA']
                runs['second_Coverage'] = not_run
                runs['second_FastQC'] = not_run
                runs['SPAdes'] = not_run
                runs['Pilon'] = not_run
                runs['Assembly_Mapping'] = not_run
                runs['MLST'] = not_run

        else:
            print('\n' + 'Estimated coverage is too lower (< ' + str(
                args.estimatedMinimumCoverage
            ) + 'x). This sample will not proceed with INNUca pipeline')
            runs['trueCoverage_ReMatCh'] = not_run
            runs['first_FastQC'] = not_run
            runs['Trimmomatic'] = not_run + ['NA']
            runs['second_Coverage'] = not_run
            runs['second_FastQC'] = not_run
            runs['SPAdes'] = not_run
            runs['Pilon'] = not_run
            runs['Assembly_Mapping'] = not_run
            runs['MLST'] = not_run

        # The coverage gate is evaluated again here on purpose:
        # estimatedCoverage may have been updated by the second estimation
        # made on the trimmed reads
        if args.skipEstimatedCoverage or (
                run_successfully_estimatedCoverage
                and not estimatedCoverage < args.estimatedMinimumCoverage):
            if args.skipTrueCoverage or trueCoverage_config is None or (
                    run_successfully_trueCoverage and pass_qc_trueCoverage):
                # Run SPAdes
                if not args.skipSPAdes:
                    run_successfully, pass_qc, time_taken, failing, contigs_spades = spades.runSpades(
                        sampleName, outdir, threads, fastq_files,
                        args.spadesNotUseCareful, spadesMaxMemory,
                        args.spadesMinCoverageAssembly,
                        args.spadesMinContigsLength, genomeSize,
                        args.spadesKmers, maximumReadsLength,
                        args.spadesDefaultKmers, args.spadesMinKmerCovContigs)
                    runs['SPAdes'] = [
                        run_successfully, pass_qc, time_taken, failing
                    ]

                    if run_successfully:
                        # Run Pilon
                        # The best assembly so far; replaced below by the
                        # polished and then the filtered one when available
                        contigs = contigs_spades

                        if not args.skipPilon:
                            run_successfully, _, time_taken, failing, assembly_polished, bam_file, pilon_folder = pilon.runPilon(
                                jar_path_pilon, contigs_spades, fastq_files,
                                threads, outdir, jarMaxMemory)
                            runs['Pilon'] = [
                                run_successfully, None, time_taken, failing
                            ]

                            if run_successfully:
                                contigs = assembly_polished

                            # Run Assembly Mapping check
                            if bam_file is not None:
                                if not args.skipAssemblyMapping:
                                    run_successfully, pass_qc, time_taken, failing, assembly_filtered = assembly_mapping.runAssemblyMapping(
                                        bam_file, contigs_spades, threads,
                                        outdir,
                                        args.assemblyMinCoverageContigs,
                                        assembly_polished, genomeSize)
                                    runs['Assembly_Mapping'] = [
                                        run_successfully, pass_qc, time_taken,
                                        failing
                                    ]

                                    if run_successfully:
                                        contigs = assembly_filtered
                                else:
                                    print('--skipAssemblyMapping set. Skipping Assembly Mapping check')
                                    runs['Assembly_Mapping'] = skipped
                            else:
                                print('Pilon did not produce the bam file! Skipping Assembly Mapping check')
                                runs['Assembly_Mapping'] = skipped

                            if not args.pilonKeepFiles:
                                utils.removeDirectory(pilon_folder)

                        else:
                            print('--skipPilon set. Skipping Pilon correction and Assembly Mapping check')
                            runs['Pilon'] = skipped
                            runs['Assembly_Mapping'] = skipped

                        print('\n' + 'Final assembly: ' + contigs)
                        with open(os.path.join(outdir, 'final_assembly.txt'),
                                  'wt') as writer:
                            writer.write(contigs + '\n')

                        # Run MLST
                        if not args.skipMLST:
                            runs['MLST'] = mlst.runMlst(
                                contigs, scheme, outdir)
                        else:
                            print('--skipMLST set. Skipping MLST analysis')
                            runs['MLST'] = skipped
                    else:
                        print('SPAdes did not run successfully! Skipping Pilon correction, Assembly Mapping check and MLST analysis')
                        runs['Pilon'] = skipped
                        runs['Assembly_Mapping'] = skipped
                        runs['MLST'] = skipped

                else:
                    print('--skipSPAdes set. Skipping SPAdes Pilon correction, Assembly Mapping check and MLST analysis')
                    runs['SPAdes'] = skipped
                    runs['Pilon'] = skipped
                    runs['Assembly_Mapping'] = skipped
                    runs['MLST'] = skipped
    else:
        print('Moving to the next sample')
        for step in ('first_Coverage', 'trueCoverage_ReMatCh', 'first_FastQC',
                     'Trimmomatic', 'second_Coverage', 'second_FastQC',
                     'SPAdes', 'Pilon', 'Assembly_Mapping', 'MLST'):
            if step == 'Trimmomatic':
                runs[step] = not_run + ['NA']
            else:
                runs[step] = not_run

    # Remove Trimmomatic directory with cleaned reads
    if not args.trimKeepFiles:
        if trimmomatic_folder is None:
            print('It is not possible to remove Trimmomatic directory because Trimmomatic did not run')
        else:
            # Best effort: a filesystem error here must not discard the
            # results of the sample, but it is reported for what it is
            try:
                utils.removeDirectory(trimmomatic_folder)
            except OSError as error:
                print('It was not possible to remove Trimmomatic directory: ' + str(error))

    # Check run
    # A step flag of None means skipped / not run, which does not count as
    # a failure
    run_successfully = all(runs[step][0] or runs[step][0] is None
                           for step in runs)

    pass_fastqIntegrity = runs['FastQ_Integrity'][0]
    # For coverage and FastQC the second run (on trimmed reads) prevails;
    # the first run is only considered when the second has no QC result
    pass_cov = (runs['second_Coverage'][1]
                or (runs['second_Coverage'][1] is None
                    and runs['first_Coverage'][1])) is not False
    pass_trueCov = runs['trueCoverage_ReMatCh'][1] is not False
    pass_fastqc = (runs['second_FastQC'][1]
                   or (runs['second_FastQC'][1] is None
                       and runs['first_FastQC'][1])) is not False
    pass_trimmomatic = runs['Trimmomatic'][1] is not False
    pass_spades = runs['SPAdes'][1] is not False
    pass_assemblyMapping = runs['Assembly_Mapping'][1] is not False
    pass_mlst = runs['MLST'][1] is not False
    pass_qc = all([
        pass_fastqIntegrity, pass_cov, pass_trueCov, pass_fastqc,
        pass_trimmomatic, pass_spades, pass_assemblyMapping, pass_mlst
    ])

    return run_successfully, pass_qc, runs
示例#14
0
def run_innuca(sample_name,
               outdir,
               fastq_files,
               args,
               script_path,
               scheme,
               spades_max_memory,
               jar_path_trimmomatic,
               jar_path_pilon,
               jar_max_memory,
               true_coverage_config,
               rematch_script,
               species_genus,
               mlst_scheme_genus,
               spades_version=None):
    threads = args.threads
    adapters_fasta = args.adapters
    if adapters_fasta is not None:
        adapters_fasta = os.path.abspath(adapters_fasta.name)
    genome_size = args.genomeSizeExpectedMb
    # run_successfully, pass_qc, time_taken, failing, warning, file_size
    skipped = [None, None, 0, {'sample': 'Skipped'}, {}, 'NA']
    not_run = [None, None, 0, {'sample': 'Not run'}, {}, 'NA']

    runs = {}

    # Run FastQ integrity check
    not_corruption_found, pass_qc, time_taken, failing, fastq_encoding, min_reads_length, max_reads_length = \
        fastQintegrity.runFastQintegrity(fastq_files, threads, outdir)
    runs['FastQ_Integrity'] = [
        not_corruption_found, pass_qc, time_taken, failing, {}, 'NA'
    ]

    pear_folder = None
    trimmomatic_folder = None
    if not_corruption_found:
        # Run Kraken
        # most_abundant_taxon_percent = None
        run_successfully_kraken = False
        run_successfully_estimated_coverage = False
        estimated_coverage = None
        run_successfully_true_coverage = False
        pass_qc_true_coverage = False

        trimmomatic_run_successfully = False
        if args.runKraken:
            print('\n' '--runKraken set. Running Kraken for reads')
            run_successfully_kraken, pass_qc, time_taken, failing, warning, _ = \
                kraken(species=args.speciesExpected, files_to_classify=fastq_files, kraken_db=args.krakenDB,
                       files_type='fastq', outdir=outdir, version_kraken=version_kraken_global,
                       db_mem=args.krakenMemory, quick=args.krakenQuick, min_percent_covered=args.krakenMinCov,
                       max_unclassified_frag=args.krakenMaxUnclass, min_base_quality=args.krakenMinQual,
                       threads=threads)
            runs['reads_Kraken'] = [
                run_successfully_kraken, pass_qc, time_taken, failing, warning,
                'NA'
            ]
        else:
            runs['reads_Kraken'] = skipped

        if args.runKraken and \
                (run_successfully_kraken and not pass_qc) and \
                not args.krakenProceed and \
                not args.krakenIgnoreQC:
            print(
                '\n'
                'This sample does not pass Kraken module QA/QC. It will not proceed with INNUca pipeline'
            )
        else:
            # Run first Estimated Coverage
            if not args.skipEstimatedCoverage:
                # Check whether the Estimated Coverage output is already present
                report_file = os.path.join(outdir, 'coverage_report.txt')
                if os.path.isfile(report_file):
                    os.remove(report_file)
                # Run getEstimatedCoverage
                run_successfully_estimated_coverage, pass_qc, time_taken, failing, estimated_coverage = \
                    coverage.getEstimatedCoverage(fastq_files, genome_size, outdir, threads,
                                                  args.estimatedMinimumCoverage)
                runs['first_Coverage'] = [
                    run_successfully_estimated_coverage, pass_qc, time_taken,
                    failing, {}, 'NA'
                ]
            else:
                print(
                    '--skipEstimatedCoverage set. Skipping First Estimated Coverage analysis'
                )
                runs['first_Coverage'] = skipped

            # # Correct first estimation coverage with Kraken percentage
            # # Does not seem to be a good idea (at least for Streptococcus agalactiae)
            # if args.runKraken and \
            #         (runs['Kraken'][0] and runs['Kraken'][1]) and \
            #         most_abundant_taxon_percent is not None and \
            #         estimated_coverage is not None:
            #     new_estimation = estimated_coverage * (most_abundant_taxon_percent / 100)
            #     print('\n'
            #           'Correct estimated coverage ({estimated}x) with Kraken taxon percentage'
            #           ' coverage ({percent}%): {new_estimation}x'.format(estimated=estimated_coverage,
            #                                                              percent=most_abundant_taxon_percent,
            #                                                              new_estimation=new_estimation))
            #     estimated_coverage = new_estimation

            if args.skipEstimatedCoverage or (
                    run_successfully_estimated_coverage and
                    not estimated_coverage < args.estimatedMinimumCoverage):
                if not args.skipTrueCoverage and true_coverage_config is not None:
                    # Run True Coverage
                    run_successfully_true_coverage, pass_qc_true_coverage, time_taken, failing, _ = \
                        trueCoverage.run_true_coverage(sample_name, fastq_files, true_coverage_config['reference_file'],
                                                       threads, outdir,
                                                       true_coverage_config['length_extra_seq'],
                                                       true_coverage_config['minimum_depth_presence'],
                                                       true_coverage_config['minimum_depth_call'],
                                                       true_coverage_config['minimum_depth_frequency_dominant_allele'],
                                                       true_coverage_config['minimum_gene_coverage'], False,
                                                       true_coverage_config['minimum_gene_identity'],
                                                       true_coverage_config, rematch_script, num_map_loc=1,
                                                       bowtie_algorithm=args.trueCoverageBowtieAlgo,
                                                       clean_run_rematch=True)
                    runs['trueCoverage_ReMatCh'] = [
                        run_successfully_true_coverage, pass_qc_true_coverage,
                        time_taken, failing, {}, 'NA'
                    ]
                else:
                    print(
                        '\n' +
                        '--skipTrueCoverage set. Skipping True coverage analysis'
                    )
                    runs['trueCoverage_ReMatCh'] = skipped

                if args.skipTrueCoverage or true_coverage_config is None or args.trueCoverageProceed or \
                        (run_successfully_true_coverage and pass_qc_true_coverage):
                    # Run first FastQC
                    nts2clip_based_nts_content = None
                    if not args.skipFastQC:
                        run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length, \
                            nts2clip_based_nts_content = fastqc.runFastQCanalysis(outdir, threads, adapters_fasta,
                                                                                  fastq_files, args.fastQCkeepFiles,
                                                                                  'first_run')
                        runs['first_FastQC'] = [
                            run_successfully, pass_qc, time_taken, failing,
                            warning, 'NA'
                        ]
                    else:
                        print(
                            '--skipFastQC set. Skipping First FastQC analysis')
                        runs['first_FastQC'] = skipped

                    # Run Trimmomatic
                    not_empty_fastq = True
                    if not args.skipTrimmomatic:
                        run_successfully, not_empty_fastq, time_taken, failing, paired_reads, trimmomatic_folder, \
                            file_size, warning = trimmomatic.runTrimmomatic(jar_path_trimmomatic, sample_name, outdir,
                                                                            threads, adapters_fasta, script_path,
                                                                            args.doNotSearchAdapters, fastq_files,
                                                                            max_reads_length, args.doNotTrimCrops,
                                                                            args.trimCrop, args.trimHeadCrop,
                                                                            args.trimLeading, args.trimTrailing,
                                                                            args.trimSlidingWindow, args.trimMinLength,
                                                                            nts2clip_based_nts_content, jar_max_memory,
                                                                            fastq_encoding)
                        runs['Trimmomatic'] = [
                            run_successfully, None, time_taken, failing,
                            warning, file_size
                        ]
                        trimmomatic_run_successfully = run_successfully

                        if run_successfully and not_empty_fastq:
                            fastq_files = paired_reads
                            min_reads_length = args.trimMinLength

                            # Run second Estimated Coverage
                            if not args.skipEstimatedCoverage:
                                run_successfully_estimated_coverage, pass_qc, time_run, failing, estimated_coverage = \
                                    coverage.getEstimatedCoverage(fastq_files, genome_size, outdir, threads,
                                                                  args.estimatedMinimumCoverage)
                                runs['second_Coverage'] = [
                                    run_successfully_estimated_coverage,
                                    pass_qc, time_run, failing, {}, 'NA'
                                ]
                            else:
                                print(
                                    '--skipEstimatedCoverage set. Skipping Second Estimated Coverage analysis'
                                )
                                runs['second_Coverage'] = skipped

                            if args.skipEstimatedCoverage or (
                                    run_successfully_estimated_coverage
                                    and not estimated_coverage <
                                    args.estimatedMinimumCoverage):
                                # Run second FastQC
                                if not args.skipFastQC:
                                    run_successfully, pass_qc, time_taken, failing, warning, maximum_reads_length, \
                                        nts2clip_based_nts_content = fastqc.runFastQCanalysis(outdir, threads,
                                                                                              adapters_fasta,
                                                                                              fastq_files,
                                                                                              args.fastQCkeepFiles,
                                                                                              'second_run')
                                    runs['second_FastQC'] = [
                                        run_successfully, pass_qc, time_taken,
                                        failing, warning, 'NA'
                                    ]
                                    if run_successfully:
                                        max_reads_length = maximum_reads_length
                                else:
                                    print(
                                        '--skipFastQC set. Skipping Second FastQC analysis'
                                    )
                                    runs['second_FastQC'] = skipped
                            else:
                                print(
                                    '\n'
                                    'Estimated coverage is too lower (< {estimatedMinimumCoverage}x). This sample'
                                    ' will not proceed with INNUca'
                                    ' pipeline'.format(
                                        estimatedMinimumCoverage=args.
                                        estimatedMinimumCoverage))
                                runs['second_FastQC'] = skipped
                        else:
                            print(
                                'Trimmomatic did not run successfully or return zero reads! Skipping Second Estimated'
                                ' Coverage analysis and FastQC analysis')
                            runs['second_Coverage'] = skipped
                            runs['second_FastQC'] = skipped
                    else:
                        print(
                            '--skipTrimmomatic set. Skipping Trimmomatic, but also Second FastQC analysis and Second'
                            ' Estimated Coverage analysis')
                        runs['Trimmomatic'] = skipped
                        runs['second_Coverage'] = skipped
                        runs['second_FastQC'] = skipped

                    if not args.skipFastQC and \
                            (runs['second_FastQC'][1] or (runs['second_FastQC'][1] is None and
                                                          runs['first_FastQC'][1])) is False and \
                            not not_empty_fastq and not args.fastQCproceed:
                        print(
                            '\n'
                            'This sample does not pass FastQC module QA/QC. It will not proceed with INNUca pipeline'
                        )
                else:
                    print(
                        '\n'
                        'This sample does not pass True Coverage module QA/QC. This sample will not proceed with'
                        ' INNUca pipeline')
            else:
                print(
                    '\n'
                    'Estimated coverage is too lower (< {estimatedMinimumCoverage}x). This sample will not proceed'
                    ' with INNUca pipeline'.format(
                        estimatedMinimumCoverage=args.estimatedMinimumCoverage)
                )

        continue_second_part = False
        if not args.runKraken or \
                (runs['reads_Kraken'][0] is True and runs['reads_Kraken'][1] is True) or \
                args.krakenProceed or \
                args.krakenIgnoreQC:
            if args.skipEstimatedCoverage or (
                    run_successfully_estimated_coverage and
                    not estimated_coverage < args.estimatedMinimumCoverage):
                if args.skipTrueCoverage or true_coverage_config is None or args.trueCoverageProceed or \
                        (run_successfully_true_coverage and pass_qc_true_coverage):
                    if args.skipFastQC or (runs['second_FastQC'][1] or
                                           (runs['second_FastQC'][1] is None and
                                            runs['first_FastQC'][1])) is not False or \
                            args.fastQCproceed:
                        continue_second_part = True

        if continue_second_part:
            unassembled_pe_reads = None
            assembled_se_reads = None
            # Run Pear
            if args.runPear:
                print('--runPear set. Running Pear')
                pear_min_overlap = pear.determine_minimum_overlap(
                    args.pearMinOverlap, min_reads_length, max_reads_length)
                run_successfully, pass_qc, time_taken, failing, unassembled_pe_reads, assembled_se_reads, \
                    pear_folder, warning = pear.runPear(fastq_files, threads, outdir, sample_name,
                                                        fastq_encoding, trimmomatic_run_successfully,
                                                        pear_min_overlap)
                runs['Pear'] = [
                    run_successfully, pass_qc, time_taken, failing, warning,
                    'NA'
                ]
            else:
                runs['Pear'] = skipped

            # Run SPAdes
            if not args.skipSPAdes:
                run_successfully, pass_qc, time_taken, failing, contigs_spades, warning = \
                    spades.run_spades(sample_name, outdir, threads,
                                      unassembled_pe_reads if unassembled_pe_reads is not None else fastq_files,
                                      args.spadesNotUseCareful, spades_max_memory,
                                      args.spadesMinCoverageAssembly, args.spadesMinContigsLength, genome_size,
                                      args.spadesKmers, max_reads_length, args.spadesDefaultKmers,
                                      args.spadesMinKmerCovContigs, assembled_se_reads, args.saveExcludedContigs,
                                      args.maxNumberContigs, args.keepSPAdesScaffolds, spades_version=spades_version,
                                      estimated_coverage=estimated_coverage,
                                      spades_not_use_isolate=args.spadesNotUseIsolate)
                runs['SPAdes'] = [
                    run_successfully, pass_qc, time_taken, failing, warning,
                    'NA'
                ]

                if run_successfully:
                    contigs = contigs_spades

                    # Run Assembly Mapping check
                    bam_file = None
                    original_bam = None
                    assembly_mapping_folder = None
                    possible_assemblies_bam_remove = {}
                    if not args.skipAssemblyMapping:
                        run_successfully, pass_qc, time_taken, failing, assembly_filtered, bam_file, \
                            assembly_mapping_folder, warning, original_bam = \
                            assembly_mapping.run_assembly_mapping(fastq_files=fastq_files, reference_file=contigs,
                                                                  outdir=outdir, estimated_genome_size_mb=genome_size,
                                                                  max_number_contigs=args.maxNumberContigs,
                                                                  save_excluded_contigs=args.saveExcludedContigs,
                                                                  min_coverage_assembly=args.assemblyMinCoverageContigs,
                                                                  keep_bam=args.keepBAM, threads=threads)
                        runs['Assembly_Mapping'] = [
                            run_successfully, pass_qc, time_taken, failing,
                            warning, 'NA'
                        ]

                        if run_successfully:
                            # Assembly to remove
                            if not args.keepIntermediateAssemblies:
                                if os.path.isfile(contigs_spades) and \
                                        assembly_filtered is not None and \
                                        assembly_filtered != contigs_spades:
                                    if not args.keepBAM:
                                        os.remove(contigs_spades)
                                    else:
                                        possible_assemblies_bam_remove[
                                            'assembly_mapping'] = contigs_spades

                            if assembly_filtered is not None and \
                                    assembly_filtered != contigs_spades and \
                                    os.path.isfile(assembly_filtered):
                                contigs = assembly_filtered
                    else:
                        print(
                            '--skipAssemblyMapping set. Skipping Assembly Mapping check'
                        )
                        runs['Assembly_Mapping'] = skipped

                    # Run Pilon
                    pilon_new_bam = False
                    pilon_bam = None
                    if not args.skipPilon:
                        run_successfully, _, time_taken, failing, assembly_polished, pilon_folder, pilon_new_bam, \
                            pilon_bam = pilon.run_pilon(jar_path_pilon=jar_path_pilon, assembly=contigs,
                                                        fastq_files=fastq_files, outdir=outdir,
                                                        jar_max_memory=jar_max_memory, alignment_file=bam_file,
                                                        keep_bam=args.keepBAM, threads=threads)
                        runs['Pilon'] = [
                            run_successfully, None, time_taken, failing, {},
                            'NA'
                        ]

                        if run_successfully:
                            if not args.keepIntermediateAssemblies:
                                if os.path.isfile(contigs) and \
                                        assembly_polished is not None and \
                                        os.path.isfile(assembly_polished):
                                    if not args.keepBAM:
                                        os.remove(contigs)
                                    else:
                                        if not pilon_new_bam:
                                            possible_assemblies_bam_remove[
                                                'pilon'] = contigs

                            if assembly_polished is not None and \
                                    os.path.isfile(assembly_polished):
                                contigs = assembly_polished

                        if not args.pilonKeepFiles and os.path.isdir(
                                pilon_folder):
                            utils.removeDirectory(pilon_folder)

                    else:
                        print('--skipPilon set. Skipping Pilon correction')
                        runs['Pilon'] = skipped

                    if not args.keepBAM:
                        if bam_file is not None:
                            if os.path.isfile(bam_file):
                                os.remove(bam_file)
                            if os.path.isfile(bam_file + '.bai'):
                                os.remove(bam_file + '.bai')

                        if original_bam is not None and os.path.isfile(
                                original_bam):
                            os.remove(original_bam)

                        if pilon_bam is not None and os.path.isfile(pilon_bam):
                            os.remove(pilon_bam)

                        if 'assembly_mapping' in possible_assemblies_bam_remove and \
                                os.path.isfile(possible_assemblies_bam_remove['assembly_mapping']):
                            os.remove(possible_assemblies_bam_remove[
                                'assembly_mapping'])
                        if 'pilon' in possible_assemblies_bam_remove and \
                                os.path.isfile(possible_assemblies_bam_remove['pilon']):
                            os.remove(possible_assemblies_bam_remove['pilon'])
                    else:
                        if pilon_new_bam:
                            if bam_file is not None:
                                if os.path.isfile(bam_file):
                                    os.remove(bam_file)
                                if os.path.isfile(bam_file + '.bai'):
                                    os.remove(bam_file + '.bai')

                            if original_bam is not None and os.path.isfile(
                                    original_bam):
                                os.remove(original_bam)

                            if 'assembly_mapping' in possible_assemblies_bam_remove and \
                                    os.path.isfile(possible_assemblies_bam_remove['assembly_mapping']):
                                os.remove(possible_assemblies_bam_remove[
                                    'assembly_mapping'])
                        else:
                            if original_bam is not None and os.path.isfile(original_bam) and \
                                    bam_file is not None and os.path.isfile(bam_file):
                                os.remove(bam_file)
                            if 'pilon' in possible_assemblies_bam_remove and \
                                    os.path.isfile(possible_assemblies_bam_remove['pilon']):
                                os.remove(
                                    possible_assemblies_bam_remove['pilon'])

                    if not args.skipAssemblyMapping:
                        utils.removeDirectory(assembly_mapping_folder)

                    print('\n' + 'Final assembly: ' + contigs)
                    with open(os.path.join(outdir, 'final_assembly.txt'),
                              'wt') as writer:
                        writer.write(contigs + '\n')

                    # Run MLST
                    if not args.skipMLST:
                        run_successfully, pass_qc, time_taken, failing, warning = \
                            mlst.runMlst(contigs, scheme, outdir, species_genus, mlst_scheme_genus)
                        runs['MLST'] = [
                            run_successfully, pass_qc, time_taken, failing,
                            warning, 'NA'
                        ]
                    else:
                        print('--skipMLST set. Skipping MLST analysis')
                        runs['MLST'] = skipped

                    # Run Kraken
                    if args.runKraken:
                        print('\n'
                              '--runKraken set. Running Kraken for assembly')
                        run_successfully, pass_qc, time_taken, failing, warning, _ = \
                            kraken(species=args.speciesExpected, files_to_classify=[contigs], kraken_db=args.krakenDB,
                                   files_type='fasta', outdir=outdir, version_kraken=version_kraken_global,
                                   db_mem=args.krakenMemory, quick=args.krakenQuick,
                                   min_percent_covered=args.krakenMinCov,
                                   max_unclassified_frag=args.krakenMaxUnclass, min_base_quality=args.krakenMinQual,
                                   threads=threads)
                        runs['assembly_Kraken'] = [
                            run_successfully, pass_qc, time_taken, failing,
                            warning, 'NA'
                        ]
                    else:
                        runs['assembly_Kraken'] = skipped

                    # Run insert_size
                    if args.runInsertSize:
                        print('\n' '--runInsertSize set. Running insert_size')
                        run_successfully, _, time_taken, failing = \
                            insert_size(sample_name=sample_name, reference=contigs,
                                        fastq=fastq_files, outdir=outdir, threads=threads, dist=args.insertSizeDist)
                        runs['insert_size'] = [
                            run_successfully, None, time_taken, failing, {},
                            'NA'
                        ]
                    else:
                        runs['insert_size'] = skipped
                else:
                    print(
                        'SPAdes did not run successfully! Skipping Pilon correction, Assembly Mapping check,'
                        ' MLST and Kraken (assembly) analysis and insert size determination'
                    )
            else:
                print(
                    '--skipSPAdes set. Skipping SPAdes, Pilon correction, Assembly Mapping check and MLST and Kraken'
                    ' (assembly) analysis and insert size determination')
                runs['SPAdes'] = skipped
                runs['Assembly_Mapping'] = skipped
                runs['Pilon'] = skipped
                runs['MLST'] = skipped
                runs['assembly_Kraken'] = skipped
                runs['insert_size'] = skipped
    else:
        print('Moving to the next sample')

    for step in ('reads_Kraken', 'first_Coverage', 'trueCoverage_ReMatCh',
                 'first_FastQC', 'Trimmomatic', 'second_Coverage',
                 'second_FastQC', 'Pear', 'SPAdes', 'Assembly_Mapping',
                 'Pilon', 'MLST', 'assembly_Kraken', 'insert_size'):
        if step not in runs:
            runs[step] = not_run

    # Remove Pear directory
    if not args.pearKeepFiles and pear_folder is not None:
        utils.removeDirectory(pear_folder)
    # Remove Trimmomatic directory with cleaned reads
    if not args.trimKeepFiles and trimmomatic_folder is not None:
        utils.removeDirectory(trimmomatic_folder)

    # Check run
    run_successfully = all(runs[step][0] or runs[step][0] is None
                           for step in runs)

    pass_fastq_integrity = runs['FastQ_Integrity'][0]
    pass_reads_kraken = runs['reads_Kraken'][
        1] is not False or args.krakenIgnoreQC
    pass_cov = (runs['second_Coverage'][1]
                or (runs['second_Coverage'][1] is None
                    and runs['first_Coverage'][1])) is not False
    pass_true_cov = runs['trueCoverage_ReMatCh'][
        1] is not False or args.trueCoverageIgnoreQC
    pass_fastqc = (runs['second_FastQC'][1]
                   or (runs['second_FastQC'][1] is None
                       and runs['first_FastQC'][1])) is not False
    # pass_trimmomatic = runs['Trimmomatic'][1] is not False
    # pass_pear = runs['Pear'][1] is not False
    # pass_spades = runs['SPAdes'][1] is not False or runs['Assembly_Mapping'][1] is True
    pass_spades = runs['SPAdes'][1] is not False
    pass_assembly_mapping = runs['Assembly_Mapping'][1] is not False
    pass_pilon = runs['Pilon'][0] is not False
    pass_mlst = runs['MLST'][1] is not False or args.mlstIgnoreQC
    pass_assembly_kraken = runs['assembly_Kraken'][
        1] is not False or args.krakenIgnoreQC
    pass_qc = all([
        pass_fastq_integrity, pass_reads_kraken, pass_cov, pass_true_cov,
        pass_fastqc, pass_spades, pass_assembly_mapping, pass_pilon, pass_mlst,
        pass_assembly_kraken
    ])

    return run_successfully, pass_qc, runs
示例#15
0
def blast_subcommand(args):
    """Create a Blast DB for every reference fasta, then exit the program.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed options of the blast subcommand. The attributes read here are
        ``fasta``, ``type``, ``org`` and ``outdir``. When ``fasta`` is given,
        each element is expected to expose a ``.name`` attribute
        (presumably open file objects produced by argparse - TODO confirm).

    Notes
    -----
    This function never returns normally: it always finishes through
    ``sys.exit``. The exit status is 0 when every DB was created, otherwise
    the collected error messages are passed to ``sys.exit``. Invalid option
    combinations are reported through ``argparse.ArgumentParser.error``.
    """
    msg = []
    if args.fasta is not None and args.type is None:
        msg.append('With --fasta option you must provide the --type')
    # if args.fasta is None and args.org is None:
    #     msg.append('--fasta or --org must be provided')

    if msg:
        argparse.ArgumentParser(prog='blast subcommand options').error(
            '\n'.join(msg))

    utils.required_programs({'makeblastdb': ['-version', '>=', '2.6.0']})

    if args.fasta is not None:
        args.fasta = [os.path.abspath(fasta.name) for fasta in args.fasta]
    else:
        args.fasta, _ = get_fasta_config(args.org)
        # References obtained through --org are always handled as "nucl".
        # Only warn when a different type was explicitly requested; a missing
        # --type (None) is not a "provided" type, so it is forced silently.
        if args.type is not None and args.type != 'nucl':
            print('\n'
                  'ATTENTION: Blast DB type provided was not "nucl"\n'
                  'It was changed to "nucl"'
                  '\n')
        args.type = 'nucl'

        print('\n'
              'Settings that will be used:\n'
              '    fasta: {reference}\n'
              '    Blast DB type: nucl\n'
              '\n'.format(reference=args.fasta))

    utils.removeDirectory(os.path.join(args.outdir, 'pickles', ''))

    error_msg = []
    for fasta in args.fasta:
        # Create DB
        blast_db = os.path.join(
            args.outdir, '{blast_DB}'.format(blast_DB=os.path.basename(fasta)))
        db_exists, original_file = run_blast.check_db_exists(blast_db)
        if not db_exists and not original_file:
            # Nothing is present yet for this reference: build the DB now.
            db_exists = run_blast.create_blast_db(fasta, blast_db, args.type)
            if db_exists:
                print('Blast DB created for {file} in {outdir}'.format(
                    file=fasta, outdir=args.outdir))
            else:
                error_msg.append(
                    'It was not possible to create Blast DB for {}'.format(
                        fasta))
        elif db_exists and original_file:
            # A complete DB is already in place; report it instead of
            # overwriting.
            error_msg.append(
                'Blast DB already found for {file} in {outdir} as {blast_db}'.
                format(file=fasta, outdir=args.outdir, blast_db=blast_db))
        else:
            # Only one of (DB files, original fasta) was found: the output
            # folder is in an inconsistent state.
            error_msg.append(
                'It was found only Blast DB files or the original fasta file from which the Blast DB'
                ' should be produced ({file}). Either include the missing files or remove the ones present'
                ' (usually the original fasta file)'.format(file=fasta))

    if not error_msg:
        sys.exit(0)
    else:
        sys.exit('\n'.join(error_msg))
示例#16
0
def main():
    """Command-line entry point of ecoli_stx_subtyping.py.

    Builds the argument parser (adding the stx2-specific options to the
    reads and assembly subcommands), validates the percentage options, runs
    the selected subcommand through ``args.func``, parses the typing results
    and writes the stx1:stx2 subtype summary into the output directory.
    """
    program_name = 'ecoli_stx_subtyping.py'

    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 {}"'.format(
            program_name))

    parser, parser_reads, _, parser_assembly, _ = python_arguments(
        program_name=program_name, version=version)
    parser.description = 'Gets E. coli stx subtypes'

    # The reads and assembly subcommands receive the same two extra options
    for subparser in (parser_reads, parser_assembly):
        subparser.add_argument(
            '--stx2covered',
            type=float,
            metavar='N',
            help='Minimal percentage of sequence covered to consider extra stx2'
            ' subtypes (value between [0, 100]) (default: 100)',
            required=False,
            default=100)
        subparser.add_argument(
            '--stx2identity',
            type=float,
            metavar='N',
            help='Minimal sequence identity to consider extra stx2'
            ' subtypes (value between [0, 100]) (default: 99.5)',
            required=False,
            default=99.5)

    args = parser.parse_args()

    # Options that must lie within [0, 100], paired with their complaint.
    # Attributes are fetched one at a time, in this order, inside the loop.
    bounded_options = (
        ('minGeneCoverage',
         '--minGeneCoverage should be a value between [0, 100]'),
        ('minGeneIdentity',
         '--minGeneIdentity should be a value between [0, 100]'),
        ('stx2covered',
         '--stx2covered should be a value between [0, 100]'),
        ('stx2identity',
         '--stx2identity should be a value between [0, 100]'),
    )
    problems = []
    for attribute, complaint in bounded_options:
        value = getattr(args, attribute)
        if value < 0 or value > 100:
            problems.append(complaint)
    if args.org != ['stx', 'subtyping']:
        problems.append(
            'Use "--org stx subtyping" with {}'.format(program_name))

    if problems:
        options_parser = argparse.ArgumentParser(
            prog='{} options'.format(program_name))
        options_parser.error('\n'.join(problems))

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)

    utils.general_information(script_name=program_name,
                              logfile=logfile,
                              version=version,
                              outdir=args.outdir,
                              time_str=time_str)
    print('\n')

    # Create modules pickles folder (scheduled for removal at the end)
    pickles_folder = os.path.join(args.outdir, 'pickles', '')
    if not os.path.isdir(pickles_folder):
        os.makedirs(pickles_folder)
    folders_2_remove = [pickles_folder]

    # Run the function bound to the chosen subcommand
    subcommand_output = args.func(args)
    folders_2_remove_func, references_results, reference, references_headers = subcommand_output
    folders_2_remove.extend(folders_2_remove_func)

    # Parse results (the five returned values are not needed here)
    _, _, _, _, _ = parse_results.parse_results(
        references_results, reference, references_headers, args.outdir,
        args.minGeneCoverage, args.minDepthCoverage, args.typeSeparator)

    types_report = os.path.join(args.outdir, 'seq_typing.report_types.tab')

    def first_reference_with(tag):
        # First reference file whose lower-cased basename contains tag
        matching = [
            ref_file for ref_file in reference
            if tag in os.path.basename(ref_file).lower()
        ]
        return matching[0]

    stx1_reference = first_reference_with('stx1')
    stx2_reference = first_reference_with('stx2')
    stx1_result, stx2_result = stx_subtype_parser(
        types_report, stx1_reference, stx2_reference, args.stx2covered,
        args.stx2identity)

    # Rename the file to keep ecoli_stx_subtyping stamp
    if os.path.isfile(types_report):
        stamped_report = os.path.join(
            args.outdir, 'seq_typing.ecoli_stx_subtyping.report_types.tab')
        os.rename(types_report, stamped_report)

    # Remove the file to only keep the ecoli_stx_subtyping one
    plain_report = os.path.join(args.outdir, 'seq_typing.report.txt')
    if os.path.isfile(plain_report):
        os.remove(plain_report)

    print('\n'
          'E. coli stx_subtyping - {stx1_result}:{stx2_result}\n'
          '\n'.format(stx1_result=stx1_result, stx2_result=stx2_result))
    summary_file = os.path.join(args.outdir,
                                'seq_typing.ecoli_stx_subtyping.txt')
    with open(summary_file, 'wt') as writer:
        writer.write(':'.join([stx1_result, stx2_result]))

    # Temporary folders are kept only in debug mode
    if not args.debug:
        for folder in folders_2_remove:
            utils.removeDirectory(folder)

    utils.runTime(start_time)