def main():
    """Entry point for seq_typing.py.

    Parses the command line, runs the sub-command selected by the user
    (``args.func``), feeds its outputs to ``parse_results.parse_results`` and,
    unless --debug was given, removes the temporary folders created on the way.
    """
    program_name = 'seq_typing.py'

    # Refuse to run under Python 2.
    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 {}"'.format(program_name))

    parser, _, _, _, _ = python_arguments(program_name, __version__)
    args = parser.parse_args()

    start_time = time.time()

    # Normalise the output directory to an absolute path and make sure it exists.
    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)
    script_path = utils.general_information(script_name=program_name, logfile=logfile,
                                            version=__version__, outdir=args.outdir,
                                            time_str=time_str)
    del script_path

    print('\n')

    # Temporary folders collected here are deleted at the end (unless --debug).
    removable_folders = []

    # Create modules pickles folder
    pickles_folder = os.path.join(args.outdir, 'pickles', '')
    if not os.path.isdir(pickles_folder):
        os.makedirs(pickles_folder)
    removable_folders.append(pickles_folder)

    # Run the sub-command chosen on the command line.
    extra_folders, references_results, reference, references_headers = args.func(args)
    removable_folders.extend(extra_folders)

    # Parse results
    _, _, _, _, _ = parse_results.parse_results(references_results, reference,
                                                references_headers, args.outdir,
                                                args.minGeneCoverage,
                                                args.minDepthCoverage,
                                                args.typeSeparator)

    # Clean up unless the user asked to keep intermediates for debugging.
    if not args.debug:
        for folder in removable_folders:
            utils.removeDirectory(folder)

    _ = utils.runTime(start_time)
def main():
    """Entry point for INNUca.py (v3.1).

    Parses arguments, sets up logging, verifies external program versions,
    then runs the INNUca pipeline (``run_INNUca``) per sample, writing
    per-sample and combined reports.  Exits non-zero if no sample ran
    successfully.  Written for Python 2 (print statements).
    """
    version = '3.1'
    args = utils.parseArguments(version)

    general_start_time = time.time()
    time_str = time.strftime("%Y%m%d-%H%M%S")

    # Check if output directory exists
    outdir = os.path.abspath(os.path.join(args.outdir, ''))
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Start logger (utils.Logger tees stdout to a logfile; only when logging
    # is not disabled with --noLog)
    if not args.noLog:
        sys.stdout = utils.Logger(outdir, time_str)

    print '\n' + '==========> INNUca.py <=========='
    print '\n' + 'Program start: ' + time.ctime()

    # Tells where the logfile will be stored
    if not args.noLog:
        print '\n' + 'LOGFILE:'
        print sys.stdout.getLogFile()

    # Print command
    print '\n' + 'COMMAND:'
    script_path = os.path.abspath(sys.argv[0])
    print sys.executable + ' ' + script_path + ' ' + ' '.join(sys.argv[1:])

    # Print directory where programme was lunch
    print '\n' + 'PRESENT DIRECTORY:'
    print os.getcwd()

    # Print program version
    print '\n' + 'VERSION INNUca.py:'
    utils.scriptVersionGit(version, os.getcwd(), script_path, args.noGitInfo)

    # Get CPU information
    utils.get_cpu_information(outdir, time_str)

    # Get trueCoverage_ReMatCh settings
    trueCoverage_config = get_trueCoverage_config(
        args.skipTrueCoverage,
        args.trueConfigFile.name if args.trueConfigFile is not None else None,
        args.speciesExpected, script_path)

    # Check programms
    programs_version_dictionary = {}
    programs_version_dictionary['gunzip'] = ['--version', '>=', '1.6']

    # Java check first for java dependents check next
    if not (args.skipFastQC and args.skipTrimmomatic and (args.skipPilon or args.skipSPAdes)):
        # programs_version_dictionary['java'] = ['-version', '>=', '1.8']
        programs_version_dictionary['java'] = [None, '>=', '1.8']  # For OpenJDK compatibility
    missingPrograms, programs_version_dictionary = utils.checkPrograms(programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # Register the remaining tools to verify, depending on which pipeline
    # steps were not skipped.
    if not args.skipTrueCoverage or trueCoverage_config is not None:
        include_rematch_dependencies_path(args.doNotUseProvidedSoftware)
        programs_version_dictionary['rematch.py'] = ['--version', '>=', '3.2']
        programs_version_dictionary['bcftools'] = ['--version', '==', '1.3.1']
    if not (args.skipTrueCoverage and ((args.skipAssemblyMapping and args.skipPilon) or args.skipSPAdes)):
        programs_version_dictionary['bowtie2'] = ['--version', '>=', '2.2.9']
        programs_version_dictionary['samtools'] = ['--version', '==', '1.3.1']
    if not args.skipFastQC:
        programs_version_dictionary['fastqc'] = ['--version', '==', '0.11.5']
    if not args.skipTrimmomatic:
        programs_version_dictionary['trimmomatic-0.36.jar'] = ['-version', '==', '0.36']
    if args.runPear:
        programs_version_dictionary['pear'] = ['--version', '>=', '0.9.10']
    if not args.skipSPAdes:
        programs_version_dictionary['spades.py'] = ['--version', '>=', '3.9.0']
    if not (args.skipPilon or args.skipSPAdes):
        programs_version_dictionary['pilon-1.18.jar'] = ['--version', '==', '1.18']
    if not (args.skipMLST or args.skipSPAdes):
        programs_version_dictionary['mlst'] = ['--version', '>=', '2.4']

    # Set and print PATH variable
    utils.setPATHvariable(args, script_path)

    missingPrograms, programs_version_dictionary = utils.checkPrograms(programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # .jar paths
    # NOTE(review): index [3] looks like the program path appended by
    # utils.checkPrograms — confirm against utils module.
    jar_path_trimmomatic = None
    if not args.skipTrimmomatic:
        jar_path_trimmomatic = programs_version_dictionary['trimmomatic-0.36.jar'][3]

    jar_path_pilon = None
    if not args.skipPilon and not args.skipSPAdes:
        jar_path_pilon = programs_version_dictionary['pilon-1.18.jar'][3]

    rematch_script = None
    # ReMatCh path
    if not args.skipTrueCoverage:
        rematch_script = programs_version_dictionary['rematch.py'][3]

    # pairEnd_filesSeparation_list = args.pairEnd_filesSeparation
    pairEnd_filesSeparation_list = None

    samples, inputDirectory, removeCreatedSamplesDirectories, indir_same_outdir = get_samples(
        args.inputDirectory, args.fastq, outdir, pairEnd_filesSeparation_list)

    # Start running the analysis
    print '\n' + 'RUNNING INNUca.py'

    # Prepare run report file
    samples_report_path = os.path.join(outdir, 'samples_report.' + time_str + '.tab')
    utils.start_sample_report_file(samples_report_path)

    number_samples_successfully = 0
    number_samples_pass = 0
    number_samples_warning = 0

    # Get MLST scheme to use
    scheme = 'unknown'
    species_genus, mlst_scheme_genus = None, None
    if not args.skipMLST and not args.skipSPAdes:
        scheme, species_genus, mlst_scheme_genus = mlst.getScheme(args.speciesExpected)

    # Print path to blastn
    mlst.getBlastPath()

    # Memory (utils.get_free_memory() presumably returns KB; converted to GB here
    # — TODO confirm units against utils)
    available_memory_GB = utils.get_free_memory() / (1024.0 ** 2)

    # Determine SPAdes maximum memory
    spadesMaxMemory = None
    if not args.skipSPAdes:
        print ''
        spadesMaxMemory = spades.define_memory(args.spadesMaxMemory, args.threads, available_memory_GB)

    # Determine .jar maximum memory
    jarMaxMemory = 'off'
    if not (args.skipTrimmomatic and (args.skipSPAdes or args.skipPilon)):
        print ''
        jarMaxMemory = utils.define_jar_max_memory(args.jarMaxMemory, args.threads, available_memory_GB)

    # Run INNUca for each sample
    sample_report_json = {}
    for sample in samples:
        sample_start_time = time.time()
        print '\n' + 'Sample: ' + sample + '\n'

        # Create sample outdir
        sample_outdir = os.path.abspath(os.path.join(outdir, sample, ''))
        if not os.path.isdir(sample_outdir):
            os.makedirs(sample_outdir)

        # Get fastq files; paired-end data is required, so 0 or 1 file skips
        # the sample.
        fastq_files = utils.searchFastqFiles(os.path.join(inputDirectory, sample, ''),
                                             pairEnd_filesSeparation_list, False)
        if len(fastq_files) == 1:
            print 'Only one fastq file was found: ' + str(fastq_files)
            print 'Pair-End sequencing is required. Moving to the next sample'
            continue
        elif len(fastq_files) == 0:
            print 'No compressed fastq files were found. Continue to the next sample'
            continue

        print 'The following files will be used:'
        print str(fastq_files) + '\n'

        # Run INNUca.py analysis
        run_successfully, pass_qc, run_report = run_INNUca(sample, sample_outdir, fastq_files,
                                                           args, script_path, scheme,
                                                           spadesMaxMemory, jar_path_trimmomatic,
                                                           jar_path_pilon, jarMaxMemory,
                                                           trueCoverage_config, rematch_script,
                                                           species_genus, mlst_scheme_genus)

        # Save sample fail report
        utils.write_fail_report(os.path.join(sample_outdir, 'fail_report.txt'), run_report)

        # Save warning report
        write_warning_report(os.path.join(sample_outdir, 'warning_report.txt'), run_report)

        # Get raw reads files size
        fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files)

        # Remove sample directory if it was created during the process
        if removeCreatedSamplesDirectories and not indir_same_outdir:
            utils.removeDirectory(os.path.join(inputDirectory, sample, ''))

        print 'END ' + sample + ' analysis'
        time_taken = utils.runTime(sample_start_time)

        # Save run report
        warning, json_pass_qc = utils.write_sample_report(samples_report_path, sample,
                                                          run_successfully, pass_qc,
                                                          time_taken, fileSize, run_report)

        # Save runs statistics
        if run_successfully:
            number_samples_successfully += 1
        if pass_qc:
            if warning:
                number_samples_warning += 1
            else:
                number_samples_pass += 1

        sample_report_json[sample] = {'run_successfully': run_successfully,
                                      'pass_qc': json_pass_qc,
                                      'modules_run_report': run_report}

    # Save combine_samples_reports
    combine_reports.combine_reports(outdir, outdir, args.json, time_str, len(samples))

    # Save sample_report in json
    if args.json:
        import json
        with open(os.path.join(outdir, 'samples_report.' + time_str + '.json'), 'wt') as writer:
            json.dump(sample_report_json, writer)

    # Remove temporary folder with symlink to fastq files in case of --fastq use
    if args.inputDirectory is None and args.fastq is not None:
        utils.removeDirectory(os.path.join(inputDirectory, ''))

    # Run report
    print '\n' + 'END INNUca.py'
    print '\n' + 'Pipeline problems: {not_run_successfully} samples'.format(not_run_successfully=(len(samples) - number_samples_successfully))
    print '\n' + 'FAIL: {number_samples_fail} samples'.format(number_samples_fail=(len(samples) - number_samples_pass - number_samples_warning))
    print '\n' + 'WARNING: {number_samples_warning} samples'.format(number_samples_warning=number_samples_warning)
    print '\n' + 'PASS: {number_samples_pass} samples'.format(number_samples_pass=number_samples_pass)

    time_taken = utils.runTime(general_start_time)
    del time_taken

    # Check whether INNUca.py run at least one sample successfully
    if number_samples_successfully == 0:
        sys.exit('No samples run successfully!')
def main():
    """Entry point for INNUca.py (v3.1) — duplicate copy of the same function.

    Parses arguments, sets up logging, verifies external program versions,
    then runs the INNUca pipeline (``run_INNUca``) per sample, writing
    per-sample and combined reports.  Exits non-zero if no sample ran
    successfully.  Written for Python 2 (print statements).
    """
    version = '3.1'
    args = utils.parseArguments(version)

    general_start_time = time.time()
    time_str = time.strftime("%Y%m%d-%H%M%S")

    # Check if output directory exists
    outdir = os.path.abspath(os.path.join(args.outdir, ''))
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Start logger (utils.Logger tees stdout to a logfile; only when logging
    # is not disabled with --noLog)
    if not args.noLog:
        sys.stdout = utils.Logger(outdir, time_str)

    print '\n' + '==========> INNUca.py <=========='
    print '\n' + 'Program start: ' + time.ctime()

    # Tells where the logfile will be stored
    if not args.noLog:
        print '\n' + 'LOGFILE:'
        print sys.stdout.getLogFile()

    # Print command
    print '\n' + 'COMMAND:'
    script_path = os.path.abspath(sys.argv[0])
    print sys.executable + ' ' + script_path + ' ' + ' '.join(sys.argv[1:])

    # Print directory where programme was lunch
    print '\n' + 'PRESENT DIRECTORY:'
    print os.getcwd()

    # Print program version
    print '\n' + 'VERSION INNUca.py:'
    utils.scriptVersionGit(version, os.getcwd(), script_path, args.noGitInfo)

    # Get CPU information
    utils.get_cpu_information(outdir, time_str)

    # Get trueCoverage_ReMatCh settings
    trueCoverage_config = get_trueCoverage_config(
        args.skipTrueCoverage,
        args.trueConfigFile.name if args.trueConfigFile is not None else None,
        args.speciesExpected, script_path)

    # Check programms
    programs_version_dictionary = {}
    programs_version_dictionary['gunzip'] = ['--version', '>=', '1.6']

    # Java check first for java dependents check next
    if not (args.skipFastQC and args.skipTrimmomatic and (args.skipPilon or args.skipSPAdes)):
        # programs_version_dictionary['java'] = ['-version', '>=', '1.8']
        programs_version_dictionary['java'] = [None, '>=', '1.8']  # For OpenJDK compatibility
    missingPrograms, programs_version_dictionary = utils.checkPrograms(programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # Register the remaining tools to verify, depending on which pipeline
    # steps were not skipped.
    if not args.skipTrueCoverage or trueCoverage_config is not None:
        include_rematch_dependencies_path(args.doNotUseProvidedSoftware)
        programs_version_dictionary['rematch.py'] = ['--version', '>=', '3.2']
        programs_version_dictionary['bcftools'] = ['--version', '==', '1.3.1']
    if not (args.skipTrueCoverage and ((args.skipAssemblyMapping and args.skipPilon) or args.skipSPAdes)):
        programs_version_dictionary['bowtie2'] = ['--version', '>=', '2.2.9']
        programs_version_dictionary['samtools'] = ['--version', '==', '1.3.1']
    if not args.skipFastQC:
        programs_version_dictionary['fastqc'] = ['--version', '==', '0.11.5']
    if not args.skipTrimmomatic:
        programs_version_dictionary['trimmomatic-0.36.jar'] = ['-version', '==', '0.36']
    if args.runPear:
        programs_version_dictionary['pear'] = ['--version', '>=', '0.9.10']
    if not args.skipSPAdes:
        programs_version_dictionary['spades.py'] = ['--version', '>=', '3.9.0']
    if not (args.skipPilon or args.skipSPAdes):
        programs_version_dictionary['pilon-1.18.jar'] = ['--version', '==', '1.18']
    if not (args.skipMLST or args.skipSPAdes):
        programs_version_dictionary['mlst'] = ['--version', '>=', '2.4']

    # Set and print PATH variable
    utils.setPATHvariable(args, script_path)

    missingPrograms, programs_version_dictionary = utils.checkPrograms(programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # .jar paths
    # NOTE(review): index [3] looks like the program path appended by
    # utils.checkPrograms — confirm against utils module.
    jar_path_trimmomatic = None
    if not args.skipTrimmomatic:
        jar_path_trimmomatic = programs_version_dictionary['trimmomatic-0.36.jar'][3]

    jar_path_pilon = None
    if not args.skipPilon and not args.skipSPAdes:
        jar_path_pilon = programs_version_dictionary['pilon-1.18.jar'][3]

    rematch_script = None
    # ReMatCh path
    if not args.skipTrueCoverage:
        rematch_script = programs_version_dictionary['rematch.py'][3]

    # pairEnd_filesSeparation_list = args.pairEnd_filesSeparation
    pairEnd_filesSeparation_list = None

    samples, inputDirectory, removeCreatedSamplesDirectories, indir_same_outdir = get_samples(
        args.inputDirectory, args.fastq, outdir, pairEnd_filesSeparation_list)

    # Start running the analysis
    print '\n' + 'RUNNING INNUca.py'

    # Prepare run report file
    samples_report_path = os.path.join(outdir, 'samples_report.' + time_str + '.tab')
    utils.start_sample_report_file(samples_report_path)

    number_samples_successfully = 0
    number_samples_pass = 0
    number_samples_warning = 0

    # Get MLST scheme to use
    scheme = 'unknown'
    species_genus, mlst_scheme_genus = None, None
    if not args.skipMLST and not args.skipSPAdes:
        scheme, species_genus, mlst_scheme_genus = mlst.getScheme(args.speciesExpected)

    # Print path to blastn
    mlst.getBlastPath()

    # Memory (utils.get_free_memory() presumably returns KB; converted to GB here
    # — TODO confirm units against utils)
    available_memory_GB = utils.get_free_memory() / (1024.0 ** 2)

    # Determine SPAdes maximum memory
    spadesMaxMemory = None
    if not args.skipSPAdes:
        print ''
        spadesMaxMemory = spades.define_memory(args.spadesMaxMemory, args.threads, available_memory_GB)

    # Determine .jar maximum memory
    jarMaxMemory = 'off'
    if not (args.skipTrimmomatic and (args.skipSPAdes or args.skipPilon)):
        print ''
        jarMaxMemory = utils.define_jar_max_memory(args.jarMaxMemory, args.threads, available_memory_GB)

    # Run INNUca for each sample
    sample_report_json = {}
    for sample in samples:
        sample_start_time = time.time()
        print '\n' + 'Sample: ' + sample + '\n'

        # Create sample outdir
        sample_outdir = os.path.abspath(os.path.join(outdir, sample, ''))
        if not os.path.isdir(sample_outdir):
            os.makedirs(sample_outdir)

        # Get fastq files; paired-end data is required, so 0 or 1 file skips
        # the sample.
        fastq_files = utils.searchFastqFiles(os.path.join(inputDirectory, sample, ''),
                                             pairEnd_filesSeparation_list, False)
        if len(fastq_files) == 1:
            print 'Only one fastq file was found: ' + str(fastq_files)
            print 'Pair-End sequencing is required. Moving to the next sample'
            continue
        elif len(fastq_files) == 0:
            print 'No compressed fastq files were found. Continue to the next sample'
            continue

        print 'The following files will be used:'
        print str(fastq_files) + '\n'

        # Run INNUca.py analysis
        run_successfully, pass_qc, run_report = run_INNUca(sample, sample_outdir, fastq_files,
                                                           args, script_path, scheme,
                                                           spadesMaxMemory, jar_path_trimmomatic,
                                                           jar_path_pilon, jarMaxMemory,
                                                           trueCoverage_config, rematch_script,
                                                           species_genus, mlst_scheme_genus)

        # Save sample fail report
        utils.write_fail_report(os.path.join(sample_outdir, 'fail_report.txt'), run_report)

        # Save warning report
        write_warning_report(os.path.join(sample_outdir, 'warning_report.txt'), run_report)

        # Get raw reads files size
        fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files)

        # Remove sample directory if it was created during the process
        if removeCreatedSamplesDirectories and not indir_same_outdir:
            utils.removeDirectory(os.path.join(inputDirectory, sample, ''))

        print 'END ' + sample + ' analysis'
        time_taken = utils.runTime(sample_start_time)

        # Save run report
        warning, json_pass_qc = utils.write_sample_report(samples_report_path, sample,
                                                          run_successfully, pass_qc,
                                                          time_taken, fileSize, run_report)

        # Save runs statistics
        if run_successfully:
            number_samples_successfully += 1
        if pass_qc:
            if warning:
                number_samples_warning += 1
            else:
                number_samples_pass += 1

        sample_report_json[sample] = {'run_successfully': run_successfully,
                                      'pass_qc': json_pass_qc,
                                      'modules_run_report': run_report}

    # Save combine_samples_reports
    combine_reports.combine_reports(outdir, outdir, args.json, time_str, len(samples))

    # Save sample_report in json
    if args.json:
        import json
        with open(os.path.join(outdir, 'samples_report.' + time_str + '.json'), 'wt') as writer:
            json.dump(sample_report_json, writer)

    # Remove temporary folder with symlink to fastq files in case of --fastq use
    if args.inputDirectory is None and args.fastq is not None:
        utils.removeDirectory(os.path.join(inputDirectory, ''))

    # Run report
    print '\n' + 'END INNUca.py'
    print '\n' + 'Pipeline problems: {not_run_successfully} samples'.format(not_run_successfully=(len(samples) - number_samples_successfully))
    print '\n' + 'FAIL: {number_samples_fail} samples'.format(number_samples_fail=(len(samples) - number_samples_pass - number_samples_warning))
    print '\n' + 'WARNING: {number_samples_warning} samples'.format(number_samples_warning=number_samples_warning)
    print '\n' + 'PASS: {number_samples_pass} samples'.format(number_samples_pass=number_samples_pass)

    time_taken = utils.runTime(general_start_time)
    del time_taken

    # Check whether INNUca.py run at least one sample successfully
    if number_samples_successfully == 0:
        sys.exit('No samples run successfully!')
def main():
    """Entry point for INNUca.py (v2.0).

    Older variant of the pipeline driver: checks tool versions, resolves the
    MLST scheme and trueCoverage_ReMatCh settings, then runs ``run_INNUca``
    for each sample found in the input directory and writes the run reports.
    Written for Python 2 (print statements).
    """
    version = '2.0'
    args = utils.parseArguments(version)

    general_start_time = time.time()
    time_str = time.strftime("%Y%m%d-%H%M%S")

    # Check if output directory exists
    outdir = os.path.abspath(os.path.join(args.outdir, ''))
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Start logger (utils.Logger tees stdout to a logfile)
    sys.stdout = utils.Logger(outdir, time_str)

    print '\n' + '==========> INNUca.py <=========='
    print '\n' + 'Program start: ' + time.ctime()

    # Tells where the logfile will be stored
    print '\n' + 'LOGFILE:'
    print sys.stdout.getLogFile()

    # Print command
    print '\n' + 'COMMAND:'
    script_path = os.path.abspath(sys.argv[0])
    print sys.executable + ' ' + script_path + ' ' + ' '.join(sys.argv[1:])

    # Print directory where programme was lunch
    print '\n' + 'PRESENT DIRECTORY :'
    print os.getcwd()

    # Print program version
    print '\n' + 'VERSION INNUca.py:'
    utils.scriptVersionGit(version, os.getcwd(), script_path)

    # Get CPU information
    utils.get_cpu_information(outdir, time_str)

    # Set and print PATH variable
    utils.setPATHvariable(args.doNotUseProvidedSoftware, script_path)

    # Check programms — register only the tools needed by the steps that were
    # not skipped, then verify their presence/versions in one call.
    programs_version_dictionary = {}
    programs_version_dictionary['gunzip'] = ['--version', '>=', '1.6']
    if (not args.skipTrueCoverage or (not args.skipPilon and not args.skipSPAdes)):
        programs_version_dictionary['bowtie2'] = ['--version', '>=', '2.2.9']
        programs_version_dictionary['samtools'] = ['--version', '==', '1.3.1']
    if not (args.skipFastQC and args.skipTrimmomatic and (args.skipPilon or args.skipSPAdes)):
        programs_version_dictionary['java'] = ['-version', '>=', '1.8']
    if not args.skipFastQC:
        programs_version_dictionary['fastqc'] = ['--version', '==', '0.11.5']
    if not args.skipTrimmomatic:
        programs_version_dictionary['trimmomatic-0.36.jar'] = ['-version', '==', '0.36']
    if not args.skipSPAdes:
        programs_version_dictionary['spades.py'] = ['--version', '>=', '3.9.0']
    if not args.skipPilon and not args.skipSPAdes:
        programs_version_dictionary['pilon-1.18.jar'] = ['--version', '==', '1.18']
    if not args.skipMLST and not args.skipSPAdes:
        programs_version_dictionary['mlst'] = ['--version', '>=', '2.4']
    missingPrograms, programs_version_dictionary = utils.checkPrograms(programs_version_dictionary)
    if len(missingPrograms) > 0:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))

    # .jar paths
    # NOTE(review): index [3] looks like the program path appended by
    # utils.checkPrograms — confirm against utils module.
    jar_path_trimmomatic = None
    if not args.skipTrimmomatic:
        jar_path_trimmomatic = programs_version_dictionary['trimmomatic-0.36.jar'][3]
    jar_path_pilon = None
    if not args.skipPilon and not args.skipSPAdes:
        jar_path_pilon = programs_version_dictionary['pilon-1.18.jar'][3]

    # Check if input directory exists with fastq files and store samples name that have fastq files
    inputDirectory = os.path.abspath(os.path.join(args.inputDirectory, ''))
    # pairEnd_filesSeparation_list = args.pairEnd_filesSeparation
    pairEnd_filesSeparation_list = None
    print ''
    samples, removeCreatedSamplesDirectories, indir_same_outdir = utils.checkSetInputDirectory(
        inputDirectory, outdir, pairEnd_filesSeparation_list)

    # Start running the analysis
    print '\n' + 'RUNNING INNUca.py'

    # Prepare run report file
    samples_report_path = os.path.join(outdir, 'samples_report.' + time_str + '.tab')
    utils.start_sample_report_file(samples_report_path)

    number_samples_successfully = 0
    number_samples_pass = 0

    # Get MLST scheme to use
    scheme = 'unknown'
    if not args.skipMLST and not args.skipSPAdes:
        scheme = mlst.getScheme(args.speciesExpected)

    # Get path to blastn
    mlst.getBlastPath()

    # Get trueCoverage_ReMatCh settings: use the user-provided config file when
    # given, otherwise look for a species default shipped with the script.
    trueCoverage_config = None
    if not args.skipTrueCoverage:
        trueCoverage_reference = None
        trueCoverage_config_file = None
        trueCoverage_config = None
        if args.trueConfigFile is None:
            print 'No trueCoverage_ReMatCh config file was provided. Search for default files'
            trueCoverage_config_file, trueCoverage_reference = trueCoverage.check_existing_default_config(
                args.speciesExpected, script_path)
        else:
            trueCoverage_config_file = args.trueConfigFile.name
        if trueCoverage_config_file is not None:
            trueCoverage_config = trueCoverage.parse_config(trueCoverage_config_file)
        if args.trueConfigFile is None and trueCoverage_config is not None:
            trueCoverage_config['reference_file'] = trueCoverage_reference
        if trueCoverage_config is not None:
            print 'The following trueCoverage_ReMatCh config file will be used: ' + trueCoverage_config_file
            print 'The following trueCoverage_ReMatCh reference file will be used: ' + trueCoverage_config['reference_file'] + '\n'
        else:
            print 'No trueCoverage_ReMatCh config file was found'

    # Memory (utils.get_free_memory() presumably returns KB; converted to GB here
    # — TODO confirm units against utils)
    available_memory_GB = utils.get_free_memory() / (1024.0 ** 2)

    # Determine SPAdes maximum memory
    spadesMaxMemory = None
    if not args.skipSPAdes:
        print ''
        spadesMaxMemory = spades.define_memory(args.spadesMaxMemory, args.threads, available_memory_GB)

    # Determine .jar maximum memory
    jarMaxMemory = 'off'
    if not (args.skipTrimmomatic and (args.skipSPAdes or args.skipPilon)):
        print ''
        jarMaxMemory = utils.define_jar_max_memory(args.jarMaxMemory, args.threads, available_memory_GB)

    # Run INNUca for each sample
    for sample in samples:
        sample_start_time = time.time()
        print '\n' + 'Sample: ' + sample + '\n'

        # Create sample outdir
        sample_outdir = os.path.abspath(os.path.join(outdir, sample, ''))
        if not os.path.isdir(sample_outdir):
            os.makedirs(sample_outdir)

        # Get fastq files; paired-end data is required, so a single file skips
        # the sample.
        fastq_files = utils.searchFastqFiles(os.path.join(inputDirectory, sample, ''),
                                             pairEnd_filesSeparation_list, False)
        if len(fastq_files) == 1:
            print 'Only one fastq file was found: ' + str(fastq_files)
            print 'Pair-End sequencing is required. Moving to the next sample'
            continue

        print 'The following files will be used:'
        print str(fastq_files) + '\n'

        # Run INNUca.py analysis
        run_successfully, pass_qc, run_report = run_INNUca(sample, sample_outdir, fastq_files,
                                                           args, script_path, scheme,
                                                           spadesMaxMemory, jar_path_trimmomatic,
                                                           jar_path_pilon, jarMaxMemory,
                                                           trueCoverage_config)

        # Save sample fail report
        fail_report_path = os.path.join(sample_outdir, 'fail_report.txt')
        utils.write_fail_report(fail_report_path, run_report)

        # Save runs statistics
        if run_successfully:
            number_samples_successfully += 1
        if pass_qc:
            number_samples_pass += 1

        # Get raw reads files size
        fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files)

        # Remove sample directory if it was created during the process
        if removeCreatedSamplesDirectories and not indir_same_outdir:
            utils.removeDirectory(os.path.join(inputDirectory, sample, ''))

        print 'END ' + sample + ' analysis'
        time_taken = utils.runTime(sample_start_time)

        # Save run report
        utils.write_sample_report(samples_report_path, sample, run_successfully, pass_qc,
                                  time_taken, fileSize, run_report)

    # Run report
    print '\n' + 'END INNUca.py'
    print '\n' + str(number_samples_successfully) + ' samples out of ' + str(len(samples)) + ' run successfully'
    print '\n' + str(number_samples_pass) + ' samples out of ' + str(number_samples_successfully) + ' (run successfully) PASS INNUca.py analysis'

    time_taken = utils.runTime(general_start_time)
    del time_taken

    # Check whether INNUca.py run at least one sample successfully
    if number_samples_successfully == 0:
        sys.exit('No samples run successfully!')
def main():
    """Entry point for patho_typing.py.

    Builds the CLI, maps reads against the combined reference, optionally runs
    the trueCoverage_ReMatCh quality checkpoint, then runs ReMatCh on the
    typing targets and calls ``typing.typing`` on the per-gene results.
    Exits non-zero (after cleaning the work folder) on any failed stage.

    NOTE(review): this block was reconstructed from whitespace-mangled source;
    the nesting of the trueCoverage checkpoint section should be confirmed
    against the upstream patho_typing repository.
    """
    parser = argparse.ArgumentParser(prog='patho_typing.py',
                                     description='In silico pathogenic typing directly from raw Illumina reads',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--version', help='Version information', action='version',
                        version='{prog} v{version}'.format(prog=parser.prog, version=__version__))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-f', '--fastq', nargs='+', action=utils.required_length((1, 2), '--fastq'),
                                 type=argparse.FileType('r'), metavar=('/path/to/input/file.fq.gz'),
                                 help='Path to single OR paired-end fastq files. If two files are passed, they will be'
                                      ' assumed as being the paired fastq files',
                                 required=True)
    parser_required.add_argument('-s', '--species', nargs=2, type=str, metavar=('Yersinia', 'enterocolitica'),
                                 help='Species name', required=True)

    parser_optional_general = parser.add_argument_group('General facultative options')
    parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/path/to/output/directory/',
                                         help='Path to the directory where the information will be stored',
                                         required=False, default='.')
    parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N',
                                         help='Number of threads to use', required=False, default=1)
    parser_optional_general.add_argument('--trueCoverage', action='store_true',
                                         help='Assess true coverage before continue typing')
    parser_optional_general.add_argument('--noCheckPoint', action='store_true',
                                         help='Ignore the true coverage checking point')
    parser_optional_general.add_argument('--minGeneCoverage', type=int, metavar='N',
                                         help='Minimum typing percentage of target reference gene sequence covered to'
                                              ' consider a gene to be present (value between [0, 100])',
                                         required=False)
    parser_optional_general.add_argument('--minGeneIdentity', type=int, metavar='N',
                                         help='Minimum typing percentage of identity of reference gene sequence covered'
                                              ' to consider a gene to be present (value between [0, 100]). One INDEL'
                                              ' will be considered as one difference',
                                         required=False)
    parser_optional_general.add_argument('--minGeneDepth', type=int, metavar='N',
                                         help='Minimum typing gene average coverage depth of present positions to'
                                              ' consider a gene to be present (default is 1/3 of average sample'
                                              ' coverage or 15x)',
                                         required=False)
    parser_optional_general.add_argument('--doNotRemoveConsensus', action='store_true',
                                         help='Do not remove ReMatCh consensus sequences')
    parser_optional_general.add_argument('--debug', action='store_true',
                                         help='DeBug Mode: do not remove temporary files')

    args = parser.parse_args()

    # Validate percentage-style options before doing any work.
    if args.minGeneCoverage is not None and (args.minGeneCoverage < 0 or args.minGeneCoverage > 100):
        parser.error('--minGeneCoverage should be a value between [0, 100]')
    if args.minGeneIdentity is not None and (args.minGeneIdentity < 0 or args.minGeneIdentity > 100):
        parser.error('--minGeneIdentity should be a value between [0, 100]')

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)

    script_path = utils.general_information(logfile, __version__, args.outdir, time_str)
    print('\n')

    rematch = include_rematch_dependencies_path()

    # argparse opened the files (FileType('r')); keep only their paths.
    args.fastq = [fastq.name for fastq in args.fastq]

    reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, \
        typing_sequences, typing_headers, typing_rules, typing_config = \
        set_reference(args.species, args.outdir, script_path, args.trueCoverage)

    original_reference_file = str(reference_file)

    confirm_genes_fasta_rules(typing_headers, typing_rules)

    run_successfully, bam_file = mapping_reads(args.fastq, reference_file, args.threads, args.outdir, False, 1)
    if run_successfully:
        rematch_dir = os.path.join(args.outdir, 'rematch', '')
        if not os.path.isdir(rematch_dir):
            os.makedirs(rematch_dir)

        # Optional trueCoverage checkpoint: map-split the bam onto the
        # trueCoverage targets, run ReMatCh on them and bail out if the sample
        # does not meet the configured coverage requirements (unless
        # --noCheckPoint was given).
        if args.trueCoverage:
            if trueCoverage_file is not None:
                trueCoverage_dir = os.path.join(rematch_dir, 'trueCoverage', '')
                if not os.path.isdir(trueCoverage_dir):
                    os.makedirs(trueCoverage_dir)

                print('\n')
                run_successfully, trueCoverage_bam = split_bam(bam_file, trueCoverage_headers,
                                                               trueCoverage_dir, args.threads)
                if run_successfully:
                    run_successfully = indexAlignment(trueCoverage_bam)
                    if run_successfully:
                        reference_file = os.path.join(trueCoverage_dir, 'reference.fasta')
                        write_sequeces(reference_file, trueCoverage_sequences)
                        index_fasta_samtools(reference_file, None, None, True)
                        config = parse_config(trueCoverage_config)
                        runtime, run_successfully, sample_data_general, data_by_gene = \
                            run_rematch.run_rematch(rematch, trueCoverage_dir, reference_file,
                                                    trueCoverage_bam, args.threads,
                                                    config['length_extra_seq'],
                                                    config['minimum_depth_presence'],
                                                    config['minimum_depth_call'],
                                                    config['minimum_depth_frequency_dominant_allele'],
                                                    config['minimum_gene_coverage'],
                                                    config['minimum_gene_identity'],
                                                    args.debug, args.doNotRemoveConsensus)
                        if run_successfully and sample_data_general['mean_sample_coverage'] is not None and \
                                sample_data_general['number_absent_genes'] is not None and \
                                sample_data_general['number_genes_multiple_alleles'] is not None:
                            # Default depth threshold: 1/3 of mean sample
                            # coverage, floored at 15x.
                            if args.minGeneDepth is None:
                                args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \
                                    sample_data_general['mean_sample_coverage'] / 3 > 15 else \
                                    15

                            exit_info = []
                            if sample_data_general['mean_sample_coverage'] < config['minimum_read_coverage']:
                                exit_info.append('Sample coverage ({mean}) lower than the minimum'
                                                 ' required ({minimum})'
                                                 ''.format(mean=sample_data_general['mean_sample_coverage'],
                                                           minimum=config['minimum_read_coverage']))
                            if sample_data_general['number_absent_genes'] > config['maximum_number_absent_genes']:
                                exit_info.append('Number of absent genes ({number}) higher than the'
                                                 ' maximum allowed ({maximum})'
                                                 ''.format(number=sample_data_general['number_absent_genes'],
                                                           maximum=config['maximum_number_absent_genes']))
                            if sample_data_general['number_genes_multiple_alleles'] > \
                                    config['maximum_number_genes_multiple_alleles']:
                                exit_info.append('Number of genes with multiple alleles'
                                                 ' ({number}) higher than the maximum'
                                                 ' allowed ({maximum})'
                                                 ''.format(number=sample_data_general['number_genes_multiple_alleles'],
                                                           maximum=config['maximum_number_genes_multiple_alleles']))

                            if len(exit_info) > 0:
                                print('\n' + '\n'.join(exit_info) + '\n')
                                e = 'TrueCoverage requirements not fulfilled'
                                print('\n' + e + '\n')
                                if not args.noCheckPoint:
                                    clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                                    _ = utils.runTime(start_time)
                                    sys.exit(e)
                        else:
                            e = 'TrueCoverage module did not run successfully'
                            print('\n' + e + '\n')
                            if not args.noCheckPoint:
                                clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                                _ = utils.runTime(start_time)
                                sys.exit(e)

                        # Re-split the bam onto the typing targets only, so the
                        # typing ReMatCh run uses the typing reference.
                        print('\n')
                        typing_dir = os.path.join(rematch_dir, 'typing', '')
                        if not os.path.isdir(typing_dir):
                            os.makedirs(typing_dir)
                        run_successfully, bam_file = split_bam(bam_file, typing_headers, typing_dir, args.threads)
                        if run_successfully:
                            run_successfully = indexAlignment(bam_file)
                            if run_successfully:
                                reference_file = os.path.join(typing_dir, 'reference.fasta')
                                write_sequeces(reference_file, typing_sequences)
                                index_fasta_samtools(reference_file, None, None, True)
                                rematch_dir = str(typing_dir)
                if not run_successfully:
                    if args.noCheckPoint:
                        clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                        _ = utils.runTime(start_time)
                        sys.exit('Something in the required TrueCoverage analysis went wrong')
            else:
                print('\n'
                      'WARNING: it was not found trueCoverage target files. trueCoverage will not run.'
                      '\n')

        if run_successfully:
            # Typing run: apply CLI overrides to the typing config, run ReMatCh
            # and type the sample from the per-gene results.
            config = parse_config(typing_config)
            if args.minGeneCoverage is not None:
                config['minimum_gene_coverage'] = args.minGeneCoverage
            if args.minGeneIdentity is not None:
                config['minimum_gene_identity'] = args.minGeneIdentity

            runtime, run_successfully, sample_data_general, data_by_gene = \
                run_rematch.run_rematch(rematch, rematch_dir, reference_file, bam_file,
                                        args.threads, config['length_extra_seq'],
                                        config['minimum_depth_presence'],
                                        config['minimum_depth_call'],
                                        config['minimum_depth_frequency_dominant_allele'],
                                        config['minimum_gene_coverage'],
                                        config['minimum_gene_identity'],
                                        args.debug, args.doNotRemoveConsensus)
            if run_successfully and data_by_gene is not None:
                # Same depth default as in the trueCoverage checkpoint above.
                if args.minGeneDepth is None:
                    args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \
                        sample_data_general['mean_sample_coverage'] / 3 > 15 else \
                        15
                _, _, _ = typing.typing(data_by_gene, typing_rules, config['minimum_gene_coverage'],
                                        config['minimum_gene_identity'], args.minGeneDepth, args.outdir)
            else:
                clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                _ = utils.runTime(start_time)
                sys.exit('ReMatCh run for pathotyping did not run successfully')
    else:
        clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
        _ = utils.runTime(start_time)
        sys.exit('Something did not run successfully')

    clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)

    print('\n')
    _ = utils.runTime(start_time)
def main():
    """Command-line entry point for rematch.py.

    Builds the argument parser, validates numeric ranges, runs the whole
    ReMatCh workflow via runRematch() and reports how many samples finished
    successfully.  Exits non-zero when validation fails or when no sample
    runs successfully.

    NOTE(review): originally written with Python 2 ``print`` statements;
    single-expression prints are parenthesised here, which is output-identical
    in Python 2 and also valid Python 3 syntax.
    """
    parser = argparse.ArgumentParser(
        prog='rematch.py',
        description='Reads mapping against target sequences, checking mapping and consensus sequences production',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-r', '--reference', type=argparse.FileType('r'),
                                 metavar='/path/to/reference_sequence.fasta',
                                 help='Fasta file containing reference sequences', required=True)

    parser_optional_general = parser.add_argument_group('General facultative options')
    parser_optional_general.add_argument('-w', '--workdir', type=str, metavar='/path/to/workdir/directory/',
                                         help='Path to the directory where ReMatCh will run and produce the outputs with reads (ended with fastq.gz/fq.gz and, in case of PE data, pair-end direction coded as _R1_001 / _R2_001 or _1 / _2) already present (organized in sample folders) or to be downloaded',
                                         required=False, default='.')
    parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use',
                                         required=False, default=1)
    parser_optional_general.add_argument('--doNotUseProvidedSoftware', action='store_true',
                                         help='Tells ReMatCh to not use Bowtie2, Samtools and Bcftools that are provided with it')

    parser_optional_rematch = parser.add_argument_group('ReMatCh module facultative options')
    parser_optional_rematch.add_argument('--conservedSeq', action='store_true',
                                         help='This option can be used with conserved sequences like MLST genes to speedup the analysis by alignning reads using Bowtie2 sensitive algorithm')
    parser_optional_rematch.add_argument('--extraSeq', type=int, metavar='N',
                                         help='Sequence length added to both ends of target sequences (usefull to improve reads mapping to the target one) that will be trimmed in ReMatCh outputs',
                                         required=False, default=0)
    parser_optional_rematch.add_argument('--minCovPresence', type=int, metavar='N',
                                         help='Reference position minimum coverage depth to consider the position to be present in the sample',
                                         required=False, default=5)
    parser_optional_rematch.add_argument('--minCovCall', type=int, metavar='N',
                                         help='Reference position minimum coverage depth to perform a base call. Lower coverage will be coded as N',
                                         required=False, default=10)
    parser_optional_rematch.add_argument('--minFrequencyDominantAllele', type=float, metavar='0.6',
                                         help='Minimum relative frequency of the dominant allele coverage depth (value between [0, 1]). Positions with lower values will be considered as having multiple alleles (and will be coded as N)',
                                         required=False, default=0.6)
    parser_optional_rematch.add_argument('--minGeneCoverage', type=int, metavar='N',
                                         help='Minimum percentage of target reference gene sequence covered by --minCovPresence to consider a gene to be present (value between [0, 100])',
                                         required=False, default=80)
    parser_optional_rematch.add_argument('--minGeneIdentity', type=int, metavar='N',
                                         help='Minimum percentage of identity of reference gene sequence covered by --minCovCall to consider a gene to be present (value between [0, 100]). One INDEL will be considered as one difference',
                                         required=False, default=70)
    parser_optional_rematch.add_argument('--numMapLoc', type=int, metavar='N',
                                         help='Maximum number of locations to which a read can map (sometimes useful when mapping against similar sequences)',
                                         required=False, default=1)
    parser_optional_rematch.add_argument('--doubleRun', action='store_true',
                                         help='Tells ReMatCh to run a second time using as reference the noMatter consensus sequence produced in the first run. This will improve consensus sequence determination for sequences with high percentage of target reference gene sequence covered')
    parser_optional_rematch.add_argument('--debug', action='store_true',
                                         help='DeBug Mode: do not remove temporary files')

    parser_optional_download = parser.add_argument_group('Download facultative options')
    parser_optional_download.add_argument('-a', '--asperaKey', type=argparse.FileType('r'),
                                          metavar='/path/to/asperaweb_id_dsa.openssh',
                                          help='Tells ReMatCh to download fastq files from ENA using Aspera Connect. With this option, the path to Private-key file asperaweb_id_dsa.openssh must be provided (normaly found in ~/.aspera/connect/etc/asperaweb_id_dsa.openssh).',
                                          required=False)
    parser_optional_download.add_argument('-k', '--keepDownloadedFastq', action='store_true',
                                          help='Tells ReMatCh to keep the fastq files downloaded')
    parser_optional_download.add_argument('--downloadLibrariesType', type=str, metavar='PAIRED',
                                          help='Tells ReMatCh to download files with specific library layout',
                                          choices=['PAIRED', 'SINGLE', 'BOTH'], required=False, default='BOTH')
    parser_optional_download.add_argument('--downloadInstrumentPlatform', type=str, metavar='ILLUMINA',
                                          help='Tells ReMatCh to download files with specific library layout',
                                          choices=['ILLUMINA', 'ALL'], required=False, default='ILLUMINA')
    parser_optional_download.add_argument('--downloadCramBam', action='store_true',
                                          help='Tells ReMatCh to also download cram/bam files and convert them to fastq files')

    # -l and -t are alternative ways of selecting samples to download.
    parser_optional_download_exclusive = parser.add_mutually_exclusive_group()
    parser_optional_download_exclusive.add_argument('-l', '--listIDs', type=argparse.FileType('r'),
                                                    metavar='/path/to/list_IDs.txt',
                                                    help='Path to list containing the IDs to be downloaded (one per line)',
                                                    required=False)
    parser_optional_download_exclusive.add_argument('-t', '--taxon', type=str,
                                                    metavar='"Streptococcus agalactiae"',
                                                    help='Taxon name for which ReMatCh will download fastq files',
                                                    required=False)

    args = parser.parse_args()

    # Range validation for options whose help text documents a closed interval.
    if args.minFrequencyDominantAllele < 0 or args.minFrequencyDominantAllele > 1:
        parser.error('--minFrequencyDominantAllele should be a value between [0, 1]')
    if args.minGeneCoverage < 0 or args.minGeneCoverage > 100:
        parser.error('--minGeneCoverage should be a value between [0, 100]')
    # Fix: --minGeneIdentity advertises a [0, 100] range but was never checked.
    if args.minGeneIdentity < 0 or args.minGeneIdentity > 100:
        parser.error('--minGeneIdentity should be a value between [0, 100]')

    start_time = time.time()

    number_samples_successfully, samples_total_number = runRematch(args)

    print('\n' + 'END ReMatCh')
    print('\n' + str(number_samples_successfully) + ' samples out of ' + str(samples_total_number) +
          ' run successfully')
    _ = utils.runTime(start_time)

    if number_samples_successfully == 0:
        sys.exit('No samples run successfully!')
def runRematch(args):
    """Run the complete ReMatCh workflow for every sample in the work directory.

    For each sample ID: optionally download its fastq files from ENA, run the
    ReMatCh mapping module (and optionally a second pass against the first-run
    consensus), write per-gene data, clean up downloaded reads and write a
    per-sample report.

    args -- argparse.Namespace produced by the rematch.py parser (workdir,
            reference, download and ReMatCh-module options).
    Returns (number_samples_successfully, total_number_of_samples).

    NOTE(review): Python 2 syntax (print statements); run with the interpreter
    the original script targets.
    """
    workdir = os.path.abspath(args.workdir)
    if not os.path.isdir(workdir):
        os.makedirs(workdir)

    # Aspera private key is optional; None disables Aspera downloads.
    asperaKey = os.path.abspath(args.asperaKey.name) if args.asperaKey is not None else None

    # Start logger
    logfile, time_str = utils.start_logger(workdir)

    # Get general information
    utils.general_information(logfile, version, workdir, time_str, args.doNotUseProvidedSoftware, asperaKey,
                              args.downloadCramBam)

    # Set listIDs
    # searched_fastq_files is True when the IDs come from fastq files already
    # present in workdir (listIDs then maps sample -> local fastq paths).
    listIDs, searched_fastq_files = getListIDs(workdir, args.listIDs.name if args.listIDs is not None else None,
                                               args.taxon)

    # Run ReMatCh for each sample
    print '\n' + 'STARTING ReMatCh' + '\n'

    # Clean sequences headers
    reference_file, gene_list_reference = clean_headers_reference_file(os.path.abspath(args.reference.name), workdir,
                                                                       args.extraSeq)

    if len(gene_list_reference) == 0:
        sys.exit('No sequences left')

    # To use in combined report
    number_samples_successfully = 0
    for sample in listIDs:
        sample_start_time = time.time()
        print '\n\n' + 'Sample ID: ' + sample

        # Create sample outdir
        sample_outdir = os.path.join(workdir, sample, '')
        if not os.path.isdir(sample_outdir):
            os.mkdir(sample_outdir)

        # run_successfully_fastq stays None when reads were found locally
        # (i.e. no download was attempted); only an explicit False marks a
        # failed download below.
        run_successfully_fastq = None
        time_taken_fastq = 0
        sequencingInformation = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
                                 'library_layout': None, 'library_source': None, 'extra_run_accession': None,
                                 'date_download': None}
        if not searched_fastq_files:
            # Download Files
            time_taken_fastq, run_successfully_fastq, fastq_files, sequencingInformation = download.runDownload(
                sample, args.downloadLibrariesType, asperaKey, sample_outdir, args.downloadCramBam, args.threads,
                args.downloadInstrumentPlatform)
        else:
            fastq_files = listIDs[sample]

        fileSize = None

        run_successfully_rematch_first = None
        run_successfully_rematch_second = None
        time_taken_rematch_first = 0
        time_taken_rematch_second = 0
        if run_successfully_fastq is not False:
            fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files)
            # Run ReMatCh
            time_taken_rematch_first, run_successfully_rematch_first, data_by_gene, sample_data_general_first, \
                consensus_files = rematch_module.runRematchModule(sample, fastq_files, reference_file, args.threads,
                                                                  sample_outdir, args.extraSeq, args.minCovPresence,
                                                                  args.minCovCall, args.minFrequencyDominantAllele,
                                                                  args.minGeneCoverage, args.conservedSeq, args.debug,
                                                                  args.numMapLoc, args.minGeneIdentity)
            if run_successfully_rematch_first:
                write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, workdir, time_str,
                                   'first_run', args.minGeneIdentity)
                if args.doubleRun:
                    # Second pass: remap the reads against the first-run
                    # noMatter consensus (with extra flanking sequence).
                    rematch_second_outdir = os.path.join(sample_outdir, 'rematch_second_run', '')
                    if not os.path.isdir(rematch_second_outdir):
                        os.mkdir(rematch_second_outdir)
                    consensus_concatenated_fasta, consensus_concatenated_gene_list = concatenate_extraSeq_2_consensus(
                        consensus_files['noMatter'], reference_file, args.extraSeq, rematch_second_outdir)
                    if len(consensus_concatenated_gene_list) > 0:
                        time_taken_rematch_second, run_successfully_rematch_second, data_by_gene, \
                            sample_data_general_second, consensus_files = rematch_module.runRematchModule(
                                sample, fastq_files, consensus_concatenated_fasta, args.threads,
                                rematch_second_outdir, args.extraSeq, args.minCovPresence, args.minCovCall,
                                args.minFrequencyDominantAllele, args.minGeneCoverage, args.conservedSeq, args.debug,
                                args.numMapLoc, args.minGeneIdentity)
                        if not args.debug:
                            os.remove(consensus_concatenated_fasta)
                        if run_successfully_rematch_second:
                            write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene,
                                               workdir, time_str, 'second_run', args.minGeneIdentity)
                    else:
                        print 'No sequences left after ReMatCh module first run. Second run will not be performed'

        # Remove downloaded reads unless the user asked to keep them.
        if not searched_fastq_files and not args.keepDownloadedFastq and fastq_files is not None:
            for fastq in fastq_files:
                if os.path.isfile(fastq):
                    os.remove(fastq)

        time_taken = utils.runTime(sample_start_time)

        # Placeholder dicts keep the report columns aligned when a run did not
        # complete (or did not happen at all).
        write_sample_report(sample, workdir, time_str, fileSize, run_successfully_fastq,
                            run_successfully_rematch_first, run_successfully_rematch_second, time_taken_fastq,
                            time_taken_rematch_first, time_taken_rematch_second, time_taken, sequencingInformation,
                            sample_data_general_first if run_successfully_rematch_first else
                            {'number_absent_genes': None, 'number_genes_multiple_alleles': None,
                             'mean_sample_coverage': None},
                            sample_data_general_second if run_successfully_rematch_second else
                            {'number_absent_genes': None, 'number_genes_multiple_alleles': None,
                             'mean_sample_coverage': None},
                            fastq_files if fastq_files is not None else '')

        # A sample counts as successful when no stage explicitly failed
        # (None means the stage was skipped, which is not a failure).
        if all([run_successfully_fastq is not False, run_successfully_rematch_first is not False,
                run_successfully_rematch_second is not False]):
            number_samples_successfully += 1

    return number_samples_successfully, len(listIDs)
def main():
    """Entry point for ecoli_stx_subtyping.py.

    Extends the shared seq_typing argument parsers with stx2-specific options,
    validates option ranges, runs the selected typing function (reads or
    assembly), parses the seq_typing report into stx1/stx2 subtype calls and
    rewrites the output files with an ecoli_stx_subtyping stamp.

    Exits via argparse error when validation fails; requires Python 3.
    """
    program_name = 'ecoli_stx_subtyping.py'

    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 {}"'.format(program_name))

    # Shared parser factory also returns reads/assembly subparsers so the
    # stx2-specific options can be attached to both.
    parser, parser_reads, _, parser_assembly, _ = python_arguments(program_name=program_name, version=version)
    parser.description = 'Gets E. coli stx subtypes'

    # Add specific arguments
    parser_reads.add_argument('--stx2covered', type=float, metavar='N',
                              help='Minimal percentage of sequence covered to consider extra stx2'
                                   ' subtypes (value between [0, 100]) (default: 100)',
                              required=False, default=100)
    parser_reads.add_argument('--stx2identity', type=float, metavar='N',
                              help='Minimal sequence identity to consider extra stx2'
                                   ' subtypes (value between [0, 100]) (default: 99.5)',
                              required=False, default=99.5)
    parser_assembly.add_argument('--stx2covered', type=float, metavar='N',
                                 help='Minimal percentage of sequence covered to consider extra stx2'
                                      ' subtypes (value between [0, 100]) (default: 100)',
                                 required=False, default=100)
    parser_assembly.add_argument('--stx2identity', type=float, metavar='N',
                                 help='Minimal sequence identity to consider extra stx2'
                                      ' subtypes (value between [0, 100]) (default: 99.5)',
                                 required=False, default=99.5)

    args = parser.parse_args()

    # Collect all range violations so the user sees them in one error message.
    msg = []
    if args.minGeneCoverage < 0 or args.minGeneCoverage > 100:
        msg.append('--minGeneCoverage should be a value between [0, 100]')
    if args.minGeneIdentity < 0 or args.minGeneIdentity > 100:
        msg.append('--minGeneIdentity should be a value between [0, 100]')
    if args.stx2covered < 0 or args.stx2covered > 100:
        msg.append('--stx2covered should be a value between [0, 100]')
    if args.stx2identity < 0 or args.stx2identity > 100:
        msg.append('--stx2identity should be a value between [0, 100]')
    if args.org != ['stx', 'subtyping']:
        msg.append('Use "--org stx subtyping" with {}'.format(program_name))

    if len(msg) > 0:
        # A throwaway parser is used only for its error() exit behaviour.
        argparse.ArgumentParser(prog='{} options'.format(program_name)).error('\n'.join(msg))

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)

    _ = utils.general_information(script_name=program_name, logfile=logfile, version=version, outdir=args.outdir,
                                  time_str=time_str)
    print('\n')

    folders_2_remove = []

    # Create modules pickles folder
    pickles_folder = os.path.join(args.outdir, 'pickles', '')
    if not os.path.isdir(pickles_folder):
        os.makedirs(pickles_folder)
    folders_2_remove.append(pickles_folder)

    # Run functions
    # args.func is the reads- or assembly-specific workflow bound by the
    # chosen subcommand.
    folders_2_remove_func, references_results, reference, references_headers = args.func(args)
    folders_2_remove.extend(folders_2_remove_func)

    # Parse results
    _, _, _, _, _ = parse_results.parse_results(references_results, reference, references_headers, args.outdir,
                                                args.minGeneCoverage, args.minDepthCoverage, args.typeSeparator)

    # Reference files are matched by 'stx1'/'stx2' in their basename;
    # [0] assumes exactly one of each is present in the reference list.
    stx1_result, stx2_result = stx_subtype_parser(os.path.join(args.outdir, 'seq_typing.report_types.tab'),
                                                  [ref_file for ref_file in reference
                                                   if 'stx1' in os.path.basename(ref_file).lower()][0],
                                                  [ref_file for ref_file in reference
                                                   if 'stx2' in os.path.basename(ref_file).lower()][0],
                                                  args.stx2covered, args.stx2identity)

    # Rename the file to keep ecoli_stx_subtyping stamp
    if os.path.isfile(os.path.join(args.outdir, 'seq_typing.report_types.tab')):
        os.rename(os.path.join(args.outdir, 'seq_typing.report_types.tab'),
                  os.path.join(args.outdir, 'seq_typing.ecoli_stx_subtyping.report_types.tab'))

    # Remove the file to only keep the ecoli_stx_subtyping one
    if os.path.isfile(os.path.join(args.outdir, 'seq_typing.report.txt')):
        os.remove(os.path.join(args.outdir, 'seq_typing.report.txt'))

    print('\n'
          'E. coli stx_subtyping - {stx1_result}:{stx2_result}\n'
          '\n'.format(stx1_result=stx1_result, stx2_result=stx2_result))
    with open(os.path.join(args.outdir, 'seq_typing.ecoli_stx_subtyping.txt'), 'wt') as writer:
        writer.write(':'.join([stx1_result, stx2_result]))

    if not args.debug:
        for folder in folders_2_remove:
            utils.removeDirectory(folder)

    _ = utils.runTime(start_time)
def main():
    """Entry point for get_stx_db.py.

    Clones virulencefinder_db, extracts the stx sequences from
    virulence_ecoli.fsa, merges the stx2 a/c/d subtypes into 'stx2acd',
    resolves IUPAC ambiguity codes to one concrete sequence, and writes one
    seq_typing FASTA per stx gene plus a report of sequences that could not
    be resolved.  Requires Python 3.
    """
    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 get_stx_db.py"')

    parser = argparse.ArgumentParser(
        prog='get_stx_db.py',
        description='Gets STX sequences from virulencefinder_db to produce a STX subtyping DB',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information', action='version',
                        version=str('%(prog)s v' + version))

    parser_optional_general = parser.add_argument_group('General facultative options')
    parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/path/to/output/directory/',
                                         help='Path to the directory where the sequences will be stored',
                                         required=False, default='.')

    args = parser.parse_args()

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Get virulencefinder_db
    url = 'https://bitbucket.org/genomicepidemiology/virulencefinder_db.git'
    virulencefinder_db = os.path.join(args.outdir, 'virulence_db', '')
    run_successfully, _, _ = utils.runCommandPopenCommunicate(['git', 'clone', url, virulencefinder_db],
                                                              False, None, True)
    # Fix: the clone result was previously ignored; fail fast instead of
    # crashing later while parsing a missing fasta file.
    if not run_successfully:
        sys.exit('It was not possible to clone virulencefinder_db from {url}'.format(url=url))

    # Abbreviated commit hash used to stamp the output file names.
    # Fix: --pretty=format:"%h" put literal quote characters into the hash
    # (and thus into the file names) when run without a shell; format:%h is
    # correct in both execution modes.  strip() guards against stray
    # whitespace/quotes either way.
    _, commit, _ = utils.runCommandPopenCommunicate(['git', '-C', virulencefinder_db, 'log',
                                                     '--pretty=format:%h', '-n', '1'], True, 15, True)
    commit = commit.strip().strip('"')

    # Get STX sequences, grouped by gene ('stx1' / 'stx2').
    stx_seq = {}
    allowed_chars = set(Seq.IUPAC.IUPACData.unambiguous_dna_letters)  # 'ACGT'
    problematic_report = os.path.join(
        args.outdir,
        'virulence_db.virulence_ecoli.commit_{commit}.problematic_sequences.tab'.format(commit=commit))
    with open(problematic_report, 'wt', newline='\n') as writer:
        for seq in SeqIO.parse(os.path.join(virulencefinder_db, 'virulence_ecoli.fsa'), 'fasta'):
            if not seq.id.lower().startswith('stx'):
                continue
            subtype = seq.id.split(':')
            # Only headers with the expected 4 colon-separated fields carry a
            # parseable subtype.
            if len(subtype) != 4:
                continue
            if seq.id[:4] not in stx_seq:
                stx_seq[seq.id[:4]] = []
            # Jani: stx2a, stx2c and stx2d are combined into one subtype
            # ('stx2acd') because they are the subtypes most prone to cause
            # HUS and cannot be separated from each other by the methods
            # currently in use.
            if subtype[0][:4] == 'stx2' and subtype[3] in ['a', 'c', 'd']:
                subtype[3] = 'acd'
            subtype = subtype[0][:4] + subtype[3]  # Define subtype

            seq.description = ''  # To avoid description to be print in outfile
            # For sequences with IUPAC codes, use one possible sequence based on the one with the codes
            if not set(seq.seq.upper()).issubset(allowed_chars):
                all_possible_sequences = extend_ambiguous_dna(seq.seq.upper())
                if all_possible_sequences is None:
                    # Fix: each report entry is now newline-terminated;
                    # previously all entries ran together on a single line.
                    writer.write('\t'.join([seq.id, 'Memory Error (too much IUPAC codes)']) + '\n')
                    continue
                # Change the sequence to the first concrete expansion.
                seq = SeqRecord(Seq.Seq(all_possible_sequences[0], generic_dna),
                                id='{seq_name}:IUPAC_codes_removed'.format(seq_name=seq.id),
                                description='')
            seq.id = '{seq_name}:seqTyping_{subtype}'.format(seq_name=seq.id, subtype=subtype)
            stx_seq[seq.id[:4]].append(seq)

    # Write one seq_typing FASTA per stx gene.
    for gene, seqs in stx_seq.items():
        with open(os.path.join(
                args.outdir,
                'virulence_db.virulence_ecoli.commit_{commit}.{gene}_subtyping.seq_typing.fasta'.format(
                    commit=commit, gene=gene)), 'wt', newline='\n') as writer:
            _ = SeqIO.write(seqs, writer, "fasta")

    _ = utils.runTime(start_time)