def parse_arguments(args):
    """Parse the command-line arguments for kneaddata.

    Args:
        args: unused here; argparse reads sys.argv itself via parse_args().

    Returns:
        argparse.Namespace holding all parsed option values.
    """
    parser = argparse.ArgumentParser(
        description="KneadData\n",
        formatter_class=argparse.RawTextHelpFormatter,
        prog="kneaddata")

    # --- global options ---
    group1 = parser.add_argument_group("global options")
    group1.add_argument(
        "--version",
        action="version",
        version="%(prog)s v" + VERSION)
    # NOTE: previously added to the bare parser; moved into group1 so that
    # --verbose is listed under "global options" in --help like every other
    # global flag. Parsing behavior is unchanged.
    group1.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="additional output is printed\n")
    group1.add_argument(
        "-i1", "--input1",
        help="Pair 1 input FASTQ file",
        dest='input1')
    group1.add_argument(
        "-i2", "--input2",
        help="Pair 2 input FASTQ file",
        dest='input2')
    # typo fix: help text previously read "unparied"
    group1.add_argument(
        "-un", "--unpaired",
        help="unpaired input FASTQ file",
        dest='unpaired')
    group1.add_argument(
        "-o", "--output",
        dest='output_dir',
        help="directory to write output files",
        required=True)
    # action="append" lets the user pass -db multiple times
    group1.add_argument(
        "-db", "--reference-db",
        default=[],
        action="append",
        help="location of reference database (additional arguments add databases)")
    group1.add_argument(
        "--bypass-trim",
        action="store_true",
        help="bypass the trim step")
    group1.add_argument(
        "--output-prefix",
        help="prefix for all output files\n[ DEFAULT : $SAMPLE_kneaddata ]")
    group1.add_argument(
        "-t", "--threads",
        type=int,
        default=config.threads,
        metavar="<" + str(config.threads) + ">",
        help="number of threads\n[ Default : " + str(config.threads) + " ]")
    group1.add_argument(
        "-p", "--processes",
        type=int,
        default=config.processes,
        metavar="<" + str(config.processes) + ">",
        help="number of processes\n[ Default : " + str(config.processes) + " ]")
    group1.add_argument(
        "-q", "--quality-scores",
        default=config.quality_scores,
        choices=config.quality_scores_options,
        dest='trimmomatic_quality_scores',
        help="quality scores\n[ DEFAULT : " + config.quality_scores + " ]")
    group1.add_argument(
        "--run-bmtagger",
        default=False,
        action="store_true",
        dest='bmtagger',
        help="run BMTagger instead of Bowtie2 to identify contaminant reads")
    group1.add_argument(
        "--bypass-trf",
        action="store_true",
        help="option to bypass the removal of tandem repeats")
    group1.add_argument(
        "--run-trf",
        action="store_true",
        help="legacy option to run the removal of tandem repeats (now run by default)")
    group1.add_argument(
        "--run-fastqc-start",
        default=False,
        dest='fastqc_start',
        action="store_true",
        help="run fastqc at the beginning of the workflow")
    group1.add_argument(
        "--run-fastqc-end",
        default=False,
        dest='fastqc_end',
        action="store_true",
        help="run fastqc at the end of the workflow")
    group1.add_argument(
        "--store-temp-output",
        action="store_true",
        help="store temp output files\n[ DEFAULT : temp output files are removed ]")
    group1.add_argument(
        "--remove-intermediate-output",
        action="store_true",
        help="remove intermediate output files\n[ DEFAULT : intermediate output files are stored ]")
    group1.add_argument(
        "--cat-final-output",
        action="store_true",
        help="concatenate all final output files\n[ DEFAULT : final output is not concatenated ]")
    group1.add_argument(
        "--log-level",
        default=config.log_level,
        choices=config.log_level_choices,
        help="level of log messages\n[ DEFAULT : " + config.log_level + " ]")
    group1.add_argument(
        "--log",
        help="log file\n[ DEFAULT : $OUTPUT_DIR/$SAMPLE_kneaddata.log ]")

    # --- trimmomatic arguments ---
    group2 = parser.add_argument_group("trimmomatic arguments")
    group2.add_argument(
        "--trimmomatic",
        dest='trimmomatic_path',
        help="path to trimmomatic\n[ DEFAULT : $PATH ]")
    group2.add_argument(
        "--run-trim-repetitive",
        default=False,
        dest='run_trim_repetitive',
        action="store_true",
        help="Trim fastqc generated overrepresented sequences\n")
    group2.add_argument(
        "--max-memory",
        default=config.trimmomatic_memory,
        help="max amount of memory\n[ DEFAULT : " + config.trimmomatic_memory + " ]")
    group2.add_argument(
        "--trimmomatic-options",
        action="append",
        help="options for trimmomatic\n[ DEFAULT : "+" ".join(utilities.get_default_trimmomatic_options())+" ]\n"+\
        "MINLEN is set to "+str(config.trimmomatic_min_len_percent)+" percent of total input read length. The user can alternatively specify a length (in bases) for MINLEN.")
    group2.add_argument(
        "--sequencer-source",
        dest='sequencer_source',
        default=config.trimmomatic_provided_sequencer_default,
        choices=config.trimmomatic_provided_sequencer_source,
        help="options for sequencer-source\n[ DEFAULT : " + config.trimmomatic_provided_sequencer_default + "]")

    # --- bowtie2 arguments ---
    group3 = parser.add_argument_group("bowtie2 arguments")
    group3.add_argument(
        "--bowtie2",
        dest='bowtie2_path',
        help="path to bowtie2\n[ DEFAULT : $PATH ]")
    group3.add_argument(
        "--bowtie2-options",
        action="append",
        help="options for bowtie2\n[ DEFAULT : " + " ".join(config.bowtie2_options) + " ]")
    group3.add_argument(
        "--decontaminate-pairs",
        choices=["strict","lenient","unpaired"],
        default="strict",
        help="options for filtering of paired end reads (strict='remove both R1+R2 if either align', lenient='remove only if both R1+R2 align', unpaired='ignore pairing and remove as single end')\n"+\
        "[ DEFAULT : %(default)s ]")
    group3.add_argument(
        "--reorder",
        action="store_true",
        help="order the sequences in the same order as the input\n[ DEFAULT : Sequences are not ordered ]")
    group3.add_argument(
        "--serial",
        action="store_true",
        help="filter the input in serial for multiple databases so a subset of reads are processed in each database search")

    # --- bmtagger arguments ---
    group4 = parser.add_argument_group("bmtagger arguments")
    group4.add_argument(
        "--bmtagger",
        dest='bmtagger_path',
        help="path to BMTagger\n[ DEFAULT : $PATH ]")

    # --- trf (tandem repeat finder) arguments ---
    group5 = parser.add_argument_group("trf arguments")
    group5.add_argument(
        "--trf",
        dest='trf_path',
        help="path to TRF\n[ DEFAULT : $PATH ]")
    group5.add_argument(
        "--match",
        type=int,
        default=config.trf_match,
        help="matching weight\n[ DEFAULT : " + str(config.trf_match) + " ]")
    group5.add_argument(
        "--mismatch",
        type=int,
        default=config.trf_mismatch,
        help="mismatching penalty\n[ DEFAULT : " + str(config.trf_mismatch) + " ]")
    group5.add_argument(
        "--delta",
        type=int,
        default=config.trf_delta,
        help="indel penalty\n[ DEFAULT : " + str(config.trf_delta) + " ]")
    group5.add_argument(
        "--pm",
        type=int,
        default=config.trf_match_probability,
        help="match probability\n[ DEFAULT : " + str(config.trf_match_probability) + " ]")
    group5.add_argument(
        "--pi",
        type=int,
        default=config.trf_pi,
        help="indel probability\n[ DEFAULT : " + str(config.trf_pi) + " ]")
    group5.add_argument(
        "--minscore",
        type=int,
        default=config.trf_minscore,
        help="minimum alignment score to report\n[ DEFAULT : " + str(config.trf_minscore) + " ]")
    group5.add_argument(
        "--maxperiod",
        type=int,
        default=config.trf_maxperiod,
        help="maximum period size to report\n[ DEFAULT : " + str(config.trf_maxperiod) + " ]")

    # --- fastqc arguments ---
    group6 = parser.add_argument_group("fastqc arguments")
    group6.add_argument(
        "--fastqc",
        dest='fastqc_path',
        help="path to fastqc\n[ DEFAULT : $PATH ]")

    return parser.parse_args()
def main():
    """Run the KneadData workflow end to end.

    Pipeline stages visible below: argument parsing/configuration, input
    normalization (decompress, bam->sam->fastq, header reformatting, read
    reordering), optional FastQC, optional Trimmomatic trimming, optional
    TRF tandem-repeat removal, optional reference-database decontamination,
    optional concatenation of final outputs, and temp-file cleanup.
    """
    # Parse the arguments from the user
    args = parse_arguments(sys.argv)
    # Update the configuration
    # NOTE(review): update_configuration is defined elsewhere; later code uses
    # args.input, args.remove_temp_output and original_input_files, none of
    # which are set by parse_arguments — presumably established there. Verify.
    args = update_configuration(args)
    # set the prefix for the output files
    full_path_output_prefix = os.path.join(args.output_dir, args.output_prefix)
    # Start logging
    setup_logging(args)
    # Files created during normalization that may be deleted at the end
    # (unless --store-temp-output is set).
    temp_output_files = []
    # Check for compressed files, bam files, or sam files
    # Each helper replaces the input path with the converted file's path and
    # records any intermediate file in temp_output_files.
    for index in range(len(args.input)):
        args.input[index] = utilities.get_decompressed_file(
            args.input[index], args.output_dir, temp_output_files, args.input)
        args.input[index] = utilities.get_sam_from_bam_file(
            args.input[index], args.output_dir, temp_output_files, args.input)
        args.input[index] = utilities.get_fastq_from_sam_file(
            args.input[index], args.output_dir, temp_output_files, args.input)
    # Get the format of the first input file
    file_format = utilities.get_file_format(args.input[0])
    if file_format != "fastq":
        message = "Your input file is of type: " + file_format + ". Please provide an input file of fastq format."
        logger.critical(message)
        sys.exit(message)
    # if this is the new illumina identifier format, create temp files after reformatting the headers
    for index in range(len(args.input)):
        args.input[index] = utilities.get_reformatted_identifiers(
            args.input[index], index, args.output_dir, temp_output_files, args.input)
    # check for reads that are not ordered and order if needed (if trimmomatic is run)
    # Only meaningful for paired-end input (exactly two input files).
    if not args.bypass_trim and len(args.input) == 2:
        args.input = utilities.check_and_reorder_reads(args.input,
                                                       args.output_dir,
                                                       temp_output_files)
    # remove any temp files from decompress/reformat that are no longer needed
    utilities.update_temp_output_files(temp_output_files, [], args.input)
    # set trimmomatic options
    # this is done after the decompression and conversions from sam/bam
    # as the default requires the read length from the input sequences
    if args.trimmomatic_options:
        # parse the options from the user into an array of options
        args.trimmomatic_options = utilities.format_options_to_list(
            args.trimmomatic_options)
    else:
        # if trimmomatic options not set by user, then set to default options
        # use read length of input file for minlen
        args.trimmomatic_options = utilities.get_default_trimmomatic_options(
            utilities.get_read_length_fastq(args.input[0]),
            path=config.trimmomatic_adapter_folder,
            type="PE" if len(args.input) == 2 else "SE",
            sequencer_source=args.sequencer_source)
    # Get the number of reads initially
    utilities.log_read_count_for_files(args.input, "raw",
                                       "Initial number of reads",
                                       args.verbose)
    # Run fastqc if set to run at start of workflow
    # Also run when --run-trim-repetitive is set, since the FastQC report
    # files built below feed get_updated_trimmomatic_parameters.
    if args.fastqc_start or args.run_trim_repetitive:
        # NOTE(review): original_input_files is not defined in this function;
        # presumably a module-level name or set by update_configuration — confirm.
        run.fastqc(args.fastqc_path, args.output_dir, original_input_files,
                   args.threads, args.verbose)
        #Setting fastqc output zip and txt file path
        output_txt_files = []
        for input_file_name in original_input_files:
            # Strip extension(s); fastq/fq inputs may be double-suffixed (e.g. .fastq.gz)
            temp_file = os.path.splitext(input_file_name)[0]
            if (temp_file.count('fastq') > 0 or temp_file.count('fq') > 0):
                temp_file = os.path.splitext(temp_file)[0]
            output_txt_files.append(args.output_dir + "/fastqc/" +
                                    temp_file.split('/')[-1] +
                                    "_fastqc/fastqc_data.txt")
    if not args.bypass_trim:
        if args.run_trim_repetitive:
            # Get the Min Overrepresented Seq Length
            args.trimmomatic_options = utilities.get_updated_trimmomatic_parameters(
                output_txt_files, args.output_dir, args.trimmomatic_options)
        trimmomatic_output_files = run.trim(
            args.input, full_path_output_prefix, args.trimmomatic_path,
            args.trimmomatic_quality_scores, args.max_memory,
            args.trimmomatic_options, args.threads, args.verbose)
    else:
        message = "Bypass trimming"
        logger.info(message)
        print(message)
        # Wrapped in a list to match run.trim's list-of-lists output shape;
        # resolve_sublists flattens it later.
        trimmomatic_output_files = [args.input]
    # Get the number of reads after trimming
    utilities.log_read_count_for_files(trimmomatic_output_files, "trimmed",
                                       "Total reads after trimming",
                                       args.verbose)
    # run TRF, if set
    if not args.bypass_trf:
        # run trf on all output files
        trf_output_files = run.tandem(
            trimmomatic_output_files, full_path_output_prefix, args.match,
            args.mismatch, args.delta, args.pm, args.pi, args.minscore,
            args.maxperiod, args.trf_path, args.processes, args.verbose,
            args.remove_temp_output, args.threads)
        # remove the alignment files, if intermediate output files should be removed
        if args.reference_db and args.remove_intermediate_output:
            temp_output_files += utilities.resolve_sublists(
                trimmomatic_output_files)
    else:
        trf_output_files = trimmomatic_output_files
    # If a reference database is not provided, then bypass decontamination step
    if not args.reference_db:
        message = "Bypass decontamination"
        logger.info(message)
        print(message)
        # resolve sub-lists if present
        final_output_files = trf_output_files
    else:
        final_output_files = run.decontaminate(args, full_path_output_prefix,
                                               trf_output_files)
        # remove trimmed output files, if set to remove intermediate output
        if not args.bypass_trim and args.remove_intermediate_output:
            temp_output_files += utilities.resolve_sublists(trf_output_files)
    # If set, concat the final output files if there is more than one
    final_output_files = utilities.resolve_sublists(final_output_files)
    if args.cat_final_output and len(final_output_files) > 1:
        cat_output_file = full_path_output_prefix + config.fastq_file_extension
        utilities.cat_files(final_output_files, cat_output_file)
        # if removing intermediate output, then remove the files that were merged
        if args.remove_intermediate_output:
            temp_output_files += final_output_files
            final_output_files = [cat_output_file]
        else:
            final_output_files.append(cat_output_file)
    # Remove any temp output files, if set
    if not args.store_temp_output:
        for file in temp_output_files:
            utilities.remove_file(file)
    # Run fastqc if set to run at end of workflow
    if args.fastqc_end:
        run.fastqc(args.fastqc_path, args.output_dir, final_output_files,
                   args.threads, args.verbose)
    # Report the final output file paths to both the log and stdout.
    if len(final_output_files) > 1:
        message = "\nFinal output files created: \n"
    else:
        message = "\nFinal output file created: \n"
    message = message + "\n".join(final_output_files) + "\n"
    logger.info(message)
    print(message)