# Create the workflow instance; all command-line arguments are registered on it.
workflow = Workflow(version="0.1", description="A workflow to run strainphlan")

# add the custom arguments to the workflow
workflow_config = config.ShotGun()
workflow.add_argument(
    "input-extension",
    desc="the input file extension",
    default="fastq.gz",
    choices=["fastq.gz", "fastq", "fq.gz", "fq", "fasta", "fasta.gz"],
)
workflow.add_argument(
    "threads",
    desc="number of threads/cores for each task to use",
    default=1,
)
workflow.add_argument(
    "bypass-taxonomic-profiling",
    desc="do not run the taxonomic profiling tasks (a tsv profile for each sequence file must be included in the input folder using the same sample name)",
    action="store_true",
)
workflow.add_argument(
    "strain-profiling-options",
    desc="additional options when running the strain profiling step",
    default="",
)
workflow.add_argument(
    "max-strains",
    desc="the max number of strains to profile",
    default=20,
    type=int,
)

# get the arguments from the command line
args = workflow.parse_args()

# get all input files with the input extension provided on the command line
# return an error if no files are found
input_files = utilities.find_files(
    args.input, extension=args.input_extension, exit_if_not_found=True)

### STEP #1: Run taxonomic profiling on all of the filtered files ###
if not args.bypass_taxonomic_profiling:
    merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(
        workflow, input_files, args.output, args.threads, args.input_extension)
else:
    # Bypass mode: one existing tsv profile per sample is required instead of
    # running the profiler on the sequence files.
    sample_names = utilities.sample_names(input_files, args.input_extension)
    # NOTE(review): this previously referenced ``demultiplex_output_folder``,
    # a name not defined anywhere in the visible script. ``args.input`` is used
    # because the error message below states that the profiles are expected in
    # the input folder -- confirm against the intended layout.
    tsv_profiles = utilities.name_files(
        sample_names, args.input, tag="taxonomic_profile", extension="tsv")

    # check all of the expected profiles are found
    if not all(os.path.isfile(profile) for profile in tsv_profiles):
        sys.exit(
            "ERROR: Bypassing taxonomic profiling but all of the tsv taxonomy profile files are not found in the input folder. Expecting the following input files:\n"
            + "\n".join(tsv_profiles))

    # run taxonomic profile steps bypassing metaphlan2
    merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(
        workflow, tsv_profiles, args.output, args.threads, "tsv", already_profiled=True)

    # look for the sam profiles
default=20, type=int) workflow.add_argument("strain-list", desc="input file with list of strains to profile", default="") workflow.add_argument("assembly-options", desc="additional options when running the assembly step", default="") # get the arguments from the command line args = workflow.parse_args() # get all input files with the input extension provided on the command line # return an error if no files are found input_files = utilities.find_files(args.input, extension=args.input_extension, exit_if_not_found=True) # check for index files, do not error if they are not found index_files = utilities.find_files(args.input, extension=args.index_identifier + "." + args.input_extension) # remove the index files, if found, from the set of input files input_files = list(filter(lambda file: not file in index_files, input_files)) # if a dual index file is provided, then demultiplex dual indexing if args.dual_barcode_file: if ".bz2" in args.input_extension: sys.exit( "ERROR: Bz2 formatted files are not supported with demultiplexing")