desc="number of threads for panphlan to use") workflow.add_argument("dbfolder", default=None, desc="folder containing database") workflow.add_argument("filesfile", default=None, desc="file with filepaths to run on (relative to input)") workflow.add_argument("ref", default=None, desc="name of reference db") workflow.add_argument( "refs", default=None, desc="file with list of references (relative to dbfolder)") args = workflow.parse_args() in_files = workflow.get_input_files(".fastq.gz") out_files = workflow.name_output_files(name=in_files, tag="panphlan_map", extension="csv.bz2") if args.filesfile: with open(args.filesfile) as f: in_files = [l.strip() for l in f] if args.dbfolder: cmd = "panphlan_map.py -c [reference] -i [depend] -o [target] -p [threads] --i_bowtie2_indexes [db]" else: cmd = "panphlan_map.py -c [reference] -i [depend] -o [target] -p [threads]" if args.ref: refs = [args.ref]
from anadama2 import Workflow

# create a workflow instance, providing the version number and description
# the version number will appear when running this script with the "--version" option
# the description will appear when running this script with the "--help" option
workflow = Workflow(version="0.1", description="A workflow to run KneadData")

# add the custom arguments to the workflow
workflow.add_argument("kneaddata-db", desc="the kneaddata database",
                      default="/work/code/kneaddata/db/")
workflow.add_argument("input-extension", desc="the input file extension",
                      default="fastq")
workflow.add_argument("threads", desc="number of threads for knead_data to use",
                      default=1)

# get the arguments from the command line
args = workflow.parse_args()

# get all input files with the input extension provided on the command line
in_files = workflow.get_input_files(extension=args.input_extension)

# get a list of output files, one for each input file, with the kneaddata tag
out_files = workflow.name_output_files(name=in_files, tag="kneaddata")

# create a task for each set of input and output files to run kneaddata
workflow.add_task_group(
    "kneaddata --input [depends[0]] --output [output_folder] "
    "--reference-db [kneaddata_db] --threads [threads]",
    depends=in_files,
    targets=out_files,
    output_folder=args.output,
    kneaddata_db=args.kneaddata_db,
    threads=args.threads)

workflow.go()
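Assuming the script above is saved as run_kneaddata.py (a file name chosen here for illustration), it can be run with the built-in anadama2 input/output options plus the custom arguments it registers, for example: python run_kneaddata.py --input fastq_dir --output kneaddata_out --threads 4. Note that dashed argument names become underscored attributes after parsing (kneaddata-db is read as args.kneaddata_db), and running with --help or --version prints the description and version string passed to Workflow.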
import bz2
import pickle

# standard SAM column indexes: read name, reference name, and sequence
SAM_READ_NAME_INDEX = 0
SAM_REFERENCE_NAME_INDEX = 2
SAM_SEQ_INDEX = 9

# load the metaphlan2 marker database and map each marker to its species
# (workflow, args, and species_list are assumed to be set up earlier in the script)
db = pickle.load(bz2.BZ2File(args.pkl_database, 'r'))

marker_to_species = {}
for marker, info in db['markers'].items():
    if info['clade'] in species_list:
        marker_to_species[marker] = info['clade']

def find_reads(task):
    # read in the sam file and pull out the reads that align with the markers
    with open(task.targets[0].name, "w") as file_handle_write:
        with open(task.depends[0].name) as file_handle:
            for line in file_handle:
                if not line.startswith("@"):
                    data = line.rstrip().split("\t")
                    reference = data[SAM_REFERENCE_NAME_INDEX]
                    if reference in marker_to_species:
                        seq_id = ";".join([data[SAM_READ_NAME_INDEX],
                                           marker_to_species[reference]])
                        seq = data[SAM_SEQ_INDEX]
                        file_handle_write.write("\n".join([">" + seq_id, seq]) + "\n")

# for each of the input files write the fasta file of reads
for infile in workflow.get_input_files(extension=args.input_tag_extension):
    outfile = workflow.name_output_files(infile).replace(
        args.input_tag_extension, "_metaphlan2_marker_aligned_subset.fasta")
    workflow.add_task(
        find_reads,
        depends=infile,
        targets=outfile)

workflow.go()
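The FASTA headers written by find_reads have the form ">read_id;species", so a quick sanity check on the output is to tally reads per species from a subset file. A minimal sketch, where the helper name and the example file name are illustrative rather than part of the workflow:

import collections

def count_reads_per_species(fasta_path):
    # tally headers of the form ">read_id;species" written by find_reads
    counts = collections.Counter()
    with open(fasta_path) as fh:
        for line in fh:
            if line.startswith(">"):
                counts[line.rstrip().split(";")[-1]] += 1
    return counts

# example usage (file name is hypothetical)
for species, n in count_reads_per_species(
        "sample1_metaphlan2_marker_aligned_subset.fasta").items():
    print(species, n)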