class FastqpTask(sl.ContainerTask):
    # Input: FASTQ
    in_fastq = None
    # Output is a summary of the FASTQ quality
    summary_path = sl.Parameter()
    input_mount_point = sl.Parameter(default="/mnt/input/")
    output_mount_point = sl.Parameter(default="/mnt/output/")
    # URL of the container
    container = "quay.io/fhcrc-microbiome/fastqp:fastqp-v0.2"

    def out_summary(self):
        return sl.ContainerTargetInfo(self, self.summary_path)

    def run(self):
        input_targets = {"fastq": self.in_fastq()}
        output_targets = {"summary_file": self.out_summary()}
        self.ex(
            command="fastqp -e $summary_file $fastq",
            input_targets=input_targets,
            output_targets=output_targets,
            input_mount_point=self.input_mount_point,
            output_mount_point=self.output_mount_point,
        )
class FastQC(sl.Task):
    in_data = None
    tool_path = sl.Parameter(default=None)
    outp_path = sl.Parameter()

    def out_fastq(self):
        return sl.TargetInfo(self, self.outp_path + 'fastq_res')

    def run(self):
        # The upstream task serialized the pair of FASTQ paths
        fastqs = utils.deserialize(self.in_data().path, load)
        utils.mkdir_if_not_exist(self.outp_path)
        args_dict = OrderedDict()
        args_dict['-o'] = self.outp_path
        cmd = '{tool} {args} {inp1} {inp2}'.format(
            tool=self.tool_path,
            args=utils.make_args(args_dict),
            inp1=fastqs[0],
            inp2=fastqs[1])
        print('Command', cmd)
        try:
            run(cmd, shell=True, check=True)
        except CalledProcessError:
            print('FastQC failed:', cmd)
            raise
        utils.serialize(glob(self.outp_path), self.out_fastq().path, dump)
class MarkDuplicates(sl.Task):
    tool_path = sl.Parameter()
    outp_path = sl.Parameter()
    working_dir = sl.Parameter()
    in_data = None

    def out_data(self):
        return sl.TargetInfo(self, self.outp_path + 'mark_dupl_res')

    def run(self):
        with utils.cd(self.working_dir):
            utils.mkdir_if_not_exist(self.outp_path)
            res = utils.deserialize(self.in_data().path, load)
            cmds = [
                # Mark duplicate reads
                'java -jar {tool} MarkDuplicates INPUT=sorted.bam OUTPUT=dedup_reads.bam '
                'METRICS_FILE=metrics.txt'.format(tool=self.tool_path),
                # Add read-group info
                'java -jar {tool} AddOrReplaceReadGroups I=dedup_reads.bam '
                'O=dedup_reads_w_readgroups.bam RGID=4 RGLB=lib1 '
                'RGPL=illumina RGPU=unit1 RGSM=20'.format(tool=self.tool_path),
            ]
            for cmd in cmds:
                print('Command', cmd)
                try:
                    run(cmd, shell=True, check=True)
                except CalledProcessError:
                    print('Command failed:', cmd)
                    raise
            # Record everything produced in the working directory
            utils.serialize(glob('*'), self.out_data().path, dump)
class Samtools(sl.Task):
    in_data = None
    tool_path = sl.Parameter()
    outp_path = sl.Parameter()
    working_dir = sl.Parameter()
    ref_fasta = sl.Parameter()

    def out_data(self):
        return sl.TargetInfo(self, self.outp_path + 'samtools_res')

    def run(self):
        sam_file = utils.deserialize(self.in_data().path, load)[0]
        utils.mkdir_if_not_exist(self.outp_path)
        with utils.cd(self.working_dir):
            cmds = [
                # Convert SAM to BAM
                '{tool} view -bS {sam_file} > res.bam'.format(
                    tool=self.tool_path, sam_file=sam_file),
                # Sort the BAM file
                '{tool} sort res.bam -o sorted.bam -O BAM'.format(tool=self.tool_path),
                # Index the sorted BAM
                '{tool} index sorted.bam'.format(tool=self.tool_path),
                # Make an index for the reference
                '{tool} faidx {ref_fasta}'.format(
                    tool=self.tool_path, ref_fasta=self.ref_fasta),
            ]
            for cmd in cmds:
                try:
                    run(cmd, shell=True, check=True)
                except CalledProcessError:
                    print('Command failed:', cmd)
                    raise
            utils.serialize(glob('*'), self.out_data().path, dump)
class TransferFTPtoS3(sl.ContainerTask):
    """Transfer a file from an FTP server to an AWS S3 bucket."""

    # FTP path
    ftp_url = sl.Parameter()
    # S3 path
    s3_url = sl.Parameter()
    # Container with wget
    container = "quay.io/fhcrc-microbiome/python:python-v0.1"

    def out_file(self):
        # File on S3
        return sl.ContainerTargetInfo(self, self.s3_url)

    def run(self):
        output_targets = {"out_file": self.out_file()}
        self.ex(
            command="wget -O $out_file {}".format(self.ftp_url),
            output_targets=output_targets
        )
class ImportSRAFastq(sl.ContainerTask):
    # Parameter: SRA accession to download data from
    sra_accession = sl.Parameter()
    # Parameter: AWS S3 folder for this project
    base_s3_folder = sl.Parameter()
    # Scratch directory
    scratch_directory = sl.Parameter(default="/scratch")
    # URL of the container
    container = "quay.io/fhcrc-microbiome/get_sra:v0.3"

    def out_fastq(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.base_s3_folder, "reads",
                         self.sra_accession + ".fastq.gz"))

    def run(self):
        self.ex(command=" ".join([
            "get_sra.py",
            "--accession", self.sra_accession,
            "--output-path", self.out_fastq().path,
            "--temp-folder", self.scratch_directory
        ]))
class ExtractAnnotations(sl.Task):
    """Extract all of the annotations and make a TSV keyed by the 16S accession names."""

    # Input files
    in_fastas = None
    in_annotations = None
    # Single flat file with all of the annotations
    s3_url = sl.Parameter()
    # Temporary folder to use for downloading data inside the Docker container
    temp_folder = sl.Parameter()

    def out_file(self):
        # File on S3
        return sl.ContainerTargetInfo(self, self.s3_url)

    def run(self):
        # The final output is a DataFrame with 16S accessions as rows,
        # annotations as columns, and copy numbers as values
        output = {}
        for genome_id, genome_transcripts in self.in_fastas.items():
            # Make sure that we also have an annotation for this genome
            assert genome_id in self.in_annotations
            # Get the transcript names from the FASTA
            # (the leading "s3://" is stripped before splitting)
            bucket, transcript_key = genome_transcripts().path[5:].split("/", 1)
            transcript_ids = [
                header.split(" ", 1)[0]
                for header, seq in read_fasta_from_s3(bucket, transcript_key)
                if " 16S " in header or " SSU " in header
            ]
            # Get the annotations for this genome
            bucket, annotation_key = self.in_annotations[genome_id]().path[5:].split("/", 1)
            genome_annotations = read_tsv_from_s3_as_dataframe(bucket, annotation_key)
            # Make a copy number vector
            functional_copy_numbers = genome_annotations["product"].value_counts().to_dict()
            # Add the annotations to the output
            for transcript in transcript_ids:
                assert transcript not in output, "Transcript found twice, stopping ({})".format(transcript)
                output[transcript] = functional_copy_numbers
        # Make a single DataFrame
        output = pd.DataFrame(output).fillna(0).T
        # Now write it to S3
        output_bucket, output_key = self.out_file().path[5:].split("/", 1)
        tsv_buffer = io.StringIO()
        output.to_csv(tsv_buffer, sep='\t')
        s3 = boto3.resource('s3')
        s3.Object(output_bucket, output_key).put(Body=tsv_buffer.getvalue())
class CheckM(sl.ContainerTask):
    # Input FASTP file of protein sequences
    in_faa = None
    # Sample name
    sample_name = sl.Parameter()
    # Output folder
    output_folder = sl.Parameter()
    # Number of threads to use
    threads = sl.Parameter(default=4)
    # Scratch directory
    temp_folder = sl.Parameter(default="/scratch")
    # URL of the container
    container = "quay.io/fhcrc-microbiome/checkm:checkm-v1.0.11"

    def out_tsv(self):
        # Output is a TSV summarizing the results
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.sample_name + ".checkm.tsv"))

    def run(self):
        if self.output_folder.endswith("/") is False:
            self.output_folder = self.output_folder + "/"

        input_targets = {"faa": self.in_faa()}
        output_targets = {"tsv": self.out_tsv()}

        # Unique scratch directory for this run
        temp_dir = os.path.join(self.temp_folder, str(uuid.uuid4())[:8])

        self.ex(
            command="echo 'Checking to see if temp directory ({}) exists' && ".format(temp_dir) +
                    "[ ! -d '{}' ] && ".format(temp_dir) +
                    "echo Making temp directory {} && ".format(temp_dir) +
                    "mkdir {} && ".format(temp_dir) +
                    "echo Making temp directory for input files {}/checkm_input && ".format(temp_dir) +
                    "mkdir {}/checkm_input && ".format(temp_dir) +
                    "echo Making temp directory for output files {}/checkm_output && ".format(temp_dir) +
                    "mkdir {}/checkm_output && ".format(temp_dir) +
                    "echo Moving gene FAA file into input directory && " +
                    "mv $faa {}/checkm_input/ && ".format(temp_dir) +
                    "echo Decompressing input file && " +
                    "gunzip {}/checkm_input/* && ".format(temp_dir) +
                    "ls -lhtr {}/checkm_input/ && ".format(temp_dir) +
                    "echo Running checkm && " +
                    "checkm lineage_wf --genes -x fastp -t {} --file {}/checkm.tsv "
                    "{}/checkm_input/ {}/checkm_output/ && ".format(
                        self.threads, temp_dir, temp_dir, temp_dir) +
                    "echo Finished running checkm && " +
                    "echo Copying results out of the container && " +
                    "mv {}/checkm.tsv $tsv && ".format(temp_dir) +
                    "echo Deleting temporary folders && " +
                    "rm -r {}".format(temp_dir),
            input_targets=input_targets,
            output_targets=output_targets)
class TrimmoTaskWParameters(sl.Task):
    """Execute trimmomatic on fastq files."""

    in_data = None
    trimmo_parameters = None  # OrderedDict
    adapters_path = sl.Parameter()
    tool_path = sl.Parameter()
    adapter = sl.Parameter()
    outp_path = sl.Parameter()

    def out_trimmo(self):
        adapter_full_path = self.adapters_path + self.adapter
        # Build the default argument set
        # TODO: move these constants to a config file
        self.default_trimmo_args = OrderedDict()
        args = [
            'SE', 'PE', '-basein', '-baseout', 'ILLUMINACLIP:', 'MAXINFO:',
            'SLIDINGWINDOW:', 'LEADING:', 'CROP:', 'TRAILING:', 'HEADCROP:',
            'MINLEN:'
        ]
        vals = [
            None, True, None, None, adapter_full_path + ':2:30:10', None,
            None, 30, None, 30, 15, 20
        ]
        for arg, val in zip_longest(args, vals):
            self.default_trimmo_args[arg] = val
        return sl.TargetInfo(self, self.outp_path + 'trimmo_res.txt')

    def run(self):
        outp_full_path = self.outp_path + 'trimmo'
        fastqs = utils.deserialize(self.in_data().path, load)
        utils.mkdir_if_not_exist(self.outp_path)
        if not self.trimmo_parameters:
            self.trimmo_parameters = self.default_trimmo_args
        self.trimmo_parameters['-basein'] = fastqs[0]
        self.trimmo_parameters['-baseout'] = outp_full_path
        cmd = 'java -jar {tool_path} {args}'.format(
            tool_path=self.tool_path,
            args=utils.make_args(self.trimmo_parameters))
        print('Command', cmd)
        try:
            run(cmd, shell=True, check=True)
        except CalledProcessError:
            print('Trimmomatic failed:', cmd)
            raise
        utils.serialize(glob(self.outp_path + '*'), self.out_trimmo().path, dump)
class MapVirusesTask(sl.ContainerTask):
    # Inputs: FASTQ and reference database
    in_fastq = None
    in_ref_db_metadata = None
    in_ref_db_dmnd = None
    # Parameter: AWS S3 folder for output files
    output_folder = sl.Parameter()
    # Parameter: Name for output file(s)
    sample_name = sl.Parameter()
    # Parameter: Number of threads for alignment
    threads = sl.Parameter()
    # Parameter: Temporary folder to use on the device
    temp_folder = sl.Parameter()
    # URL of the container
    container = "quay.io/fhcrc-microbiome/map_viruses:v0.7"

    def out_json(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.sample_name + ".json.gz"))

    def out_sam(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.sample_name + ".sam.gz"))

    def run(self):
        self.ex(command=" ".join([
            "map_viruses.py",
            "--input", self.in_fastq().path,
            "--metadata", self.in_ref_db_metadata().path,
            "--ref-db", self.in_ref_db_dmnd().path,
            "--output-path", self.out_json().path,
            "--threads", str(self.threads),
            "--temp-folder", self.temp_folder,
            "--keep-alignments",
        ]))
class TrimReads(sl.Task, AutoPairedFastq):
    in_fastq1 = None
    in_fastq2 = None
    method = sl.Parameter(default="sickle")

    def run(self):
        if self.method == "sickle":
            self.ex(
                'sickle pe -g -f "{reads1}" -r "{reads2}" -o "{out1}" -p "{out2}" -t sanger -s /dev/null'
                .format(reads1=self.in_fastq1().path,
                        reads2=self.in_fastq2().path,
                        out1=self.out_fastq1().path,
                        out2=self.out_fastq2().path))
        elif self.method == "fastq-mcf":
            pass  # TODO: Add fastq-mcf
        elif self.method == "bbduk":
            self.ex(
                'bbduk.sh in="{reads1}" in2="{reads2}" '
                'out="{out1}" out2="{out2}" ref=adapters,phix stats="{outstats}"'
                .format(reads1=self.in_fastq1().path,
                        reads2=self.in_fastq2().path,
                        out1=self.out_fastq1().path,
                        out2=self.out_fastq2().path,
                        outstats=os.path.join(
                            os.path.dirname(self.out_fastq2().path),
                            "stats.txt")))
        else:
            raise ValueError(
                "Unimplemented trimming method chosen: {}".format(self.method))
class Extract16S(sl.ContainerTask):
    """Extract all of the 16S transcripts from a list of S3 FASTA files."""

    # Input files
    in_fastas = None
    # Folder with all of the transcripts in subfolders
    s3_parent_folder = sl.Parameter()
    # Single flat file with all of the transcripts
    s3_url = sl.Parameter()
    # Temporary folder to use for downloading data inside the Docker container
    temp_folder = sl.Parameter()
    # Container with wget
    container = "quay.io/fhcrc-microbiome/python:python-v0.1"

    def out_file(self):
        # File on S3
        return sl.ContainerTargetInfo(self, self.s3_url)

    def run(self):
        # Save all of the transcripts with " 16S " or " SSU " in the header
        output = {}
        for genome_transcripts in self.in_fastas:
            # Get the transcript names from the FASTA
            bucket, transcript_key = genome_transcripts().path[5:].split("/", 1)
            for header, seq in read_fasta_from_s3(bucket, transcript_key):
                if " 16S " in header or " SSU " in header:
                    header = header.split(" ", 1)[0]
                    assert header not in output, "Duplicated transcript ID, stopping ({})".format(header)
                    output[header] = seq
        # Now write it to S3
        output_bucket, output_key = self.out_file().path[5:].split("/", 1)
        fasta_buffer = io.StringIO()
        for header, seq in output.items():
            fasta_buffer.write(">{}\n{}\n".format(header, seq))
        s3 = boto3.resource('s3')
        s3.Object(output_bucket, output_key).put(Body=fasta_buffer.getvalue())
class FAMLITask(sl.ContainerTask):
    # Inputs: FASTQ and reference database
    in_fastq = None
    in_ref_dmnd = None
    # Parameter: Prefix for output file
    sample_name = sl.Parameter()
    # Parameter: Output folder
    output_folder = sl.Parameter()
    # Parameter: Number of threads for alignment
    threads = sl.Parameter()
    # Parameter: Number of blocks for alignment (each block takes ~6Gb)
    blocks = sl.Parameter(default=5)
    # Parameter: Temporary folder to use on the device
    temp_folder = sl.Parameter()
    # URL of the container
    container = "quay.io/fhcrc-microbiome/famli:v1.1"

    def out_json(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder,
                         "{}.json.gz".format(self.sample_name)))

    def run(self):
        if self.output_folder[-1] != '/':
            self.output_folder += '/'

        self.ex(command=" ".join([
            "famli", "align",
            "--input", self.in_fastq().path,
            "--sample-name", self.sample_name,
            "--ref-db", self.in_ref_dmnd().path,
            "--output-folder", self.output_folder,
            "--threads", str(self.threads),
            "--blocks", str(self.blocks),
            "--temp-folder", self.temp_folder
        ]))
class BatchJobRunner(sciluigi.Task):
    """Common runner class for AWS Batch array jobs."""

    job_id = None
    job_def_name = None
    job_def_revision = None
    script_url = None
    queue = sciluigi.Parameter()
    bucket_name = sciluigi.Parameter()
    pipeline_name = sciluigi.Parameter()
    sample_list_file = sciluigi.Parameter()
    reference = sciluigi.Parameter()
    job_name = None
    submit_args = None

    def run(self):
        self.job_def_revision = get_latest_jobdef_revision(self.job_def_name)
        jobdef = self.job_def_name + ":" + str(self.job_def_revision)
        # Download the sample list from S3
        bytebuf = io.BytesIO()
        s3client = boto3.client("s3")
        url = urlparse(self.sample_list_file)
        bucket = url.netloc
        path = url.path.lstrip("/")
        s3client.download_fileobj(bucket, path, bytebuf)
        raw_sample = bytebuf.getvalue().decode("utf-8")
        samples = raw_sample.splitlines()
        samples = [x.replace(".bam", "") for x in samples]
        array_size = len(samples)
        if array_size < 2:
            LOG.error("You must specify at least two samples to run an array job!")
            sys.exit(1)
        env = [
            dict(name="BUCKET_NAME", value=self.bucket_name),
            dict(name="LIST_OF_SAMPLES", value=self.sample_list_file),
            dict(name="BATCH_FILE_S3_URL", value=self.script_url),
            dict(name="REFERENCE", value=self.reference)
        ]
        self.submit_args = dict(
            jobQueue=self.queue,
            arrayProperties=dict(size=array_size),
            jobDefinition=jobdef,
            containerOverrides=dict(environment=env))
class IntegrateAssembliesTask(sl.ContainerTask):
    # Input FASTP files
    in_fastp_list = None
    # Input GFF files
    in_gff_list = None
    # Folder with GFF files
    gff_folder = sl.Parameter()
    # Folder with FASTP files
    fastp_folder = sl.Parameter()
    # Output prefix
    output_prefix = sl.Parameter()
    # Output folder
    output_folder = sl.Parameter()
    # Scratch directory
    temp_folder = sl.Parameter(default="/scratch")
    # URL of the container
    container = "quay.io/fhcrc-microbiome/integrate-metagenomic-assemblies:v0.4"

    def out_daa(self):
        # DIAMOND database
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.output_prefix + ".dmnd"))

    def out_json(self):
        # JSON summary of all data
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.output_prefix + ".json.gz"))

    def run(self):
        if self.output_folder.endswith("/") is False:
            self.output_folder = self.output_folder + "/"

        self.ex(command=" ".join([
            "integrate_assemblies.py",
            "--gff-folder", self.gff_folder,
            "--prot-folder", self.fastp_folder,
            "--output-name", self.output_prefix,
            "--output-folder", self.output_folder,
            "--temp-folder", self.temp_folder
        ]))
class AlignFastqTask(sl.ContainerTask):
    # Inputs: FASTQ and reference database
    in_fastq = None
    in_ref_fasta = None
    # Parameter: Short name for reference
    ref_name = sl.Parameter()
    # Parameter: AWS S3 folder for this project
    base_s3_folder = sl.Parameter()
    # Parameter: Name for output file(s)
    sample_name = sl.Parameter()
    # Parameter: Number of threads for alignment
    threads = sl.Parameter()
    # Parameter: Temporary folder to use on the device
    temp_folder = sl.Parameter()
    # URL of the container
    container = "quay.io/fhcrc-microbiome/bwa:v0.7.17--3"

    def out_bam(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.base_s3_folder,
                         "align_bwa_{}".format(self.ref_name),
                         "{}.{}.bam".format(self.ref_name, self.sample_name)))

    def run(self):
        self.ex(command=" ".join([
            "run.py",
            "--input", self.in_fastq().path,
            "--ref-db", self.in_ref_fasta().path,
            "--sample-name", "{}.{}".format(self.ref_name, self.sample_name),
            "--output-folder", os.path.join(
                self.base_s3_folder, "align_bwa_{}".format(self.ref_name)),
            "--threads", str(self.threads),
            "--temp-folder", self.temp_folder
        ]))
class WF(sciluigi.WorkflowTask):
    """Workflow class."""

    queue = sciluigi.Parameter()
    bucket_name = sciluigi.Parameter()
    pipeline_name = sciluigi.Parameter()
    sample_list_file = sciluigi.Parameter()
    reference = sciluigi.Parameter()

    def workflow(self):
        # Make the pipeline name unique with a timestamp
        now = datetime.datetime.now()
        self.pipeline_name = self.pipeline_name + "-" + now.strftime("%Y%m%d%H%M%S")
        LOG.info("Pipeline name is %s.", self.pipeline_name)
        step1 = self.new_task(
            'step1', StepOneJobRunner,
            queue=self.queue,
            bucket_name=self.bucket_name,
            pipeline_name=self.pipeline_name,
            sample_list_file=self.sample_list_file,
            reference=self.reference)
        step2 = self.new_task(
            'step2', StepTwoJobRunner,
            queue=self.queue,
            bucket_name=self.bucket_name,
            pipeline_name=self.pipeline_name,
            sample_list_file=self.sample_list_file,
            reference=self.reference)
        step2.in_step1 = step1.out_jobid
        step3 = self.new_task(
            'step3', StepThreeJobRunner,
            queue=self.queue,
            bucket_name=self.bucket_name,
            pipeline_name=self.pipeline_name,
            sample_list_file=self.sample_list_file,
            reference=self.reference)
        step3.in_step2 = step2.out_jobid
        return step3
class Input(sl.Task):
    in_fastq1 = sl.Parameter()
    in_fastq2 = sl.Parameter()

    def out_fastq1(self):
        return sl.TargetInfo(
            self,
            os.path.join(str(self.workflow_task.workdir), "input_reads",
                         str(self.workflow_task.prefix) + "_1.fq.gz"))

    def out_fastq2(self):
        return sl.TargetInfo(
            self,
            os.path.join(str(self.workflow_task.workdir), "input_reads",
                         str(self.workflow_task.prefix) + "_2.fq.gz"))

    def run(self):
        # Create the input_reads directory, then symlink the raw reads into it
        self.out_fastq1().target.fs.mkdir(
            os.path.dirname(self.out_fastq1().path))
        os.symlink(os.path.abspath(self.in_fastq1), self.out_fastq1().path)
        os.symlink(os.path.abspath(self.in_fastq2), self.out_fastq2().path)
class MultiSampleWorkflow(sl.WorkflowTask):
    """
    This workflow takes an entire dataset description and runs the
    SingleSampleWorkflow on each sample
    """

    midas_db = sl.Parameter()
    dataset_description = sl.Parameter()
    workdir = sl.Parameter()
    contaminant_removal_method = sl.Parameter(default="bbsplit")
    filter_genomes = luigi.ListParameter()
    ref_info_dir = sl.Parameter()
    ref_combo_hash = sl.Parameter()

    def workflow(self):
        # dataset_description is a path, so open it before parsing the JSON
        with open(self.dataset_description) as f:
            dataset_spec = json.load(f)
        tasks = []
        if len(self.filter_genomes) > 0:
            index_task = self.new_task("ref_index", CreateIndexForContamRemoval)
            tasks.append(index_task)
        # Samples are in an array in the JSON; each sample has a prefix and
        # two read files (see the example sketched below)
        for sample in dataset_spec["samples"]:
            wf = self.new_task(
                'SampleWorkflow_' + sample["prefix"],
                SingleSampleWorkflow,
                workdir=self.workdir,
                prefix=sample["prefix"],
                in_fastq1=sample["in_fastq1"],
                in_fastq2=sample["in_fastq2"],
                midas_db=self.midas_db,
                filter_genomes=self.filter_genomes,
                ref_info_dir=self.ref_info_dir,
                ref_combo_hash=self.ref_combo_hash)
            tasks.append(wf)
        return tasks
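# A sketch (not from the original source) of the dataset-description JSON that
# MultiSampleWorkflow expects. The keys are inferred from the fields read in
# workflow() above; the file paths are placeholders.
#
# {
#     "samples": [
#         {
#             "prefix": "sampleA",
#             "in_fastq1": "reads/sampleA_R1.fastq.gz",
#             "in_fastq2": "reads/sampleA_R2.fastq.gz"
#         },
#         {
#             "prefix": "sampleB",
#             "in_fastq1": "reads/sampleB_R1.fastq.gz",
#             "in_fastq2": "reads/sampleB_R2.fastq.gz"
#         }
#     ]
# }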
class PreprocessNematus(sciluigi.Task):
    src_lang = sciluigi.Parameter()
    trg_lang = sciluigi.Parameter()
    in_parallel = None

    def out_processed(self):
        return [
            TargetInfo(self, 'data/translate/preprocess/source.tok'),
            TargetInfo(self, 'data/translate/preprocess/target.tok'),
            TargetInfo(self, 'data/translate/preprocess/source.tok.json'),
            TargetInfo(self, 'data/translate/preprocess/target.tok.json')
        ]

    def run(self):
        self.ex('mkdir -p data/translate/preprocess/')

        # NOTE: the tokenization step is currently disabled (guarded by `if False`)
        if False:
            logging.info('Lowercasing and tokenizing source and target data.')
            self.ex('perl nematus/data/lowercase.perl < %s | '
                    'perl nematus/data/tokenizer.perl -threads 5 -l %s '
                    '> data/translate/preprocess/source.tok'
                    % (self.in_parallel[0].path, self.src_lang))
            self.ex('perl nematus/data/lowercase.perl < %s | '
                    'perl nematus/data/tokenizer.perl -threads 5 -l %s '
                    '> data/translate/preprocess/target.tok.ul'
                    % (self.in_parallel[1].path, self.trg_lang))
            # Underline fix
            self.ex("cat data/translate/preprocess/target.tok.ul | "
                    "sed 's/ _ /_/g' > data/translate/preprocess/target.tok")

        logging.info('Building vocabularies.')
        self.ex('(. /home/jwei/miniconda3/etc/profile.d/conda.sh && conda activate nematus '
                '&& python nematus/data/build_dictionary.py '
                'data/translate/preprocess/source.tok)')
        self.ex('(. /home/jwei/miniconda3/etc/profile.d/conda.sh && conda activate nematus '
                '&& python nematus/data/build_dictionary.py '
                'data/translate/preprocess/target.tok)')
class VirFinderTask(sl.ContainerTask):
    """Run the VirFinder tool on a set of contigs."""

    # Inputs: FASTA
    in_fasta = None
    # Parameter: AWS S3 folder for this project
    base_s3_folder = sl.Parameter()
    # Parameter: Name for output file(s)
    sample_name = sl.Parameter()
    # Use this to specify a different mount point for temporary files, if needed
    input_mount_point = sl.Parameter(default="/mnt/input/")
    output_mount_point = sl.Parameter(default="/mnt/output/")
    # URL of the container
    container = "quay.io/fhcrc-microbiome/virfinder:v1.1--0"

    def out_tsv(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.base_s3_folder, "virfinder",
                         self.sample_name + ".tsv"))

    def run(self):
        input_targets = {"input_fasta": self.in_fasta()}
        output_targets = {"output_tsv": self.out_tsv()}
        self.ex(
            command="run_virfinder.Rscript $input_fasta $output_tsv",
            input_targets=input_targets,
            output_targets=output_targets,
            input_mount_point=self.input_mount_point,
            output_mount_point=self.output_mount_point,
        )
class PrepareFastq(sl.Task):
    # NOTE: write parameter values in the config without quotes, or the quotes
    # will be interpreted as part of the directory name. outp_path must point
    # to a file, not a directory.
    data_path = sl.Parameter()
    outp_path = sl.Parameter()

    def out_data(self):
        return sl.TargetInfo(self, self.outp_path)

    def run(self):
        # Pair up the sorted FASTQ files (R1/R2) and keep the first pair
        s = sorted(glob(self.data_path + '*.fastq.gz'))
        paired_fastq = list(zip(s[::2], s[1::2]))[0]
        with self.out_data().open('w') as outfile:
            dump(paired_fastq, outfile)
class AnnotateProkka(sl.ContainerTask):
    # Input FASTA file
    in_fasta = None
    # Sample name
    sample_name = sl.Parameter()
    # Output folder
    output_folder = sl.Parameter()
    # Number of threads to use
    threads = sl.Parameter(default=4)
    # Scratch directory
    temp_folder = sl.Parameter(default="/scratch")
    # URL of the container
    container = "quay.io/fhcrc-microbiome/metaspades:v3.11.1--8"

    def out_gff(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.sample_name + ".gff.gz"))

    def out_faa(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.sample_name + ".fastp.gz"))

    def run(self):
        if self.output_folder.endswith("/") is False:
            self.output_folder = self.output_folder + "/"

        self.ex(command=" ".join([
            "run_prokka.py",
            "--input", self.in_fasta().path,
            "--sample-name", self.sample_name,
            "--output-folder", self.output_folder,
            "--threads", str(int(self.threads)),
            "--temp-folder", self.temp_folder
        ]))
class RunMIDAS(sl.Task, AutoSentinel):
    midas_subtask = sl.Parameter()
    in_fastq1 = None
    in_fastq2 = None
    nproc = sl.Parameter(default=1)

    def run(self):
        output = self.out_put()
        outdir = os.path.dirname(output.path)
        # NOTE: the -d flag expects the MIDAS database path; its value is
        # missing here and is left as-is
        self.ex('''
            run_midas.py {subtask} \
                {outdir} \
                -d \
                -1 {in1} \
                -2 {in2} \
                -t {nproc} --remove_temp
        '''.format(subtask=self.midas_subtask,
                   outdir=outdir,
                   in1=self.in_fastq1().path,
                   in2=self.in_fastq2().path,
                   nproc=self.nproc))
        with output.open("w") as fp:
            self.log_info(fp)
class BQSR(sl.Task):
    tool_path = sl.Parameter()
    outp_path = sl.Parameter()
    working_dir = sl.Parameter()
    ref_fasta = sl.Parameter()
    dbsnp = sl.Parameter()
    indels = sl.Parameter()
    in_data = None

    def out_data(self):
        return sl.TargetInfo(self, self.outp_path + 'BQSR_res')

    def run(self):
        with utils.cd(self.working_dir):
            utils.mkdir_if_not_exist(self.outp_path)
            res = utils.deserialize(self.in_data().path, load)
            # TODO: add indexing of the dedup_reads_w_readgroups.bam file
            cmds = [
                # Analyze patterns of covariation in the sequence dataset
                'java -jar {tool_path} -T BaseRecalibrator '
                '-R {ref_fasta} -I dedup_reads_w_readgroups.bam '
                '-knownSites {dbsnp} '
                '-knownSites {indels} '
                '-o recal_data.table'.format(tool_path=self.tool_path,
                                             ref_fasta=self.ref_fasta,
                                             dbsnp=self.dbsnp,
                                             indels=self.indels),
                # Do a second pass to analyze covariation remaining after recalibration
                'java -jar {tool_path} -T BaseRecalibrator '
                '-R {ref_fasta} -I dedup_reads_w_readgroups.bam '
                '-knownSites {dbsnp} '
                '-knownSites {indels} '
                '-BQSR recal_data.table -o post_recal_data.table'.format(
                    tool_path=self.tool_path,
                    ref_fasta=self.ref_fasta,
                    dbsnp=self.dbsnp,
                    indels=self.indels),
                # Generate before/after plots
                'java -jar {tool_path} -T AnalyzeCovariates -R {ref_fasta} '
                '-L chr20 -before recal_data.table '
                '-after post_recal_data.table '
                '-plots recalibration_plots.pdf'.format(tool_path=self.tool_path,
                                                        ref_fasta=self.ref_fasta),
                # Apply the recalibration to your sequence data
                'java -jar {tool_path} -T PrintReads -R {ref_fasta} '
                '-I dedup_reads_w_readgroups.bam '
                '-BQSR recal_data.table -o recal_reads.bam'.format(
                    tool_path=self.tool_path, ref_fasta=self.ref_fasta)
            ]
            # Only the last two commands (plots and applying the recalibration) are run
            for cmd in cmds[2:]:
                print('Command', cmd)
                try:
                    run(cmd, shell=True, check=True)
                except CalledProcessError:
                    print('Command failed:', cmd)
                    raise
            utils.serialize(glob('*'), self.out_data().path, dump)
class MyFooReplacer(sciluigi.Task):
    # Here, we take as a parameter what to replace foo with
    replacement = sciluigi.Parameter()

    # Here we have one input, a "foo file":
    in_foo = None

    # ... and an output, a "bar file":
    def out_replaced(self):
        # As the path to the returned target(info), we
        # use the path of the foo file:
        return sciluigi.TargetInfo(self, self.in_foo().path + '.bar.txt')

    def run(self):
        with self.in_foo().open() as in_f:
            with self.out_replaced().open('w') as out_f:
                # Here we see that we use the parameter self.replacement:
                out_f.write(in_f.read().replace('foo', self.replacement))
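# A minimal sketch of how MyFooReplacer could be wired into a sciluigi
# workflow. WriteFooFile is a hypothetical upstream task exposing an out_foo()
# target; only MyFooReplacer comes from the code above.
class MyWorkflow(sciluigi.WorkflowTask):
    def workflow(self):
        foo_writer = self.new_task('foo_writer', WriteFooFile)  # hypothetical producer
        replacer = self.new_task('replacer', MyFooReplacer, replacement='bar')
        # Connect the upstream output to the downstream input
        replacer.in_foo = foo_writer.out_foo
        return replacer

if __name__ == '__main__':
    sciluigi.run_local(main_task_cls=MyWorkflow)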
class AssembleMetaSPAdes(sl.ContainerTask):
    # Input FASTQ file
    in_fastq = None
    # Sample name
    sample_name = sl.Parameter()
    # Output folder
    output_folder = sl.Parameter()
    # Number of threads to use
    threads = sl.Parameter(default=4)
    # Maximum amount of memory to use (gigabytes)
    max_mem = sl.Parameter(default=10)
    # Scratch directory
    temp_folder = sl.Parameter(default="/scratch")
    # URL of the container
    container = "quay.io/fhcrc-microbiome/metaspades:v3.11.1--8"

    def out_fasta(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.sample_name + ".fasta.gz"))

    def run(self):
        if self.output_folder.endswith("/") is False:
            self.output_folder = self.output_folder + "/"

        self.ex(command=" ".join([
            "run_metaspades.py",
            "--input", self.in_fastq().path,
            "--sample-name", self.sample_name,
            "--output-folder", self.output_folder,
            "--threads", str(int(self.threads)),
            "--max-mem", str(int(self.max_mem)),
            "--temp-folder", self.temp_folder
        ]))
class RunMetaphlan2(sl.Task, AutoTxt):
    in_fastq1 = None
    in_fastq2 = None
    nproc = sl.Parameter(default=1)

    def run(self):
        self.ex('''
            metaphlan2.py \
                {reads1},{reads2} \
                --no_map -t rel_ab_w_read_stats \
                --sample_id {prefix} \
                --nproc {nproc} --input_type fastq -o {out}
        '''.format(reads1=self.in_fastq1().path,
                   reads2=self.in_fastq2().path,
                   out=self.out_put().path,
                   nproc=self.nproc,
                   prefix=self.workflow_task.prefix))
class HUMAnN2Task(sl.ContainerTask):
    # Input FASTQ file
    in_fastq = None
    # Reference database
    ref_db = sl.Parameter(default="")
    # Sample name
    sample_name = sl.Parameter()
    # Output folder
    output_folder = sl.Parameter()
    # Number of threads to use
    threads = sl.Parameter(default=4)
    # Maximum amount of memory to use (gigabytes)
    max_mem = sl.Parameter(default=10)
    # Scratch directory
    temp_folder = sl.Parameter(default="/scratch")
    # URL of the container
    container = "quay.io/fhcrc-microbiome/humann2:v0.11.1--7"

    def out_json(self):
        # Output is an S3 object
        return sl.ContainerTargetInfo(
            self,
            os.path.join(self.output_folder, self.sample_name + ".json.gz"))

    def run(self):
        if self.output_folder.endswith("/") is False:
            self.output_folder = self.output_folder + "/"

        self.ex(command=" ".join([
            "run.py",
            "--input", self.in_fastq().path,
            "--sample-name", self.sample_name,
            "--output-folder", self.output_folder,
            "--ref-db", self.ref_db,
            "--threads", str(int(self.threads)),
            "--temp-folder", self.temp_folder
        ]))
class LoadFile(sl.ExternalTask):
    path = sl.Parameter()

    def out_file(self):
        return sl.ContainerTargetInfo(self, self.path)
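# A sketch (with placeholder S3 paths) of how LoadFile can feed a
# ContainerTask such as FAMLITask above: each LoadFile wraps an existing
# object, and its out_file method is assigned to a downstream in_* slot.
class ExampleFAMLIWorkflow(sl.WorkflowTask):
    def workflow(self):
        fastq = self.new_task(
            'fastq', LoadFile, path='s3://example-bucket/reads/sample1.fastq.gz')
        ref_db = self.new_task(
            'ref_db', LoadFile, path='s3://example-bucket/ref/db.dmnd')
        famli = self.new_task(
            'famli', FAMLITask,
            sample_name='sample1',
            output_folder='s3://example-bucket/famli/',
            threads='4',
            temp_folder='/scratch')
        # Wire the external files into the alignment task
        famli.in_fastq = fastq.out_file
        famli.in_ref_dmnd = ref_db.out_file
        return famli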