def run(self):
    """Convert the input VCF to ANNOVAR input format via ``convert2annovar.pl``.

    The command output is redirected to this task's output path. In dry-run
    mode the output file is additionally created with ``touch``.
    """
    out_path = self.output().path
    # NOTE(review): valid_path presumably prepares/validates the output location — confirm.
    valid_path(out_path, check_ofile=1)

    template = "%s/convert2annovar.pl %s --includeinfo -format vcf4 > %s"
    cmdline = template % (config.annovar_pro, self.input().path, out_path)
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Trim raw reads with Trimmomatic in paired-end or single-end mode.

    Paired-end mode is used when ``self.PE2`` exists on disk; otherwise only
    ``self.PE1`` is processed in single-end mode. In dry-run mode every
    output target is additionally created with ``touch``.
    """
    valid_path(self.output()[0].path, check_ofile=1)  # auto make output dir
    sample_name = str(self.sampleid)
    input1 = self.PE1
    input2 = self.PE2

    if exists(self.PE2):
        # Paired-end: two paired outputs plus two "unpaired" files written
        # next to the first output.
        pe_template = "java -jar {trimmomatic_jar} PE -threads {thread} {input1} {input2} {ofile1} {outdir}/{PE1_id}.unpaired.fq.gz {ofile2} {outdir}/{PE2_id}.unpaired.fq.gz ILLUMINACLIP:{trimmomatic_dir}/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50"
        cmdline = pe_template.format(
            trimmomatic_jar=trimmomatic_jar,
            trimmomatic_dir=trimmomatic_dir,
            input1=input1,
            input2=input2,
            PE1_id=sample_name + "_R1",
            PE2_id=sample_name + "_R2",
            ofile1=self.output()[0].path,
            ofile2=self.output()[1].path,
            outdir=dirname(self.output()[0].path),
            thread=default_params.trimmomatic_thread)
    else:
        # Single-end: only the first output target is written.
        cmdline = f"java -jar {trimmomatic_jar} SE -threads {default_params.trimmomatic_thread} {input1} {self.output()[0].path} ILLUMINACLIP:{trimmomatic_dir}/adapters/TruSeq3-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36"

    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    for target in self.output():
        run_cmd("touch %s" % target.path, dry_run=False)
def run(self):
    """Trim raw reads with Trimmomatic, configured through ``config``.

    Paired-end mode is used when ``self.PE2`` is truthy, single-end mode
    otherwise. In dry-run mode every output target is additionally created
    with ``touch``.
    """
    valid_path(self.output()[0].path, check_ofile=1)  # auto make output dir
    sample_name = self.infodict.get("SampleID", '')
    input1 = self.PE1
    input2 = self.PE2

    # Arguments common to both Trimmomatic modes.
    shared = dict(
        trimmomatic_jar=config.trimmomatic_jar,
        trimmomatic_dir=dirname(config.trimmomatic_jar),
        input1=input1,
        thread=config.trimmomatic_thread,
    )

    if not input2:
        se_template = "java -jar {trimmomatic_jar} SE -threads {thread} {input1} {ofile} ILLUMINACLIP:{trimmomatic_dir}/adapters/TruSeq3-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36"
        cmdline = se_template.format(ofile=self.output()[0].path, **shared)
    else:
        pe_template = "java -jar {trimmomatic_jar} PE -threads {thread} {input1} {input2} {ofile1} {base_out}/{PE1_id}.unpaired.fq.gz {ofile2} {base_out}/{PE2_id}.unpaired.fq.gz ILLUMINACLIP:{trimmomatic_dir}/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50"
        cmdline = pe_template.format(
            input2=input2,
            PE1_id=sample_name + "_R1",
            PE2_id=sample_name + "_R2",
            ofile1=self.output()[0].path,
            ofile2=self.output()[1].path,
            base_out=dirname(self.output()[0].path),
            **shared)

    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    for target in self.output():
        run_cmd("touch %s" % target.path, dry_run=False)
def test(odir, cmd, worker):
    """Run the full workflow on the bundled test data set.

    :param odir: output directory passed as ``--odir``; the command log is
        written to ``<odir>/cmd_log.txt``.
    :param cmd: when truthy, ``--local-scheduler`` is appended to the command.
    :param worker: number of workers; converted with ``int`` before use.
    """
    project_root_path = get_dir_path(__file__, 1)

    extra_opts = " --local-scheduler" if cmd else ''
    extra_opts += " --workers {}".format(str(int(worker)))

    full_cmd = f"python3 {project_root_path}/main.py run -- workflow --tab {project_root_path}/testset/seq_data/data_input.tsv --odir {odir} --analysis-type all --log-path {odir}/cmd_log.txt " + extra_opts
    run_cmd(full_cmd, dry_run=False)
def _join_pairs_w_command_output(
        fwd_fp,
        rev_fp,
        fastq_out,
        truncqual: int = join_params['truncqual'],
        minlen: int = join_params['minlen'],
        maxns: int = join_params['maxns'],
        allowmergestagger: bool = join_params['allowmergestagger'],
        minovlen: int = join_params['minovlen'],
        maxdiffs: int = join_params['maxdiffs'],
        minmergelen: int = join_params['minmergelen'],
        maxmergelen: int = join_params['maxmergelen'],
        maxee: float = join_params['maxee'],
        qmin: int = join_params['qmin'],
        qminout: int = join_params['qminout'],
        qmax: int = join_params['qmax'],
        qmaxout: int = join_params['qmaxout'],
        log_file=None):
    """Merge paired reads with ``vsearch --fastq_mergepairs`` and gzip the result.

    This function exists only to simplify unit testing.

    :param fwd_fp: forward reads file, passed to ``--fastq_mergepairs``.
    :param rev_fp: reverse reads file, passed to ``--reverse``.
    :param fastq_out: uncompressed output path; it is compressed afterwards
        with ``gzip -f``.
    :param log_file: forwarded to ``run_cmd`` for both commands.

    The remaining parameters map to the ``--fastq_*`` options of the same
    name. ``truncqual``, ``maxns``, ``minmergelen``, ``maxmergelen`` and
    ``maxee`` are omitted from the command when ``None``;
    ``--fastq_allowmergestagger`` is added only when ``allowmergestagger`` is
    truthy. ``--fastq_ascii`` is not passed.
    """
    cmd = [
        soft_db_path.vsearch_pth,
        '--fastq_mergepairs', fwd_fp,
        '--reverse', rev_fp,
        '--fastqout', fastq_out,
        '--fastq_minlen', str(minlen),
        '--fastq_minovlen', str(minovlen),
        '--fastq_maxdiffs', str(maxdiffs),
        '--fastq_qmin', str(qmin),
        '--fastq_qminout', str(qminout),
        '--fastq_qmax', str(qmax),
        '--fastq_qmaxout', str(qmaxout),
    ]

    # Options that are only emitted when a value was supplied.
    optional_flags = (
        ('--fastq_truncqual', truncqual),
        ('--fastq_maxns', maxns),
        ('--fastq_minmergelen', minmergelen),
        ('--fastq_maxmergelen', maxmergelen),
        ('--fastq_maxee', maxee),
    )
    for flag, value in optional_flags:
        if value is not None:
            cmd.extend([flag, str(value)])

    if allowmergestagger:
        cmd.append('--fastq_allowmergestagger')

    merge_cmd = ' '.join(cmd)
    run_cmd(merge_cmd, dry_run=False, log_file=log_file)

    gzip_cmd = ' '.join(['gzip', '-f', fastq_out])
    run_cmd(gzip_cmd, dry_run=False, log_file=log_file)
def run(self):
    """Build a QIIME 2 visualization with ``feature-table tabulate-seqs``.

    The second input target is used as ``--i-data``. In dry-run mode the
    output file is additionally created with ``touch``.
    """
    template = "qiime feature-table tabulate-seqs --i-data {input_f} --o-visualization {output_f}"
    cmd = template.format(
        input_f=self.input()[1].path,
        output_f=self.output().path)
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Build a QIIME 2 visualization with ``demux summarize``.

    The input target is used as ``--i-data``. In dry-run mode the output
    file is additionally created with ``touch``.
    """
    template = "qiime demux summarize --i-data {input_f} --o-visualization {output_f}"
    cmd = template.format(
        input_f=self.input().path,
        output_f=self.output().path)
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Run ``vsearch --sortbysize`` on the first input with ``--minsize 1``.

    The sorted sequences are written to the output target. In dry-run mode
    the output file is additionally created with ``touch``.
    """
    sorted_d2_fa = self.output().path
    derep_fa = self.input()[0].path
    valid_path([sorted_d2_fa], check_ofile=1)

    cmd = f"{vsearch} --sortbysize {derep_fa} --output {sorted_d2_fa} --minsize 1"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Dereplicate the filtered sequences with ``vsearch --derep_fulllength``.

    The first output target receives the dereplicated FASTA and the second
    the ``--uc`` file. In dry-run mode every output target is additionally
    created with ``touch``.
    """
    outputs = self.output()
    derep_fa, derep_uc = outputs[0].path, outputs[1].path
    filtered_fa = self.input().path

    cmd = f"{vsearch} --derep_fulllength {filtered_fa} --output {derep_fa} -sizeout --fasta_width 0 --uc {derep_uc}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    for target in self.output():
        run_cmd("touch %s" % target.path, dry_run=False)
def run(self):
    """Filter the merged reads with ``vsearch --fastx_filter``.

    The command uses ``--fastq_maxee 1.0`` and ``--fastq_trunclen 240`` and
    writes the result to the output target via ``--fastaout``. In dry-run
    mode the output file is additionally created with ``touch``.
    """
    filtered_fa = self.output().path
    merged_fa = self.input().path
    valid_path(filtered_fa, check_ofile=1)

    cmd = f"{vsearch} --fastx_filter {merged_fa} --fastq_maxee 1.0 --fastq_trunclen 240 --fastaout {filtered_fa}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Convert/filter the input alignment to uncompressed BAM with ``samtools view``.

    The option set depends on ``config.samtools_version``. In dry-run mode
    the output file is created with ``touch`` before the command is handed
    to ``run_cmd``.
    """
    # NOTE(review): the two branches use different filter options
    # (``-G 100`` vs ``-F 0x100``) — confirm they are meant to be equivalent.
    if config.samtools_version > 1:
        template = "samtools view -G 100 -bu %s -o %s"
    else:
        template = "samtools view -F 0x100 -bSu %s -o %s"
    cmdline = template % (self.input().path, self.output().path)

    if self.dry_run:
        run_cmd("touch %s" % self.output().path, dry_run=False)
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
def run(self):
    """Run GATK4 ``BaseRecalibrator`` on the input alignment.

    Reference, known-sites files and Java options are taken from ``config``.
    In dry-run mode the output file is additionally created with ``touch``.
    """
    template = "{gatk4} BaseRecalibrator --java-options '{java_option}' --reference {REF} --input {input_f} --known-sites {db_snp} --known-sites {known_gold_vcf} --output {output_f}"
    params = dict(
        gatk4=config.gatk_pro,
        java_option=config.java_option,
        REF=config.REF_file_path,
        input_f=self.input().path,
        db_snp=config.db_snp,
        known_gold_vcf=config.known_gold_vcf,
        output_f=self.output().path,
    )
    cmdline = template.format(**params)
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Run the quality summary script ``api/quality_accessment.py``.

    The script receives ``self.tab_file`` as input (``-i``) and ``self.odir``
    as output (``-o``); per the original note it iterates over all samples
    listed in ``self.tab_file``. In dry-run mode the output file is
    additionally created with ``touch``.
    """
    py_file = os.path.join(config.project_root_path,
                           "api",
                           "quality_accessment.py")
    template = "python3 {pyfile} -i {input} -o {output}"
    summarize_cmd = template.format(pyfile=py_file,
                                    input=self.tab_file,
                                    output=self.odir)
    run_cmd(summarize_cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Run reference-based chimera filtering with ``vsearch --uchime_ref``.

    The input is checked against the ``rdp_gold`` database and the
    non-chimeric sequences are written to the output target. In dry-run mode
    the output file is additionally created with ``touch``.
    """
    ref_nonchimer_fa = self.output().path
    denovo_nonchimer_fa = self.input().path
    valid_path(ref_nonchimer_fa, check_ofile=1)

    cmd = f"{vsearch} --uchime_ref {denovo_nonchimer_fa} --db {rdp_gold} --sizein --sizeout --fasta_width 0 --nonchimeras {ref_nonchimer_fa}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Run de novo chimera filtering with ``vsearch --uchime_deno``.

    The first input target is processed and the non-chimeric sequences are
    written to the output target. In dry-run mode the output file is
    additionally created with ``touch``.
    """
    denovo_nonchimer_fa = self.output().path
    precluster_fa = self.input()[0].path
    valid_path([denovo_nonchimer_fa], check_ofile=1)

    cmd = f"{vsearch} --uchime_deno {precluster_fa} --sizein --sizeout --fasta_width 0 --nonchimeras {denovo_nonchimer_fa}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Run the ``map_pl`` Perl script and redirect its stdout to the output.

    The script receives the two ``"derep"`` inputs (FASTA and ``.uc`` file)
    and the ``"map1"`` input, in that order. In dry-run mode the output file
    is additionally created with ``touch``.
    """
    inputs = self.input()
    derep_targets = inputs["derep"]
    derep_fa = derep_targets[0].path
    derep_uc = derep_targets[1].path
    nonchimera_derep_fa = inputs["map1"].path

    nonchimera_fa = self.output().path
    valid_path(nonchimera_fa, check_ofile=1)

    cmd = f"perl {map_pl} {derep_fa} {derep_uc} {nonchimera_derep_fa} > {nonchimera_fa}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Run ``BaseRecalibrator`` through the GATK jar at ``config.gatkv36_path``.

    Thread count, reference, known-sites files and Java options are taken
    from ``config``. In dry-run mode the output file is additionally created
    with ``touch``.
    """
    template = "java {java_option} -jar {gatk} -T BaseRecalibrator -nct {gatk_thread} -R {REF} -I {input_f} -knownSites {db_snp} -knownSites {known_gold_vcf} -o {output_f}"
    params = dict(
        gatk=config.gatkv36_path,
        java_option=config.java_option,
        gatk_thread=config.gatk_thread,
        REF=config.REF_file_path,
        input_f=self.input().path,
        db_snp=config.db_snp,
        known_gold_vcf=config.known_gold_vcf,
        output_f=self.output().path,
    )
    cmdline = template.format(**params)
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Remove duplicates with Picard ``MarkDuplicates`` unless ``config.PCR_ON``.

    When ``config.PCR_ON`` is truthy the step is replaced by a plain
    ``touch`` of the output. Otherwise ``dedup_metrics.txt`` is written next
    to the output file. In dry-run mode the output file is additionally
    created with ``touch``.
    """
    valid_path(self.output().path, check_ofile=1)

    if not config.PCR_ON:
        template = "java {java_option} -jar {pircard_jar} MarkDuplicates INPUT={input} OUTPUT={output} METRICS_FILE={odir}/dedup_metrics.txt CREATE_INDEX=true REMOVE_DUPLICATES=true AS=true"
        cmdline = template.format(
            pircard_jar=config.pircard_jar,
            java_option=config.java_option,
            input=self.input().path,
            output=self.output().path,
            odir=os.path.dirname(self.output().path))
    else:
        cmdline = "touch %s" % self.output().path

    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Join paired reads with QIIME 2 ``vsearch join-pairs``.

    Every entry of ``config.join_params`` is rendered as a ``--p-<name>``
    option: a value that is exactly ``True`` becomes a bare flag, ``None``
    and ``False`` are skipped, and anything else is passed as the option
    value. In dry-run mode the output file is additionally created with
    ``touch``.
    """
    fragments = []
    for name, val in config.join_params.items():
        if val is True:
            fragments.append(' --p-%s' % name)
        elif not (val is None or val is False):
            fragments.append(' --p-%s %s ' % (name, val))
    extra_str = ''.join(fragments)

    template = "{qiime2_p} vsearch join-pairs --i-demultiplexed-seqs {input_file} --o-joined-sequences {ofile}"
    cmd = template.format(
        qiime2_p=config.qiime2_p,
        input_file=self.input().path,
        ofile=self.output().path) + extra_str
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Pre-cluster sequences with ``vsearch --cluster_size`` at identity 0.97.

    The first input target is clustered; centroids go to the first output
    target and the ``--uc`` file to the second. In dry-run mode every output
    target is additionally created with ``touch``.
    """
    outputs = self.output()
    precluster_fa, precluster_uc = outputs[0].path, outputs[1].path
    derep_fa = self.input()[0].path

    cmd = f"{vsearch} --cluster_size {derep_fa} --id 0.97 --sizeout --fasta_width 0 --uc {precluster_uc} --centroids {precluster_fa}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    for target in self.output():
        run_cmd("touch %s" % target.path, dry_run=False)
def run(self):
    """Build the raw OTU table with ``vsearch --usearch_global``.

    The ``'filter'`` input is searched against the first ``'cluster'`` input
    at identity 0.97. The OTU table is written to the first output target
    and the ``--uc`` mapping to the second. In dry-run mode every output
    target is additionally created with ``touch``.
    """
    inputs = self.input()
    filtered_fa = inputs['filter'].path
    rep_fa = inputs['cluster'][0].path

    outputs = self.output()
    raw_otutab, map_output = outputs[0].path, outputs[1].path
    valid_path([map_output, raw_otutab], check_ofile=1)

    cmd = f"{vsearch} --usearch_global {filtered_fa} --db {rep_fa} --strand plus --id 0.97 --uc {map_output} --otutabout {raw_otutab}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    for target in self.output():
        run_cmd("touch %s" % target.path, dry_run=False)
def run(self):
    """Remove duplicates with GATK4 ``MarkDuplicates`` unless ``config.PCR_ON``.

    When ``config.PCR_ON`` is truthy the step is replaced by a plain
    ``touch`` of the output. Otherwise ``dedup_metrics.txt`` is written next
    to the output file. In dry-run mode the output file is additionally
    created with ``touch``.
    """
    valid_path(self.output().path, check_ofile=1)

    if not config.PCR_ON:
        template = "{gatk4} MarkDuplicates --java-options '{java_option}' --INPUT {input_f} --OUTPUT {output_f} --METRICS_FILE {odir}/dedup_metrics.txt --CREATE_INDEX true --REMOVE_DUPLICATES true -AS true"
        cmdline = template.format(
            gatk4=config.gatk_pro,
            java_option=config.java_option,
            input_f=self.input().path,
            output_f=self.output().path,
            odir=dirname(self.output().path))
    else:
        cmdline = "touch %s" % self.output().path

    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def main():
    """Render the dependency graph of the requested task to ``pipelines.png``.

    The task is resolved from the command-line arguments. When ``--tab`` is
    not supplied, the bundled test table
    ``testset/seq_data/data_input.tsv`` is used. The graph is written in DOT
    format to a temporary file and converted with Graphviz ``dot``; the
    image is placed in the task's ``odir``.
    """
    # Local imports keep this block self-contained.
    import os
    import tempfile

    cmdline_args = sys.argv[1:]
    if "--tab" not in cmdline_args:
        default_tab = join(dirname(dirname(__file__)),
                           "testset", "seq_data", "data_input.tsv")
        cmdline_args += ["--tab", default_tab]

    with CmdlineParser.global_instance(cmdline_args) as cp:
        task = cp.get_task_obj()
        graph = get_graph(task)
        dot_graph = construct_dot_output(graph)

        # Use a unique temporary file instead of the fixed '/tmp/tmp.dot':
        # a predictable shared path collides between concurrent runs and
        # fails when the file is owned by another user.
        with tempfile.NamedTemporaryFile('w', suffix='.dot',
                                         delete=False) as f1:
            f1.write(dot_graph)
            dot_path = f1.name

        ofile = join(task.odir, "pipelines.png")
        try:
            run_cmd(f"dot -Tpng < {dot_path} > {ofile}", )
        finally:
            # The DOT file is only an intermediate; do not leave it behind.
            os.remove(dot_path)
def run(self):
    """Join paired reads, or link the single-end reads to the output.

    When ``self.PE2`` exists on disk, the two cleaned read files are merged
    with ``_join_pairs_w_command_output``, which writes an uncompressed
    FASTQ and then gzips it. Otherwise the first input is symlinked to the
    output path. In dry-run mode no reads are processed and the output file
    is created with ``touch`` in both branches.
    """
    from static.join_pairs import _join_pairs_w_command_output

    clean_pe1 = self.input()[0].path
    if exists(self.PE2):
        clean_pe2 = self.input()[1].path
        if not self.dry_run:
            out_path = self.output().path
            # Strip only a trailing ".gz"; str.replace would also remove
            # ".gz" occurring elsewhere in the path (e.g. a directory name).
            if out_path.endswith('.gz'):
                fastq_out = out_path[:-len('.gz')]
            else:
                fastq_out = out_path
            # NOTE(review): this call uses self.log_path while the other
            # commands use self.get_log_path() — confirm which is intended.
            _join_pairs_w_command_output(
                fwd_fp=clean_pe1,
                rev_fp=clean_pe2,
                fastq_out=fastq_out,
                log_file=self.log_path)
    else:
        cmd = f"ln -s `realpath {clean_pe1}` {self.output().path}"
        run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if self.dry_run:
        # Create the output in both branches so the output path exists after
        # a dry run; previously the single-end branch produced nothing.
        run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Run GATK ``RealignerTargetCreator`` on the selected input.

    The second input target is used when ``config.PCR_ON`` is truthy,
    otherwise the first. In dry-run mode the output file is additionally
    created with ``touch``.
    """
    valid_path(self.output().path, check_ofile=1)

    input_f = self.input()[1].path if config.PCR_ON else self.input()[0].path

    template = "java -jar {gatk} -T RealignerTargetCreator -nt {thread} -R {REF} -I {input_f} --known {known_gold_vcf} -o {output_f}"
    params = dict(
        gatk=config.gatkv36_path,
        thread=config.gatk_thread,
        REF=config.REF_file_path,
        input_f=input_f,
        output_f=self.output().path,
        known_gold_vcf=config.known_gold_vcf,
    )
    cmdline = template.format(**params)
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Cluster non-chimeric sequences into OTUs with ``vsearch --cluster_size``.

    Clustering uses identity 0.97 and relabels centroids with the ``OTU``
    prefix. Centroids go to the first output target and the ``--uc`` file to
    the second. In dry-run mode every output target is additionally created
    with ``touch``.
    """
    outputs = self.output()
    rep_fa, cluster_uc = outputs[0].path, outputs[1].path
    nonchimera_fa = self.input().path
    valid_path([rep_fa, cluster_uc], check_ofile=1)

    cmd = f"{vsearch} --cluster_size {nonchimera_fa} --id 0.97 --sizein --sizeout --fasta_width 0 --uc {cluster_uc} --relabel OTU --centroids {rep_fa}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    for target in self.output():
        run_cmd("touch %s" % target.path, dry_run=False)
def run(self):
    """Align reads to the reference with ``bwa mem``.

    The sample ID from ``self.infodict`` is used for the read-group ``ID``
    and ``SM`` fields. With a single input only one read file is passed;
    otherwise the first two inputs are used. In dry-run mode the output file
    is created with ``touch`` before the command is handed to ``run_cmd``.
    """
    valid_path(self.output().path, check_ofile=1)
    sample_name = self.infodict.get("SampleID", '')

    input_file1 = self.input()[0].path
    # An empty second argument keeps the command valid for single-end data.
    input_file2 = self.input()[1].path if len(self.input()) != 1 else ''

    template = "bwa mem -M -t {bwa_thread} -k 19 -R '@RG\\tID:{SN}\\tSM:{SN}\\tPL:illumina\\tLB:lib1\\tPU:L001' {REF} {i1} {i2} > {ofile}"
    cmdline = template.format(
        bwa_thread=config.bwa_thread,
        SN=sample_name,
        REF=config.REF_file_path,
        i1=input_file1,
        i2=input_file2,
        ofile=self.output().path)

    if self.dry_run:
        run_cmd("touch %s" % self.output().path, dry_run=False)
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
def run(self):
    """Compute coverage info for each input BAM and summarize it.

    Outputs and inputs are paired positionally. For each pair ``bam2info``
    writes the coverage file and ``summarize_covinfo`` writes a summary
    whose path replaces ``cov.info`` with ``cov_summary.info``. In dry-run
    mode all outputs are created with ``touch`` and only a placeholder
    command per input is passed to ``run_cmd``.
    """
    if self.dry_run:
        for target in self.output():
            run_cmd("touch %s" % target.path, dry_run=False)
        for _, bam in zip(self.output(), self.input()):
            run_cmd("run bam2info for %s" % bam.path,
                    dry_run=self.dry_run,
                    log_file=self.get_log_path())
        return

    for cov, bam in zip(self.output(), self.input()):
        bam2info(bam_path=bam.path,
                 output_cov=cov.path,
                 bed_file=config.bed_file_path,
                 REF_file=config.REF_file_path)
        # Summarize the coverage info with a fixed format.
        # todo: change the fixed format? is it needed?
        summary_path = cov.path.replace('cov.info', 'cov_summary.info')
        summarize_covinfo(cov.path, output_f=summary_path)
def run(self):
    """Quality-filter joined reads with QIIME 2 ``quality-filter q-score-joined``.

    Every entry of ``config.qc_joined_params`` is rendered as a
    ``--p-<name>`` option with underscores replaced by hyphens: a value that
    is exactly ``True`` becomes a bare flag, ``None`` and ``False`` are
    skipped, and anything else is passed as the option value. The filter
    statistics are written next to the output with ``.qza`` replaced by
    ``-stats.qza``. In dry-run mode the output file is additionally created
    with ``touch``.
    """
    fragments = []
    for name, val in config.qc_joined_params.items():
        opt = name.replace('_', '-')
        if val is True:
            fragments.append(' --p-%s' % opt)
        elif not (val is None or val is False):
            fragments.append(' --p-%s %s ' % (opt, val))
    extra_str = ''.join(fragments)

    template = "{qiime2_p} quality-filter q-score-joined --i-demux {input_qza} --o-filtered-sequences {output_seq} --o-filter-stats {output_stats}"
    # The stats path is also used in `share_tasks.summaried_tasks`.
    cmd = template.format(
        qiime2_p=config.qiime2_p,
        input_qza=self.input().path,
        output_seq=self.output().path,
        output_stats=self.output().path.replace('.qza', '-stats.qza'))
    cmd += extra_str
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    run_cmd("touch %s" % self.output().path, dry_run=False)
def run(self):
    """Denoise paired reads with QIIME 2 ``dada2 denoise-paired``.

    Every entry of ``config.dada2_args`` is rendered as a ``--p-<name>``
    option with underscores replaced by hyphens: a value that is exactly
    ``True`` becomes a bare flag, ``None`` and ``False`` are skipped, and
    anything else is passed as the option value. The output targets are, in
    order, the feature table, the representative sequences and the
    denoising statistics. In dry-run mode every output target is
    additionally created with ``touch``.
    """
    valid_path(self.output()[0].path, check_ofile=1)

    fragments = []
    for name, val in config.dada2_args.items():
        opt = name.replace('_', '-')
        if val is True:
            fragments.append(' --p-%s' % opt)
        elif not (val is None or val is False):
            fragments.append(' --p-%s %s ' % (opt, val))
    extra_str = ''.join(fragments)

    template = """{qiime2_p} dada2 denoise-paired --i-demultiplexed-seqs {input_file} --o-representative-sequences {rep_seq} --o-table {profiling_tab} --o-denoising-stats {stats_file}"""
    cmd = template.format(
        qiime2_p=config.qiime2_p,
        input_file=self.input().path,
        profiling_tab=self.output()[0].path,
        rep_seq=self.output()[1].path,
        stats_file=self.output()[2].path)
    cmd += extra_str
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

    if not self.dry_run:
        return
    for target in self.output():
        run_cmd("touch %s" % target.path, dry_run=False)