def output(self):
    """Declare the dereplication outputs under ``<odir>/OTU_pipelines/derep``.

    Returns:
        list: two ``luigi.LocalTarget`` objects, in this order: the
        ``derep.fa`` file and its companion ``derep.uc`` file.
    """
    odir = join(str(self.odir), "OTU_pipelines", "derep")
    ofile = join(odir, "derep.fa")
    # Build the .uc path from the directory instead of using
    # ``ofile.replace('.fa', '.uc')``: str.replace rewrites *every*
    # occurrence, so an output directory whose name contains ".fa"
    # (e.g. "run.fasta/") would have been rewritten as well.
    o_uc = join(odir, "derep.uc")
    valid_path([ofile, o_uc], check_ofile=1)
    return [luigi.LocalTarget(ofile), luigi.LocalTarget(o_uc)]
def output(self):
    """Return one ``rep.tree`` target per entry of ``self.input()``.

    Each tree path lives in the directory of the entry's second target
    (index 1).
    """
    tree_paths = [
        join(dirname(targets[1].path), "rep.tree")
        for targets in self.input().values()
    ]
    valid_path(tree_paths, check_ofile=1)
    return [luigi.LocalTarget(path) for path in tree_paths]
def run(self):
    """Build and run the Trimmomatic command for this sample.

    Paired-end mode is used when the file at ``self.PE2`` exists, otherwise
    single-end mode. When ``self.dry_run`` is set, every output is touched
    afterwards so the output files exist.
    """
    valid_path(self.output()[0].path, check_ofile=1)  # auto make output dir
    sample_name = str(self.sampleid)
    # Reads come from the PE1/PE2 attributes, not from self.input().
    input1 = self.PE1
    if exists(self.PE2):
        targets = self.output()
        cmdline = "java -jar {trimmomatic_jar} PE -threads {thread} {input1} {input2} {ofile1} {outdir}/{PE1_id}.unpaired.fq.gz {ofile2} {outdir}/{PE2_id}.unpaired.fq.gz ILLUMINACLIP:{trimmomatic_dir}/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50".format(
            trimmomatic_jar=trimmomatic_jar,
            trimmomatic_dir=trimmomatic_dir,
            input1=input1,
            input2=self.PE2,
            PE1_id=sample_name + "_R1",
            PE2_id=sample_name + "_R2",
            ofile1=targets[0].path,
            ofile2=targets[1].path,
            outdir=dirname(targets[0].path),
            thread=default_params.trimmomatic_thread,
        )
    else:
        cmdline = f"java -jar {trimmomatic_jar} SE -threads {default_params.trimmomatic_thread} {input1} {self.output()[0].path} ILLUMINACLIP:{trimmomatic_dir}/adapters/TruSeq3-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36"

    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
    if not self.dry_run:
        return
    for target in self.output():
        run_cmd("touch %s" % target.path, dry_run=False)
def run(self):
    """Build and run the Trimmomatic command using paths from ``config``.

    Paired-end mode is used when ``self.PE2`` is truthy, otherwise
    single-end mode. When ``self.dry_run`` is set, every output is touched
    afterwards so the output files exist.
    """
    valid_path(self.output()[0].path, check_ofile=1)  # auto make output dir
    sample_name = self.infodict.get("SampleID", '')
    jar = config.trimmomatic_jar
    if self.PE2:
        targets = self.output()
        cmdline = "java -jar {trimmomatic_jar} PE -threads {thread} {input1} {input2} {ofile1} {base_out}/{PE1_id}.unpaired.fq.gz {ofile2} {base_out}/{PE2_id}.unpaired.fq.gz ILLUMINACLIP:{trimmomatic_dir}/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50".format(
            trimmomatic_jar=jar,
            trimmomatic_dir=dirname(jar),
            input1=self.PE1,
            input2=self.PE2,
            PE1_id=sample_name + "_R1",
            PE2_id=sample_name + "_R2",
            ofile1=targets[0].path,
            ofile2=targets[1].path,
            base_out=dirname(targets[0].path),
            thread=config.trimmomatic_thread,
        )
    else:
        # The SE template only uses the placeholders supplied below.
        cmdline = "java -jar {trimmomatic_jar} SE -threads {thread} {input1} {ofile} ILLUMINACLIP:{trimmomatic_dir}/adapters/TruSeq3-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36".format(
            trimmomatic_jar=jar,
            trimmomatic_dir=dirname(jar),
            input1=self.PE1,
            ofile=self.output()[0].path,
            thread=config.trimmomatic_thread,
        )

    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
    if not self.dry_run:
        return
    for target in self.output():
        run_cmd("touch %s" % target.path, dry_run=False)
def run(self):
    """Run ``convert2annovar.pl`` on the upstream file (``-format vcf4``).

    The script's stdout is redirected to this task's output path. In dry-run
    mode the output is touched afterwards so the file exists.
    """
    out_path = self.output().path
    valid_path(out_path, check_ofile=1)
    cmdline = "%s/convert2annovar.pl %s --includeinfo -format vcf4 > %s" % (
        config.annovar_pro,
        self.input().path,
        out_path,
    )
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
    if self.dry_run:
        run_cmd("touch %s" % out_path, dry_run=False)
def output(self):
    """Return the per-sample joined-reads target.

    The path is
    ``<odir>/OTU_pipelines/preprocessed/joined_reads/<sampleid>.fq.gz``.
    """
    base_dir = str(self.odir)
    file_name = "{}.fq.gz".format(str(self.sampleid))
    ofile_name = join(base_dir, "OTU_pipelines", "preprocessed", "joined_reads", file_name)
    valid_path(ofile_name, check_ofile=1)
    return luigi.LocalTarget(ofile_name)
def run(self):
    """Run ``vsearch --fastx_filter`` on the upstream file.

    The command passes ``--fastq_maxee 1.0`` and ``--fastq_trunclen 240``
    and writes FASTA to this task's output path. In dry-run mode the output
    is touched afterwards so the file exists.
    """
    # Command template:
    #   vsearch --fastx_filter {input} --fastq_maxee 1.0 --fastq_trunclen 240 --fastaout {output}
    merged_fa = self.input().path
    filtered_fa = self.output().path
    valid_path(filtered_fa, check_ofile=1)
    cmd = f"{vsearch} --fastx_filter {merged_fa} --fastq_maxee 1.0 --fastq_trunclen 240 --fastaout {filtered_fa}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())
    if not self.dry_run:
        return
    run_cmd("touch %s" % filtered_fa, dry_run=False)
def run(self):
    """Run ``vsearch --sortbysize`` (``--minsize 1``) on the first input.

    The result is written to this task's output path. In dry-run mode the
    output is touched afterwards so the file exists.
    """
    upstream = self.input()
    derep_fa = upstream[0].path
    sorted_d2_fa = self.output().path
    valid_path([sorted_d2_fa], check_ofile=1)
    cmd = f"{vsearch} --sortbysize {derep_fa} --output {sorted_d2_fa} --minsize 1"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())
    if not self.dry_run:
        return
    run_cmd("touch %s" % sorted_d2_fa, dry_run=False)
def run(self):
    """Run ``vsearch --uchime_ref`` against the ``rdp_gold`` database.

    Reads the upstream file and writes the ``--nonchimeras`` result to this
    task's output path. In dry-run mode the output is touched afterwards so
    the file exists.
    """
    # Command template:
    #   vsearch --uchime_ref <denovo nonchimeras> --db <rdp_gold.fa>
    #       --sizein --sizeout --fasta_width 0 --nonchimeras <ref nonchimeras>
    denovo_nonchimer_fa = self.input().path
    ref_nonchimer_fa = self.output().path
    valid_path(ref_nonchimer_fa, check_ofile=1)
    cmd = f"{vsearch} --uchime_ref {denovo_nonchimer_fa} --db {rdp_gold} --sizein --sizeout --fasta_width 0 --nonchimeras {ref_nonchimer_fa}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())
    if not self.dry_run:
        return
    run_cmd("touch %s" % ref_nonchimer_fa, dry_run=False)
def run(self):
    """Run ``vsearch --uchime_deno`` on the first upstream target.

    The ``--nonchimeras`` result is written to this task's output path. In
    dry-run mode the output is touched afterwards so the file exists.
    """
    # Command template:
    #   vsearch --uchime_deno <preclustered fasta>
    #       --sizein --sizeout --fasta_width 0 --nonchimeras <denovo nonchimeras>
    upstream = self.input()
    precluster_fa = upstream[0].path
    denovo_nonchimer_fa = self.output().path
    valid_path([denovo_nonchimer_fa], check_ofile=1)
    cmd = f"{vsearch} --uchime_deno {precluster_fa} --sizein --sizeout --fasta_width 0 --nonchimeras {denovo_nonchimer_fa}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())
    if not self.dry_run:
        return
    run_cmd("touch %s" % denovo_nonchimer_fa, dry_run=False)
def run(self):
    """Run the ``map_pl`` Perl script and redirect its stdout to the output.

    The script receives, in order, the two ``"derep"`` inputs (indices 0
    and 1) and the ``"map1"`` input. In dry-run mode the output is touched
    afterwards so the file exists.
    """
    # Command template:
    #   perl map.pl <derep.fa> <derep.uc> <nonchimeras.derep.fasta> > <nonchimeras.fasta>
    upstream = self.input()
    derep_targets = upstream["derep"]
    derep_fa = derep_targets[0].path
    derep_uc = derep_targets[1].path
    nonchimera_derep_fa = upstream["map1"].path
    nonchimera_fa = self.output().path
    valid_path(nonchimera_fa, check_ofile=1)
    cmd = f"perl {map_pl} {derep_fa} {derep_uc} {nonchimera_derep_fa} > {nonchimera_fa}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())
    if not self.dry_run:
        return
    run_cmd("touch %s" % nonchimera_fa, dry_run=False)
def run(self):
    """Run ``MarkDuplicates`` through the jar at ``config.pircard_jar``.

    When ``config.PCR_ON`` is truthy the tool is not invoked; the command is
    just a ``touch`` of the output path. In dry-run mode the output is
    touched afterwards so the file exists.
    """
    out_path = self.output().path
    valid_path(out_path, check_ofile=1)
    if not config.PCR_ON:
        cmdline = "java {java_option} -jar {pircard_jar} MarkDuplicates INPUT={input} OUTPUT={output} METRICS_FILE={odir}/dedup_metrics.txt CREATE_INDEX=true REMOVE_DUPLICATES=true AS=true".format(
            pircard_jar=config.pircard_jar,
            java_option=config.java_option,
            input=self.input().path,
            output=out_path,
            odir=os.path.dirname(out_path),
        )
    else:
        cmdline = "touch %s" % out_path
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
    if self.dry_run:
        run_cmd("touch %s" % out_path, dry_run=False)
def run(self):
    """Run ``vsearch --usearch_global`` (``--id 0.97``, plus strand).

    The ``'filter'`` input is searched against the first ``'cluster'``
    input. Output 0 receives the ``--otutabout`` table and output 1 the
    ``--uc`` file. In dry-run mode every output is touched afterwards so the
    files exist.
    """
    # Command template:
    #   vsearch --usearch_global <filtered.fa> --db <otus.fasta> --strand plus
    #       --id 0.97 --uc <map.txt> --otutabout <otu_raw.tab>
    upstream = self.input()
    filtered_fa = upstream['filter'].path
    rep_fa = upstream['cluster'][0].path
    targets = self.output()
    map_output = targets[1].path
    raw_otutab = targets[0].path
    valid_path([map_output, raw_otutab], check_ofile=1)
    cmd = f"{vsearch} --usearch_global {filtered_fa} --db {rep_fa} --strand plus --id 0.97 --uc {map_output} --otutabout {raw_otutab}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())
    if not self.dry_run:
        return
    for target in self.output():
        run_cmd("touch %s" % target.path, dry_run=False)
def run(self):
    """Run ``MarkDuplicates`` through the launcher at ``config.gatk_pro``.

    When ``config.PCR_ON`` is truthy the tool is not invoked; the command is
    just a ``touch`` of the output path. In dry-run mode the output is
    touched afterwards so the file exists.
    """
    out_path = self.output().path
    valid_path(out_path, check_ofile=1)
    if not config.PCR_ON:
        cmdline = "{gatk4} MarkDuplicates --java-options '{java_option}' --INPUT {input_f} --OUTPUT {output_f} --METRICS_FILE {odir}/dedup_metrics.txt --CREATE_INDEX true --REMOVE_DUPLICATES true -AS true".format(
            gatk4=config.gatk_pro,
            java_option=config.java_option,
            input_f=self.input().path,
            output_f=out_path,
            odir=dirname(out_path),
        )
    else:
        cmdline = "touch %s" % out_path
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
    if self.dry_run:
        run_cmd("touch %s" % out_path, dry_run=False)
def run(self):
    """Screen the cleaned reads with ``fq_screen`` and rewrite read headers.

    Steps:
      1. Run ``fq_screen`` (``--nohits --aligner bowtie2``) on the first
         input, plus the second input when the file at ``self.PE2`` exists.
         Results go to a ``_screened_cache`` directory two levels above the
         first output file.
      2. For each ``*.tagged_filter.fastq.gz`` result, write a FASTQ file in
         which every read's id is its description up to the last ``#FQST``
         marker, then compress it with ``gzip -f`` to produce the output.

    In dry-run mode step 2 is skipped and the outputs are only touched.
    """
    fq_screen = soft_db_path.fq_screen
    upstream = self.input()
    clean_files = [upstream[0].path]
    if exists(self.PE2):
        clean_files.append(upstream[1].path)
    infiles = ' '.join(clean_files)

    outdir = join(dirname(dirname(self.output()[0].path)), "_screened_cache")
    valid_path(outdir, check_odir=1)
    threads = default_params.fq_screen_thread
    cmdline = f"{fq_screen} {infiles} --outdir {outdir} --nohits --aligner bowtie2 --threads {threads}"
    run_cmd(
        cmdline,
        log_file=self.get_log_path(),
        dry_run=self.dry_run,
    )

    if self.dry_run:
        # NOTE(review): assumes run_cmd(dry_run=True) does not execute the
        # command, so the screened files do not exist and must not be opened
        # here. Only create placeholder outputs.
        for _o in self.output():
            run_cmd("touch %s" % _o.path, dry_run=False)
        return

    # Rename: restore the read headers and write the final outputs.
    for clean_f, output_target in zip(clean_files, self.output()):
        name = basename(clean_f).replace('.fq.gz', '')
        filtered_f = join(outdir, "{}.tagged_filter.fastq.gz".format(name))

        # Strip only the trailing ".gz"; str.replace would also rewrite any
        # ".gz" appearing earlier in the path (e.g. in a directory name).
        target_path = output_target.path
        if target_path.endswith('.gz'):
            opath = target_path[:-len('.gz')]
        else:
            opath = target_path

        # Both handles are context-managed so the gzip reader is closed too.
        with gzip.open(filtered_f, 'rt') as tagged, open(opath, 'w') as new_file:
            for read in SeqIO.parse(tagged, format='fastq'):
                _cache = str(read.description)
                read.id = read.description = read.name = ''
                # Keep everything before the last '#FQST' marker
                # (presumably the tag appended by the screening tool).
                read.id = _cache.rpartition('#FQST')[0]
                SeqIO.write(read, new_file, format='fastq')
        run_cmd("gzip -f %s" % opath,
                dry_run=self.dry_run,
                log_file=self.get_log_path())
def run(self):
    """Run ``RealignerTargetCreator`` from the jar at ``config.gatkv36_path``.

    The input is ``self.input()[1]`` when ``config.PCR_ON`` is truthy and
    ``self.input()[0]`` otherwise. In dry-run mode the output is touched
    afterwards so the file exists.
    """
    out_path = self.output().path
    valid_path(out_path, check_ofile=1)
    chosen = self.input()[1] if config.PCR_ON else self.input()[0]
    cmdline = "java -jar {gatk} -T RealignerTargetCreator -nt {thread} -R {REF} -I {input_f} --known {known_gold_vcf} -o {output_f}".format(
        gatk=config.gatkv36_path,
        thread=config.gatk_thread,
        REF=config.REF_file_path,
        input_f=chosen.path,
        output_f=out_path,
        known_gold_vcf=config.known_gold_vcf,
    )
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
    if self.dry_run:
        run_cmd("touch %s" % out_path, dry_run=False)
def run(self):
    """Run ``vsearch --cluster_size`` (``--id 0.97``, ``--relabel OTU``).

    Output 0 receives the ``--centroids`` file and output 1 the ``--uc``
    file. In dry-run mode every output is touched afterwards so the files
    exist.
    """
    # Command template:
    #   vsearch --cluster_size <nonchimeras.fasta> --id 0.97
    #       --sizein --sizeout --fasta_width 0
    #       --uc <clustered.uc> --relabel OTU --centroids <otus.fasta>
    nonchimera_fa = self.input().path
    targets = self.output()
    rep_fa = targets[0].path
    cluster_uc = targets[1].path
    valid_path([rep_fa, cluster_uc], check_ofile=1)
    cmd = f"{vsearch} --cluster_size {nonchimera_fa} --id 0.97 --sizein --sizeout --fasta_width 0 --uc {cluster_uc} --relabel OTU --centroids {rep_fa}"
    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())
    if not self.dry_run:
        return
    for target in self.output():
        run_cmd("touch %s" % target.path, dry_run=False)
def output(self):
    """Return the screened-read targets for this sample.

    Files live in ``<odir>/OTU_pipelines/preprocessed/screened``. The R1
    target is always returned; the R2 target is added only when the file at
    ``self.PE2`` exists.
    """
    odir = join(str(self.odir), "OTU_pipelines", "preprocessed", "screened")
    ofile_name1 = join(str(odir), "{}_R1.fq.gz".format(self.sampleid))
    ofile_name2 = join(str(odir), "{}_R2.fq.gz".format(self.sampleid))
    valid_path(ofile_name1, check_ofile=1)

    targets = [luigi.LocalTarget(ofile_name1)]
    if exists(self.PE2):
        targets.append(luigi.LocalTarget(ofile_name2))
    return targets
def main(tab, indir, outdir):
    """Filter every sample's annotation CSV and write a project summary.

    Args:
        tab: passed to ``fileparser``; the result's ``.df`` is iterated row
            by row, with the row index used as the sample id.
        indir: directory holding ``<sample>.merged.anno.hg19_multianno.csv``.
        outdir: directory for the per-sample outputs and the summary CSV.

    The summary is named after the ``project_name`` of the last row
    processed (an empty prefix when there are no rows).
    """
    valid_path(outdir, check_odir=1)
    parsed = fileparser(tab)
    total_stats = {}
    project_name = ''
    for sample_id, row in parsed.df.iterrows():
        project_name = row["project_name"]
        annotated_csv = join(indir, sample_id + '.merged.anno.hg19_multianno.csv')
        total_stats.update(
            exec_filter(annotated_csv, sample_id, output_prefix=join(outdir, sample_id))
        )

    # Build with the stats keys as rows, then transpose.
    summary_df = pd.DataFrame.from_dict(total_stats, orient="index").T
    summary_df.index.name = 'variants'
    summary_path = join(outdir, project_name + '.summary.csv')
    summary_df.to_csv(summary_path, sep=',', index=1)
def run(self):
    """Run ``bwa mem`` and redirect its stdout to this task's output.

    The second read file is passed as an empty string when ``self.input()``
    has exactly one element. The read-group header uses the ``SampleID``
    entry of ``self.infodict`` for both ID and SM. In dry-run mode the
    output is touched before the command is handed to ``run_cmd``.
    """
    out_path = self.output().path
    valid_path(out_path, check_ofile=1)
    sample_name = self.infodict.get("SampleID", '')
    upstream = self.input()
    first_reads = upstream[0].path
    second_reads = '' if len(upstream) == 1 else upstream[1].path
    cmdline = "bwa mem -M -t {bwa_thread} -k 19 -R '@RG\\tID:{SN}\\tSM:{SN}\\tPL:illumina\\tLB:lib1\\tPU:L001' {REF} {i1} {i2} > {ofile}".format(
        bwa_thread=config.bwa_thread,
        SN=sample_name,
        REF=config.REF_file_path,
        i1=first_reads,
        i2=second_reads,
        ofile=out_path,
    )
    if self.dry_run:
        run_cmd("touch %s" % out_path, dry_run=False)
    run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
def output(self):
    """Return the cleaned-read targets for this sample.

    Files live in ``<odir>/<prefix>_pipelines/preprocessed/after_QC``. The
    R1 target is always returned; the R2 target is added only when the file
    at ``self.PE2`` exists.
    """
    odir = join(str(self.odir), "%s_pipelines" % self.prefix, "preprocessed", "after_QC")
    sample = str(self.sampleid)
    ofile_name1 = join(str(odir), "{}_R1.clean.fq.gz".format(sample))
    ofile_name2 = join(str(odir), "{}_R2.clean.fq.gz".format(sample))
    valid_path(ofile_name1, check_ofile=1)

    targets = [luigi.LocalTarget(ofile_name1)]
    if exists(self.PE2):
        targets.append(luigi.LocalTarget(ofile_name2))
    return targets
def run(self):
    """Run ``dada2 denoise-paired`` through ``config.qiime2_p``.

    Outputs are used as: 0 = ``--o-table``, 1 =
    ``--o-representative-sequences``, 2 = ``--o-denoising-stats``.
    Each entry of ``config.dada2_args`` becomes a ``--p-<name>`` option with
    underscores turned into dashes: ``True`` yields a bare flag, ``None``
    and ``False`` are skipped, anything else is appended as the value.
    In dry-run mode every output is touched afterwards so the files exist.
    """
    valid_path(self.output()[0].path, check_ofile=1)

    option_parts = []
    for name, val in config.dada2_args.items():
        opt = name.replace('_', '-')
        if val is True:
            option_parts.append(' --p-%s' % opt)
        elif val is not None and val is not False:
            option_parts.append(' --p-%s %s ' % (opt, val))

    targets = self.output()
    cmd = """{qiime2_p} dada2 denoise-paired --i-demultiplexed-seqs {input_file} --o-representative-sequences {rep_seq} --o-table {profiling_tab} --o-denoising-stats {stats_file}""".format(
        qiime2_p=config.qiime2_p,
        input_file=self.input().path,
        profiling_tab=targets[0].path,
        rep_seq=targets[1].path,
        stats_file=targets[2].path,
    )
    cmd += ''.join(option_parts)

    run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())
    if not self.dry_run:
        return
    for target in self.output():
        run_cmd("touch %s" % target.path, dry_run=False)
def run(self):
    """Run ``table_annovar.pl`` once for each (output, input) pair.

    ``self.input()`` may be a mapping (its values are used), a list or
    tuple, or a single target; it is normalised to a list and zipped with
    ``self.output()``. For each pair the ``--outfile`` prefix is the input
    path with ``'.av'`` removed plus ``'.anno'``. In dry-run mode each
    output is touched after its command so the file exists.
    """
    upstream = self.input()
    # isinstance instead of ``type(...) ==`` so dict subclasses (e.g.
    # OrderedDict) and tuples are unpacked too; previously they were wrapped
    # as one pseudo-target and failed on ``.path``.
    if isinstance(upstream, dict):
        input_list = list(upstream.values())
    elif isinstance(upstream, (list, tuple)):
        input_list = list(upstream)
    else:
        input_list = [upstream]

    for _output, _input in zip(self.output(), input_list):
        valid_path(_output.path, check_ofile=1)
        prefix = _input.path.replace('.av', '')
        cmdline = "{annovar_dir}/table_annovar.pl {input_f} {annovar_db} -buildver {genome_version} -protocol {db_names} -operation g,r,r,f,f,f,f,f,f -nastring . --remove --otherinfo --csvout --thread {annovar_thread} --outfile {output_f} --argument '-exonicsplicing -splicing 25',,,,,,,,".format(
            annovar_dir=config.annovar_pro,
            input_f=_input.path,
            annovar_db=config.annovar_db,
            genome_version=config.genome_version,
            db_names=config.db_names,
            annovar_thread=config.annovar_thread,
            output_f=prefix + '.anno',
        )
        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
        if self.dry_run:
            run_cmd("touch %s" % _output.path, dry_run=False)
def output(self):
    """Return the target ``<odir>/OTU_pipelines/dechimera/all.nonchimeras.fasta``."""
    target_path = join(
        str(self.odir), "OTU_pipelines", "dechimera", "all.nonchimeras.fasta"
    )
    valid_path(target_path, check_ofile=1)
    return luigi.LocalTarget(target_path)
def output(self):
    """Return the target ``<odir>/OTU_pipelines/preprocessed/merged.fastq``."""
    target_path = join(
        str(self.odir), "OTU_pipelines", "preprocessed", "merged.fastq"
    )
    valid_path(target_path, check_ofile=1)
    return luigi.LocalTarget(target_path)
def output(self):
    """Return the target ``<odir>/q2_pipelines/joined_seq.qza``."""
    target_path = join(str(self.odir), "q2_pipelines", "joined_seq.qza")
    valid_path(target_path, check_ofile=1)
    return luigi.LocalTarget(target_path)
def output(self):
    """Return the target ``<odir>/OTU_pipelines/derep/sorted_d2_fa.fa``."""
    target_path = join(
        str(self.odir), "OTU_pipelines", "derep", "sorted_d2_fa.fa"
    )
    valid_path(target_path, check_ofile=1)
    return luigi.LocalTarget(target_path)