Пример #1
0
    def output(self):
        """Declare the dereplication targets: a FASTA file and its ``.uc`` companion.

        Both paths live under ``<odir>/OTU_pipelines/derep``. They are handed
        to the project helper ``valid_path`` (with ``check_ofile=1``) before
        being wrapped as local targets.

        Returns:
            list: ``[LocalTarget(derep.fa), LocalTarget(derep.uc)]``.
        """
        derep_dir = join(str(self.odir), "OTU_pipelines", 'derep')
        fasta_path = join(derep_dir, 'derep.fa')
        # The companion path is derived by textual substitution on the whole
        # FASTA path string.
        uc_path = fasta_path.replace('.fa', '.uc')

        valid_path([fasta_path, uc_path], check_ofile=1)
        return [luigi.LocalTarget(path) for path in (fasta_path, uc_path)]
Пример #2
0
 def output(self):
     """Return one ``rep.tree`` target per entry of the task input.

     Each tree is placed in the directory that holds the second element
     (index 1) of the corresponding input value.
     """
     tree_paths = [
         join(dirname(targets[1].path), "rep.tree")
         for targets in self.input().values()
     ]
     valid_path(tree_paths, check_ofile=1)
     return [luigi.LocalTarget(tree_path) for tree_path in tree_paths]
Пример #3
0
    def run(self):
        """Trim reads with Trimmomatic in single-end or paired-end mode.

        Paired-end mode is used when the ``PE2`` path exists on disk;
        otherwise only ``PE1`` is processed in single-end mode. In dry-run
        mode every declared output is touched after the command has been
        handed to ``run_cmd``, so the outputs exist afterwards.
        """
        # Resolve the targets once instead of re-evaluating output() for
        # every path that is needed below.
        outputs = self.output()
        ofile1 = outputs[0].path
        valid_path(ofile1, check_ofile=1)  # auto make output dir

        input1 = self.PE1
        input2 = self.PE2

        if not exists(input2):
            cmdline = f"java -jar {trimmomatic_jar} SE -threads {default_params.trimmomatic_thread} {input1} {ofile1} ILLUMINACLIP:{trimmomatic_dir}/adapters/TruSeq3-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36"
        else:
            # The sample id is only needed to name the unpaired-read files.
            sample_name = str(self.sampleid)
            cmdline = "java -jar {trimmomatic_jar} PE -threads {thread} {input1} {input2} {ofile1} {outdir}/{PE1_id}.unpaired.fq.gz {ofile2} {outdir}/{PE2_id}.unpaired.fq.gz ILLUMINACLIP:{trimmomatic_dir}/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50".format(
                trimmomatic_jar=trimmomatic_jar,
                trimmomatic_dir=trimmomatic_dir,
                input1=input1,
                input2=input2,
                PE1_id=sample_name + "_R1",
                PE2_id=sample_name + "_R2",
                ofile1=ofile1,
                ofile2=outputs[1].path,
                outdir=dirname(ofile1),
                thread=default_params.trimmomatic_thread)

        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
        if self.dry_run:
            for _o in outputs:
                run_cmd("touch %s" % _o.path, dry_run=False)
Пример #4
0
    def run(self):
        """Trim reads with Trimmomatic, paired-end if ``PE2`` is set, else single-end.

        The jar location and thread count come from ``config``; the adapter
        directory is taken to be the directory containing the jar. In dry-run
        mode every declared output is touched after the command has been
        handed to ``run_cmd``.
        """
        # Resolve the targets once instead of re-evaluating output() for
        # every path that is needed below.
        outputs = self.output()
        ofile1 = outputs[0].path
        valid_path(ofile1, check_ofile=1)  # auto make output dir

        sample_name = self.infodict.get("SampleID", '')
        input1 = self.PE1
        input2 = self.PE2
        trimmomatic_jar = config.trimmomatic_jar
        trimmomatic_dir = dirname(trimmomatic_jar)

        if input2:
            cmdline = "java -jar {trimmomatic_jar} PE -threads {thread} {input1} {input2} {ofile1} {base_out}/{PE1_id}.unpaired.fq.gz {ofile2} {base_out}/{PE2_id}.unpaired.fq.gz ILLUMINACLIP:{trimmomatic_dir}/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50".format(
                trimmomatic_jar=trimmomatic_jar,
                trimmomatic_dir=trimmomatic_dir,
                input1=input1,
                input2=input2,
                PE1_id=sample_name + "_R1",
                PE2_id=sample_name + "_R2",
                ofile1=ofile1,
                ofile2=outputs[1].path,
                base_out=dirname(ofile1),
                thread=config.trimmomatic_thread)
        else:
            # Only the fields that appear in the SE template are supplied.
            cmdline = "java -jar {trimmomatic_jar} SE -threads {thread} {input1} {ofile} ILLUMINACLIP:{trimmomatic_dir}/adapters/TruSeq3-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36".format(
                trimmomatic_jar=trimmomatic_jar,
                trimmomatic_dir=trimmomatic_dir,
                input1=input1,
                ofile=ofile1,
                thread=config.trimmomatic_thread)

        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
        if self.dry_run:
            for _o in outputs:
                run_cmd("touch %s" % _o.path, dry_run=False)
Пример #5
0
 def run(self):
     """Run ANNOVAR's ``convert2annovar.pl`` (``-format vcf4``) on the input.

     The script's stdout is redirected to the task's output path. In dry-run
     mode the output file is touched afterwards so that it exists.
     """
     valid_path(self.output().path, check_ofile=1)
     cmdline = (
         f"{config.annovar_pro}/convert2annovar.pl {self.input().path}"
         f" --includeinfo -format vcf4 > {self.output().path}"
     )
     run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
     if not self.dry_run:
         return
     run_cmd("touch %s" % self.output().path, dry_run=False)
Пример #6
0
    def output(self):
        """Return the target for this sample's joined-reads FASTQ.

        The path is
        ``<odir>/OTU_pipelines/preprocessed/joined_reads/<sampleid>.fq.gz``.
        """
        base_dir = str(self.odir)
        file_name = "{}.fq.gz".format(str(self.sampleid))
        target_path = join(base_dir, "OTU_pipelines", "preprocessed",
                           "joined_reads", file_name)
        valid_path(target_path, check_ofile=1)
        return luigi.LocalTarget(target_path)
Пример #7
0
 def run(self):
     """Filter the input reads with ``vsearch --fastx_filter``.

     The command uses ``--fastq_maxee 1.0`` and ``--fastq_trunclen 240`` and
     writes FASTA to the task's output path. In dry-run mode the output file
     is touched afterwards so that it exists.
     """
     src_path = self.input().path
     dst_path = self.output().path
     valid_path(dst_path, check_ofile=1)
     cmd = "{} --fastx_filter {} --fastq_maxee 1.0 --fastq_trunclen 240 --fastaout {}".format(
         vsearch, src_path, dst_path)
     run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())
     if not self.dry_run:
         return
     run_cmd("touch %s" % self.output().path, dry_run=False)
Пример #8
0
    def run(self):
        """Sort the first input FASTA with ``vsearch --sortbysize --minsize 1``.

        The sorted sequences are written to the task's output path. In
        dry-run mode the output file is touched afterwards so that it exists.
        """
        src_path = self.input()[0].path
        dst_path = self.output().path
        valid_path([dst_path], check_ofile=1)
        cmd = "{} --sortbysize  {} --output {} --minsize 1".format(
            vsearch, src_path, dst_path)
        run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

        if not self.dry_run:
            return
        run_cmd("touch %s" % self.output().path, dry_run=False)
Пример #9
0
    def run(self):
        """Run ``vsearch --uchime_ref`` on the input against the ``rdp_gold`` database.

        Sequences reported via ``--nonchimeras`` are written to the task's
        output path. In dry-run mode the output file is touched afterwards so
        that it exists.
        """
        src_path = self.input().path
        dst_path = self.output().path
        valid_path(dst_path, check_ofile=1)
        cmd = "{} --uchime_ref {} --db {} --sizein --sizeout --fasta_width 0 --nonchimeras {}".format(
            vsearch, src_path, rdp_gold, dst_path)
        run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

        if not self.dry_run:
            return
        run_cmd("touch %s" % self.output().path, dry_run=False)
Пример #10
0
    def run(self):
        """Run ``vsearch --uchime_deno`` on the first input file.

        Sequences reported via ``--nonchimeras`` are written to the task's
        output path. In dry-run mode the output file is touched afterwards so
        that it exists.
        """
        src_path = self.input()[0].path
        dst_path = self.output().path
        valid_path([dst_path], check_ofile=1)
        cmd = "{} --uchime_deno {} --sizein --sizeout --fasta_width 0 --nonchimeras {}".format(
            vsearch, src_path, dst_path)
        run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

        if not self.dry_run:
            return
        run_cmd("touch %s" % self.output().path, dry_run=False)
Пример #11
0
    def run(self):
        """Run the ``map_pl`` Perl script and redirect its stdout to the output path.

        The script receives, in order, the two ``"derep"`` input paths
        (index 0 and 1) and the ``"map1"`` input path. In dry-run mode the
        output file is touched afterwards so that it exists.
        """
        fasta_in = self.input()["derep"][0].path
        uc_in = self.input()["derep"][1].path
        map1_in = self.input()["map1"].path
        result_path = self.output().path
        valid_path(result_path, check_ofile=1)
        cmd = "perl {} {} {} {} > {}".format(
            map_pl, fasta_in, uc_in, map1_in, result_path)
        run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

        if not self.dry_run:
            return
        run_cmd("touch %s" % self.output().path, dry_run=False)
Пример #12
0
 def run(self):
     """Run ``MarkDuplicates`` from the configured jar, or just touch the output.

     When ``config.PCR_ON`` is truthy the command is a plain ``touch`` of
     the output path; otherwise the input is processed with
     ``REMOVE_DUPLICATES=true`` and metrics go to ``dedup_metrics.txt`` in
     the output's directory. In dry-run mode the output file is touched
     afterwards so that it exists.
     """
     valid_path(self.output().path, check_ofile=1)
     if not config.PCR_ON:
         jar_path = config.pircard_jar
         java_opts = config.java_option
         src_path = self.input().path
         dst_path = self.output().path
         metrics_dir = os.path.dirname(self.output().path)
         cmdline = f"java {java_opts} -jar {jar_path} MarkDuplicates INPUT={src_path} OUTPUT={dst_path} METRICS_FILE={metrics_dir}/dedup_metrics.txt CREATE_INDEX=true REMOVE_DUPLICATES=true AS=true"
     else:
         cmdline = "touch %s" % self.output().path
     run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
     if not self.dry_run:
         return
     run_cmd("touch %s" % self.output().path, dry_run=False)
Пример #13
0
    def run(self):
        """Run ``vsearch --usearch_global`` (``--id 0.97``, plus strand).

        The ``'filter'`` input is searched against the first ``'cluster'``
        input used as ``--db``. The ``--otutabout`` table goes to output 0
        and the ``--uc`` file to output 1. In dry-run mode every output is
        touched afterwards so that it exists.
        """
        query_fa = self.input()['filter'].path
        db_fa = self.input()['cluster'][0].path

        uc_path = self.output()[1].path
        otutab_path = self.output()[0].path
        valid_path([uc_path, otutab_path], check_ofile=1)
        cmd = "{} --usearch_global {} --db {} --strand plus --id 0.97 --uc {} --otutabout {}".format(
            vsearch, query_fa, db_fa, uc_path, otutab_path)

        run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

        if not self.dry_run:
            return
        for target in self.output():
            run_cmd("touch %s" % target.path, dry_run=False)
Пример #14
0
    def run(self):
        """Run GATK4 ``MarkDuplicates``, or just touch the output.

        When ``config.PCR_ON`` is truthy the command is a plain ``touch`` of
        the output path; otherwise the input is processed with
        ``--REMOVE_DUPLICATES true`` and metrics go to ``dedup_metrics.txt``
        in the output's directory. In dry-run mode the output file is
        touched afterwards so that it exists.
        """
        valid_path(self.output().path, check_ofile=1)
        if not config.PCR_ON:
            gatk4 = config.gatk_pro
            java_opts = config.java_option
            src_path = self.input().path
            dst_path = self.output().path
            metrics_dir = dirname(self.output().path)
            cmdline = f"{gatk4} MarkDuplicates --java-options '{java_opts}' --INPUT {src_path} --OUTPUT {dst_path} --METRICS_FILE {metrics_dir}/dedup_metrics.txt --CREATE_INDEX true --REMOVE_DUPLICATES true -AS true"
        else:
            cmdline = "touch %s" % self.output().path

        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
        if not self.dry_run:
            return
        run_cmd("touch %s" % self.output().path, dry_run=False)
Пример #15
0
    def run(self):
        """Screen the cleaned reads with ``fq_screen`` and rewrite their headers.

        Steps:
          1. Run ``fq_screen --nohits`` on the first input (and the second
             one when the ``PE2`` path exists), writing into a
             ``_screened_cache`` directory two levels above the first output.
          2. For each screened ``*.tagged_filter.fastq.gz`` file, rewrite
             every record so its id is the part of the original description
             before the last ``#FQST`` marker, write it uncompressed next to
             the final output path and compress it with ``gzip -f``.

        In dry-run mode step 2 is skipped and the declared outputs are simply
        touched: assuming ``run_cmd(..., dry_run=True)`` does not execute the
        command (TODO confirm), the screened files would not exist and
        opening them would fail.
        """
        fq_screen = soft_db_path.fq_screen
        inputs = self.input()
        outputs = self.output()

        # One entry for single-end data, two when the mate file exists.
        clean_reads = [inputs[0].path]
        if exists(self.PE2):
            clean_reads.append(inputs[1].path)
        infiles = ' '.join(clean_reads)

        outdir = join(dirname(dirname(outputs[0].path)), "_screened_cache")
        valid_path(outdir, check_odir=1)
        threads = default_params.fq_screen_thread
        cmdline = f"{fq_screen} {infiles} --outdir {outdir} --nohits --aligner bowtie2 --threads {threads}"
        run_cmd(
            cmdline,
            log_file=self.get_log_path(),
            dry_run=self.dry_run,
        )

        if self.dry_run:
            # Nothing was screened, so only create placeholder outputs.
            for _o in outputs:
                run_cmd("touch %s" % _o.path, dry_run=False)
            return

        for clean_path, output_target in zip(clean_reads, outputs):
            name = basename(clean_path).replace('.fq.gz', '')
            filtered_f = join(outdir,
                              "{}.tagged_filter.fastq.gz".format(name))
            opath = output_target.path.replace('.gz', '')
            # Open the source first so a missing screened file does not leave
            # an empty output behind; both handles are closed on exit.
            with gzip.open(filtered_f, 'rt') as handle, \
                    open(opath, 'w') as new_file:
                for read in SeqIO.parse(handle, format='fastq'):
                    # Keep only the header text before the last '#FQST'
                    # marker (presumably the tag appended by fq_screen).
                    header = str(read.description).rpartition('#FQST')[0]
                    read.description = read.name = ''
                    read.id = header
                    SeqIO.write(read, new_file, format='fastq')
            run_cmd("gzip -f %s" % opath,
                    dry_run=self.dry_run,
                    log_file=self.get_log_path())
Пример #16
0
    def run(self):
        """Run GATK ``RealignerTargetCreator`` on one of the task inputs.

        Input index 1 is used when ``config.PCR_ON`` is truthy, index 0
        otherwise. In dry-run mode the output file is touched afterwards so
        that it exists.
        """
        valid_path(self.output().path, check_ofile=1)
        input_index = 1 if config.PCR_ON else 0
        source_path = self.input()[input_index].path

        template = "java -jar {gatk} -T RealignerTargetCreator -nt {thread} -R {REF} -I {input_f} --known {known_gold_vcf} -o {output_f}"
        cmdline = template.format(
            gatk=config.gatkv36_path,
            thread=config.gatk_thread,
            REF=config.REF_file_path,
            input_f=source_path,
            output_f=self.output().path,
            known_gold_vcf=config.known_gold_vcf,
        )

        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
        if not self.dry_run:
            return
        run_cmd("touch %s" % self.output().path, dry_run=False)
Пример #17
0
    def run(self):
        """Cluster the input with ``vsearch --cluster_size --id 0.97``.

        Centroids (relabelled with the ``OTU`` prefix) go to output 0 and the
        ``--uc`` file to output 1. In dry-run mode every output is touched
        afterwards so that it exists.
        """
        src_path = self.input().path
        centroids_path = self.output()[0].path
        uc_path = self.output()[1].path
        valid_path([centroids_path, uc_path], check_ofile=1)
        cmd = "{} --cluster_size {} --id 0.97 --sizein --sizeout --fasta_width 0 --uc {} --relabel OTU --centroids {}".format(
            vsearch, src_path, uc_path, centroids_path)

        run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())

        if not self.dry_run:
            return
        for target in self.output():
            run_cmd("touch %s" % target.path, dry_run=False)
Пример #18
0
    def output(self):
        """Return the screened-read targets for this sample.

        Files are named ``<sampleid>_R1.fq.gz`` / ``<sampleid>_R2.fq.gz``
        under ``<odir>/OTU_pipelines/preprocessed/screened``. The R2 target
        is included only when the ``PE2`` path exists on disk.

        Returns:
            list: one or two ``luigi.LocalTarget`` objects.
        """
        screened_dir = join(str(self.odir), "OTU_pipelines", "preprocessed",
                            "screened")
        r1_path = join(str(screened_dir), "{}_R1.fq.gz".format(self.sampleid))
        r2_path = join(str(screened_dir), "{}_R2.fq.gz".format(self.sampleid))
        valid_path(r1_path, check_ofile=1)

        paths = [r1_path, r2_path] if exists(self.PE2) else [r1_path]
        return [luigi.LocalTarget(path) for path in paths]
Пример #19
0
def main(tab, indir, outdir):
    """Filter each sample's annotated CSV and write a per-project summary.

    For every row of the table parsed from ``tab``, the file
    ``<indir>/<sample_id>.merged.anno.hg19_multianno.csv`` is passed to
    ``exec_filter`` and the returned stats are merged into one dict. The
    merged stats are written, transposed, to
    ``<outdir>/<project_name>.summary.csv``, where ``project_name`` is taken
    from the last row iterated (empty string if there are no rows).

    Args:
        tab: Argument forwarded to the project helper ``fileparser``.
        indir: Directory holding the annotated per-sample CSV files.
        outdir: Directory receiving the filtered results and the summary.
    """
    valid_path(outdir, check_odir=1)
    sample_table = fileparser(tab)
    total_stats = {}
    project_name = ''
    for sample_id, row in sample_table.df.iterrows():
        project_name = row["project_name"]
        annotated_csv = join(indir,
                             sample_id + '.merged.anno.hg19_multianno.csv')
        sample_stats = exec_filter(annotated_csv,
                                   sample_id,
                                   output_prefix=join(outdir, sample_id))
        total_stats.update(sample_stats)

    # Build with orient="index" and transpose before writing.
    summary_df = pd.DataFrame.from_dict(total_stats, orient="index").T
    summary_df.index.name = 'variants'
    summary_path = join(outdir, project_name + '.summary.csv')
    summary_df.to_csv(summary_path, sep=',', index=1)
Пример #20
0
    def run(self):
        """Align the input reads with ``bwa mem`` and redirect stdout to the output.

        The read-group string uses the ``SampleID`` entry of ``infodict``
        (empty string if absent) for both ``ID`` and ``SM``. With a single
        input the second read-file slot of the command is left empty. In
        dry-run mode the output file is touched before the command is handed
        to ``run_cmd``.
        """
        valid_path(self.output().path, check_ofile=1)
        sample_name = self.infodict.get("SampleID", '')
        reads1 = self.input()[0].path
        # A single input means there is no mate file to pass to bwa.
        reads2 = '' if len(self.input()) == 1 else self.input()[1].path

        template = "bwa mem -M -t {bwa_thread} -k 19 -R '@RG\\tID:{SN}\\tSM:{SN}\\tPL:illumina\\tLB:lib1\\tPU:L001' {REF} {i1} {i2}  > {ofile}"
        cmdline = template.format(
            bwa_thread=config.bwa_thread,
            SN=sample_name,
            REF=config.REF_file_path,
            i1=reads1,
            i2=reads2,
            ofile=self.output().path,
        )
        if self.dry_run:
            # The placeholder is created ahead of the run_cmd call below.
            run_cmd("touch %s" % self.output().path, dry_run=False)

        run_cmd(cmdline, dry_run=self.dry_run, log_file=self.get_log_path())
Пример #21
0
    def output(self):
        """Return the post-QC read targets for this sample.

        Files are named ``<sampleid>_R1.clean.fq.gz`` /
        ``<sampleid>_R2.clean.fq.gz`` under
        ``<odir>/<prefix>_pipelines/preprocessed/after_QC``. The R2 target is
        included only when the ``PE2`` path exists on disk.

        Returns:
            list: one or two ``luigi.LocalTarget`` objects.
        """
        qc_dir = join(str(self.odir), "%s_pipelines" % self.prefix,
                      "preprocessed", "after_QC")
        r1_path = join(str(qc_dir),
                       "{}_R1.clean.fq.gz".format(str(self.sampleid)))
        r2_path = join(str(qc_dir),
                       "{}_R2.clean.fq.gz".format(str(self.sampleid)))
        valid_path(r1_path, check_ofile=1)

        paths = [r1_path, r2_path] if exists(self.PE2) else [r1_path]
        return [luigi.LocalTarget(path) for path in paths]
Пример #22
0
    def run(self):
        """Run ``qiime dada2 denoise-paired`` with extra options from ``config.dada2_args``.

        Output 0 receives the table, output 1 the representative sequences
        and output 2 the denoising stats. Each ``config.dada2_args`` entry
        becomes a ``--p-<name>`` option (underscores turned into dashes):
        ``True`` yields a bare flag, ``None``/``False`` are skipped, and any
        other value is appended after the flag. In dry-run mode every output
        is touched afterwards so that it exists.
        """
        valid_path(self.output()[0].path, check_ofile=1)

        extra_opts = []
        for name, val in config.dada2_args.items():
            flag = name.replace('_', '-')
            if val is True:
                extra_opts.append(' --p-%s' % flag)
            elif not (val is None or val is False):
                extra_opts.append(' --p-%s %s ' % (flag, val))

        template = """{qiime2_p} dada2 denoise-paired --i-demultiplexed-seqs {input_file} --o-representative-sequences {rep_seq} --o-table {profiling_tab} --o-denoising-stats {stats_file}"""
        cmd = template.format(
            qiime2_p=config.qiime2_p,
            input_file=self.input().path,
            profiling_tab=self.output()[0].path,
            rep_seq=self.output()[1].path,
            stats_file=self.output()[2].path,
        )
        cmd = cmd + ''.join(extra_opts)

        run_cmd(cmd, dry_run=self.dry_run, log_file=self.get_log_path())
        if not self.dry_run:
            return
        for target in self.output():
            run_cmd("touch %s" % target.path, dry_run=False)
Пример #23
0
    def run(self):
        """Annotate each input file with ANNOVAR's ``table_annovar.pl``.

        The task input may be a mapping, a list/tuple or a single target; it
        is normalised to a flat list and paired positionally with the
        declared outputs. For every pair the command writes to the prefix
        ``<input path with '.av' removed>.anno``. In dry-run mode the paired
        output is touched after each command so that it exists.
        """
        # Resolve the requirement targets once. isinstance() also accepts
        # dict subclasses and tuples, which would otherwise be wrapped as a
        # single pseudo-target and fail on ``.path``.
        task_input = self.input()
        if isinstance(task_input, dict):
            input_list = list(task_input.values())
        elif isinstance(task_input, (list, tuple)):
            input_list = list(task_input)
        else:
            input_list = [task_input]

        for _output, _input in zip(self.output(), input_list):
            valid_path(_output.path, check_ofile=1)
            prefix = _input.path.replace('.av', '')
            cmdline = "{annovar_dir}/table_annovar.pl {input_f} {annovar_db} -buildver {genome_version} -protocol {db_names} -operation g,r,r,f,f,f,f,f,f -nastring . --remove --otherinfo --csvout --thread {annovar_thread} --outfile {output_f} --argument '-exonicsplicing -splicing 25',,,,,,,,".format(
                annovar_dir=config.annovar_pro,
                input_f=_input.path,
                annovar_db=config.annovar_db,
                genome_version=config.genome_version,
                db_names=config.db_names,
                annovar_thread=config.annovar_thread,
                output_f=prefix + '.anno')
            run_cmd(cmdline,
                    dry_run=self.dry_run,
                    log_file=self.get_log_path())
            if self.dry_run:
                run_cmd("touch %s" % _output.path, dry_run=False)
Пример #24
0
 def output(self):
     """Return the target ``<odir>/OTU_pipelines/dechimera/all.nonchimeras.fasta``."""
     target_path = join(
         join(str(self.odir), "OTU_pipelines", 'dechimera'),
         'all.nonchimeras.fasta',
     )
     valid_path(target_path, check_ofile=1)
     return luigi.LocalTarget(target_path)
Пример #25
0
 def output(self):
     """Return the target ``<odir>/OTU_pipelines/preprocessed/merged.fastq``."""
     target_path = join(
         join(str(self.odir), "OTU_pipelines", 'preprocessed'),
         'merged.fastq',
     )
     valid_path(target_path, check_ofile=1)
     return luigi.LocalTarget(target_path)
Пример #26
0
 def output(self):
     """Return the target ``<odir>/q2_pipelines/joined_seq.qza``."""
     target_path = join(
         join(str(self.odir), "q2_pipelines"),
         'joined_seq.qza',
     )
     valid_path(target_path, check_ofile=1)
     return luigi.LocalTarget(target_path)
Пример #27
0
 def output(self):
     """Return the target ``<odir>/OTU_pipelines/derep/sorted_d2_fa.fa``."""
     target_path = join(
         join(str(self.odir), "OTU_pipelines", 'derep'),
         'sorted_d2_fa.fa',
     )
     valid_path(target_path, check_ofile=1)
     return luigi.LocalTarget(target_path)