def run_gc_depth(genome, fastq_list, name, window, thread, job_type,
                 concurrent, refresh, work_dir, out_dir):
    genome, fastq_list = check_paths([genome, fastq_list])

    sort_bam, genome = bwa_mem(
        fastq_list=fastq_list,
        genome=genome,
        name=name,
        number=5000000,
        data_type='',
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=work_dir)
    sort_bam = check_paths(sort_bam)

    dag = DAG("gc_depth")
    gc_depth_task, gc_depth_png = stat_gc_depth_task(
        genome=genome,
        bam=sort_bam,
        name=name,
        window=window,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)
    dag.add_task(gc_depth_task)
    do_dag(dag, concurrent, refresh)

    return gc_depth_png
def stat_gc_depth_task(genome, bam, name, window, job_type, work_dir, out_dir):
    bam = check_paths(bam)

    # The window size is supplied by the caller and forwarded to
    # stat_gc_depth.py via -w (it was formatted into the script but unused).
    task = Task(
        id="stat_coverage",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 1",
        script="""
export PATH={samtools}:{python}:$PATH
samtools depth -aa {bam} > {work_dir}/{name}.depth
python {script}/stat_coverage.py -i {work_dir}/{name}.depth -d 1,5,10,20 -o {out_dir}/{name}.coverage.xlsx
python {script}/stat_length_gc.py -d {work_dir}/{name}.depth -g {genome} -n {out_dir}/{name}
python {script}/stat_gc_depth.py -d {work_dir}/{name}.depth -g {genome} -b 1000 -w {window} -e 100 -n {work_dir}/{name}
python {script}/draw_depth_gc.py -gcd {work_dir}/{name}.stat_gc_depth.tsv -n {out_dir}/{name}
#python {script}/plot_gc_depth.py -gcd {work_dir}/{name}.stat_gc_depth.tsv -n {out_dir}/{name}
""".format(samtools=SAMTOOLS_BIN,
           script=SCRIPTS,
           python=PYTHON_BIN,
           genome=genome,
           bam=bam,
           name=name,
           window=window,
           work_dir=work_dir,
           out_dir=out_dir))

    return task, os.path.join(out_dir, "%s.gc_depth.png" % name)
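# Hedged usage sketch for the bwa_mem-based GC-depth pipeline above; every
# literal below (paths, window size, job type) is an illustrative assumption,
# not a default of this module. Note that a second run_gc_depth variant
# (r1/r2 + minimap) appears later in this file with a different signature.
def _example_gc_depth():  # hypothetical helper, not called by the pipeline
    gc_depth_png = run_gc_depth(
        genome="assembly.fasta",        # assumed assembly FASTA path
        fastq_list="sample1.ngs.list",  # assumed list file of NGS FASTQ paths
        name="sample1",
        window=5000,                    # window size forwarded to stat_gc_depth.py
        thread=4,
        job_type="local",               # assumed; the "-pe smp" Task option suggests SGE also works
        concurrent=10,
        refresh=30,
        work_dir="work/05_GC-depth",
        out_dir="out/05_GC-depth")
    return gc_depth_png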
def minimap(r1, r2, genome, name, split, platform, number, thread, job_type,
            concurrent, refresh, work_dir, out_dir):
    genome = check_path(genome)
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    if r2 != "":
        r2 = check_paths(r2)

    options = {
        "software": OrderedDict(),
        "database": OrderedDict()
    }

    data_work = mkdir(os.path.join(work_dir, "00_data"))
    reads = split_data(
        r1=r1,
        r2=r2,
        name=name,
        number=number,
        job_type=job_type,
        work_dir=data_work,
        out_dir=out_dir)

    bam, option = run_minimap(
        reads=reads,
        genome=genome,
        platform=platform,
        name=name,
        split=split,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=out_dir)
    options["software"] = option

    with open(os.path.join(out_dir, "minimap2.json"), "w") as fh:
        json.dump(options, fh, indent=2)

    return bam
def run_gc_depth(genome, r1, r2, name, platform, split, window, thread,
                 job_type, concurrent, refresh, work_dir, out_dir):
    genome = check_path(genome)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    sort_bam = minimap(
        r1=r1,
        r2=r2,
        genome=genome,
        name=name,
        split=split,
        platform=platform,
        number=5000000,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=out_dir)
    sort_bam = check_paths(sort_bam)

    dag = DAG("gc_depth")
    gc_depth_task, gc_depth_png = stat_gc_depth_task(
        genome=genome,
        bam=sort_bam,
        name=name,
        window=window,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)
    dag.add_task(gc_depth_task)
    do_dag(dag, concurrent, refresh)

    return gc_depth_png
def bwa_mem(fastq_list, genome, name, number, data_type, thread, job_type,
            concurrent, refresh, work_dir, out_dir):
    genome, fastq_list = check_paths([genome, fastq_list])
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)

    dag = DAG("split_ngs")
    split_work = mkdir(os.path.join(work_dir, "00_data"))
    split_out = mkdir(os.path.join(out_dir, "00_data"))
    splitfp_task, fq_path, r1_name, r2_name = split_ngs_task(
        fastq_list=fastq_list,
        name=name,
        number=number,
        data_type=data_type,
        job_type=job_type,
        work_dir=split_work,
        out_dir=split_out)
    dag.add_task(splitfp_task)
    do_dag(dag, concurrent, refresh)

    dag = DAG("bwa_mem")
    index_task, bwa_tasks, merge_task, sorted_bam, genome = run_bwa_mem(
        fq_path=fq_path,
        r1_name=r1_name,
        r2_name=r2_name,
        genome=genome,
        name=name,
        thread=thread,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)
    dag.add_task(index_task)
    dag.add_task(*bwa_tasks)
    dag.add_task(merge_task)
    index_task.set_downstream(*bwa_tasks)
    merge_task.set_upstream(*bwa_tasks)
    do_dag(dag, concurrent, refresh)

    return sorted_bam, genome
def run_survey(r1, r2, name, trim, kingdom, kmer_length, sample_depth, thread,
               asm, window, job_type, queue, concurrent, refresh,
               work_dir, out_dir):
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    dag = DAG("survey_qc")
    (merge_task, qc_task, cont_task, result_task, clean1, clean2, quality,
     content, gc, stat_qc, poll_png, poll_tsv) = ngs_qc_tasks(
        name=name, r1=r1, r2=r2, trim=trim, thread=thread,
        job_type=job_type, work_dir=work_dir, out_dir=out_dir)

    data_work = mkdir(os.path.join(work_dir, "choose_data"))
    freq_task1, histo1, kmer_stat, estimate1 = kmerfreq_task(
        r1=clean1, r2=clean2, name=name, kmer_length=kmer_length,
        thread=thread, job_type=job_type,
        work_dir=data_work, out_dir=data_work)

    dag.add_task(merge_task)
    dag.add_task(qc_task)
    qc_task.set_upstream(merge_task)
    dag.add_task(cont_task)
    dag.add_task(result_task)
    dag.add_task(freq_task1)
    freq_task1.set_upstream(qc_task)
    cont_task.set_upstream(qc_task)
    result_task.set_upstream(qc_task)
    do_dag(dag, concurrent, refresh)

    for line in read_tsv(kmer_stat):
        if line[0] == "kmer_depth":
            kmer_depth = int(line[1])
    if sample_depth > kmer_depth:
        LOG.debug(
            "The amount of sequencing data may be insufficient. "
            "Sequencing depth is only %s X" % kmer_depth)
        sample_depth = kmer_depth
    proportion = sample_depth * 1.0 / kmer_depth

    dag = DAG("survey")
    (choose_task, freq_task, heter_task, jellyfish_task, gse_scope_task,
     denovo_task, stat_heter, heter_png, scope_txt, gse_txt, scope_png,
     gse_png, stat_genome, genome, ngs_list) = kmer_denovo_tasks(
        r1=clean1, r2=clean2, name=name, kmer_length=kmer_length,
        proportion=proportion, kingdom=kingdom, thread=thread,
        job_type=job_type, queue=queue, work_dir=work_dir, out_dir=out_dir)
    if asm == "true":
        dag.add_task(denovo_task)
    else:
        genome = "false"
        stat_genome = "false"
        ngs_list = "false"
    dag.add_task(choose_task)
    dag.add_task(freq_task)
    dag.add_task(heter_task)
    freq_task.set_upstream(choose_task)
    dag.add_task(jellyfish_task)
    jellyfish_task.set_upstream(choose_task)
    dag.add_task(gse_scope_task)
    heter_task.set_upstream(freq_task)
    gse_scope_task.set_upstream(jellyfish_task)
    do_dag(dag, concurrent, refresh)

    if ngs_list == "false":
        print("Genome was not assembled")
        gc_depth_png = heter_png
    else:
        depth_work = mkdir(os.path.join(work_dir, "05_GC-depth"))
        depth_out = mkdir(os.path.join(out_dir, "05_GC-depth"))
        gc_depth_png = run_gc_depth(
            genome=genome, fastq_list=ngs_list, name=name, window=window,
            thread=thread, job_type=job_type, concurrent=concurrent,
            refresh=refresh, work_dir=depth_work, out_dir=depth_out)

    run_report(name, asm, kmer_length, stat_qc, quality, content, gc,
               poll_tsv, poll_png, stat_heter, heter_png, scope_txt, gse_txt,
               scope_png, gse_png, stat_genome, gc_depth_png, out_dir)

    return (stat_qc, quality, content, gc, poll_png, poll_tsv, stat_heter,
            heter_png, scope_txt, gse_txt, scope_png, gse_png, stat_genome)
def run_survey(r1, r2, name, trim, kingdom, mode, cratio, kmer_length,
               kmer_depth, thread, asm, window, job_type, queue, concurrent,
               refresh, work_dir, out_dir, split=""):
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    (clean1, clean2, taxid, stat_qc, quality, content, gc, cont_tsv,
     cont_png) = run_ngs_qc(
        r1=r1, r2=r2, name=name, trim=trim, kingdom=kingdom, thread=thread,
        job_type=job_type, concurrent=concurrent, refresh=refresh,
        work_dir=os.path.join(work_dir, "01_data"),
        out_dir=os.path.join(out_dir, "01_data"))

    (stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png,
     stat_genome, gc_depth) = run_kmer_denovo(
        r1=[clean1], r2=[clean2], taxid=taxid, name=name, mode=mode,
        cratio=cratio, kmer_length=kmer_length, kmer_depth=kmer_depth,
        kingdom=kingdom, asm=asm, window=window, thread=thread,
        job_type=job_type, queue=queue, concurrent=concurrent,
        refresh=refresh, work_dir=work_dir, out_dir=out_dir,
        split=split, platform="illumina")

    run_report(name, asm, kmer_length, stat_qc, quality, content, gc,
               cont_tsv, cont_png, stat_heter, heter_png, scope_txt, gse_txt,
               scope_png, gse_png, stat_genome, gc_depth, out_dir)
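# Hedged usage sketch for the streamlined run_survey entry point above. All
# literal values (paths, kingdom, mode, k-mer settings, thread counts) are
# illustrative assumptions, not defaults taken from this module.
def _example_run_survey():  # hypothetical demo, not part of the pipeline
    run_survey(
        r1=["sample1_R1.fastq.gz"],  # assumed paired-end read files
        r2=["sample1_R2.fastq.gz"],
        name="sample1",
        trim=5,                      # assumed trim setting for QC
        kingdom="animal",
        mode="fast",                 # "fast" skips the reference-based decontamination branch
        cratio=10,
        kmer_length=17,
        kmer_depth=40,
        thread=8,
        asm="true",                  # the "true"/"false" strings gate SOAPdenovo assembly
        window=5000,
        job_type="local",
        queue="",
        concurrent=10,
        refresh=30,
        work_dir="work",
        out_dir="out")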
def run_filter_contamination(r1, r2, name, kmer_length, kmer_depth, taxid,
                             kingdom, thread, job_type, concurrent, refresh,
                             work_dir, out_dir, split, mode="fast", cratio=10):
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)
    taxid = check_path(taxid)

    options = {
        "software": OrderedDict(),
        "database": OrderedDict()
    }

    option, r1, r2 = choose_data(
        r1=r1, r2=r2, name=name, kmer_length=kmer_length,
        kmer_depth=kmer_depth, thread=thread, job_type=job_type,
        concurrent=concurrent, refresh=refresh,
        work_dir=work_dir, out_dir=out_dir)
    options["software"].update(option)

    if mode != "fast":
        work_dict = {
            "data": "00_data",
            "ref": "01_ref",
            "ump": "02_ump"
        }
        for k, v in work_dict.items():
            mkdir(os.path.join(work_dir, v))

        reads = split_data(
            r1=r1, r2=r2, name=name, number=2000000, job_type=job_type,
            work_dir=os.path.join(work_dir, work_dict["data"]),
            concurrent=concurrent, refresh=refresh, out_dir=out_dir,
            platform="illumina")

        dag = DAG("unmap_data")
        ref_task, ref = obtain_contamination_task(
            taxid=taxid, name=name, kingdom=kingdom, job_type=job_type,
            work_dir=os.path.join(work_dir, work_dict["ref"]),
            out_dir=out_dir, mode=mode, cratio=cratio)
        dag.add_task(ref_task)

        unmap_tasks, reads, option = create_unmap_tasks(
            name=name, reference=ref, reads=reads, thread=thread,
            job_type=job_type,
            work_dir=os.path.join(work_dir, work_dict["ump"]),
            out_dir=out_dir, split=split)
        dag.add_task(*unmap_tasks)
        ref_task.set_downstream(*unmap_tasks)
        do_dag(dag, concurrent, refresh)
        options["software"].update(option)
        reads = [reads]
    else:
        reads = [r1, r2]

    return reads, options
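# Sketch of how the two decontamination modes above behave, as read from the
# code (not documented behavior): in "fast" mode the subsampled reads are
# returned untouched; in any other mode the reads are split, a contaminant
# reference is fetched for the given taxid, and only unmapped reads are kept.
# All literals below are hypothetical.
def _example_filter_contamination():  # hypothetical demo
    reads, options = run_filter_contamination(
        r1=["sample1_R1.fastq.gz"],  # assumed inputs
        r2=["sample1_R2.fastq.gz"],
        name="sample1", kmer_length=17, kmer_depth=40,
        taxid="sample1.taxid",       # assumed taxid file from the QC step
        kingdom="animal", thread=8, job_type="local",
        concurrent=10, refresh=30,
        work_dir="work/01_contamination", out_dir="out",
        split="no_split",
        mode="general",              # any value other than "fast" triggers filtering
        cratio=10)
    return reads, options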
def run_kmer_denovo(r1, r2, name, kingdom, kmer_length, sample_depth, thread,
                    asm, window, job_type, queue, concurrent, refresh,
                    work_dir, out_dir):
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)
    if r1[0].endswith(".gz") or r2[0].endswith(".gz"):
        tools = "zcat"
    else:
        tools = "cat"

    dag_data = DAG("survey_data")
    data_work = mkdir(os.path.join(work_dir, "choose_data"))
    cat_data_task, clean1, clean2 = merge_raw_data_task(
        name=name, r1=" ".join(r1), r2=" ".join(r2), tools=tools,
        job_type=job_type, work_dir=data_work, out_dir=data_work)
    freq_task1, histo1, kmer_stat, estimate1 = kmerfreq_task(
        r1=clean1, r2=clean2, name=name, kmer_length=17, thread=thread,
        job_type=job_type, work_dir=data_work, out_dir=data_work)
    dag_data.add_task(cat_data_task)
    dag_data.add_task(freq_task1)
    freq_task1.set_upstream(cat_data_task)
    do_dag(dag_data, concurrent, refresh)

    for line in read_tsv(kmer_stat):
        if line[0] == "kmer_depth":
            kmer_depth = int(line[1])
    if sample_depth > kmer_depth:
        LOG.debug(
            "The amount of sequencing data may be insufficient. "
            "Sequencing depth is only %s X" % kmer_depth)
        sample_depth = kmer_depth
    proportion = sample_depth * 1.0 / kmer_depth

    dag = DAG("survey")
    (choose_task, freq_task, heter_task, jellyfish_task, gse_scope_task,
     denovo_task, stat_heter, heter_png, scope_txt, gse_txt, scope_png,
     gse_png, stat_genome, genome, ngs_list) = kmer_denovo_tasks(
        r1=clean1, r2=clean2, name=name, kmer_length=kmer_length,
        proportion=proportion, kingdom=kingdom, thread=thread,
        job_type=job_type, queue=queue, work_dir=work_dir, out_dir=out_dir)
    if asm == "true":
        dag.add_task(denovo_task)
    else:
        genome = "false"
        stat_genome = "false"
        ngs_list = "false"
    dag.add_task(choose_task)
    dag.add_task(freq_task)
    dag.add_task(heter_task)
    freq_task.set_upstream(choose_task)
    dag.add_task(jellyfish_task)
    jellyfish_task.set_upstream(choose_task)
    dag.add_task(gse_scope_task)
    heter_task.set_upstream(freq_task)
    gse_scope_task.set_upstream(jellyfish_task)
    do_dag(dag, concurrent, refresh)

    if ngs_list == "false":
        print("Genome was not assembled")
        gc_depth_png = heter_png
    else:
        depth_work = mkdir(os.path.join(work_dir, "05_GC-depth"))
        depth_out = mkdir(os.path.join(out_dir, "05_GC-depth"))
        gc_depth_png = run_gc_depth(
            genome=genome, fastq_list=ngs_list, name=name, window=window,
            thread=thread, job_type=job_type, concurrent=concurrent,
            refresh=refresh, work_dir=depth_work, out_dir=depth_out)

    return (stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png,
            stat_genome)
def kmer_denovo_tasks(r1, r2, name, kmer_length, proportion, kingdom, thread,
                      job_type, queue, work_dir, out_dir):
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    clean1 = check_paths(r1)
    clean2 = check_paths(r2)

    work_dict = {
        "choose": "choose_data",
        "gse_scope": "02_gse_scope",
        "kmerfreq": "03_Kmerfreq",
        "denovo": "04_Soapdenovo",
        "gc_depth": "05_GC-depth"
    }

    choose_work = mkdir(os.path.join(work_dir, work_dict["choose"]))
    choose_task, choose_r1, choose_r2, choose_r = sample_fastq_task(
        r1=clean1, r2=clean2, proportion=proportion, name=name,
        job_type=job_type, work_dir=choose_work)

    heter_work = mkdir(os.path.join(work_dir, work_dict["kmerfreq"]))
    heter_out = mkdir(os.path.join(out_dir, work_dict["kmerfreq"]))
    freq_task, histo, kmer_depth, estimate = kmerfreq_task(
        r1=choose_r1, r2=choose_r2, name=name, kmer_length=17,
        thread=thread, job_type=job_type,
        work_dir=heter_work, out_dir=heter_out)
    heter_task, stat_heter, heter_png = get_heterozygosity_task(
        histo=histo, estimate=estimate, kingdom=kingdom, name=name,
        job_type=job_type, work_dir=heter_work, out_dir=heter_out)

    scope_work = mkdir(os.path.join(work_dir, work_dict["gse_scope"]))
    scope_out = mkdir(os.path.join(out_dir, work_dict["gse_scope"]))
    jellyfish_task, histogram = get_jellyfish_task(
        fastq=choose_r, name=name, depth=40 * 100, thread=thread,
        job_type=job_type, work_dir=scope_work, out_dir=scope_out)
    gse_scope_task, scope_txt, gse_txt, scope_png, gse_png = get_gse_scope_task(
        histogram=histogram, name=name, kmer_length=kmer_length,
        job_type=job_type, work_dir=scope_work, out_dir=scope_out)

    denovo_work = mkdir(os.path.join(work_dir, work_dict["denovo"]))
    denovo_out = mkdir(os.path.join(out_dir, work_dict["denovo"]))
    denovo_task, genome, stat_genome, ngs_list = soapdenovo_task(
        r1=clean1, r2=clean2, name=name, thread=thread * 2, queue=queue,
        job_type=job_type, work_dir=denovo_work, out_dir=denovo_out)

    return (choose_task, freq_task, heter_task, jellyfish_task,
            gse_scope_task, denovo_task, stat_heter, heter_png, scope_txt,
            gse_txt, scope_png, gse_png, stat_genome, genome, ngs_list)
def run_kmer_denovo(r1, r2, taxid, name, mode, cratio, kmer_length,
                    kmer_depth, kingdom, asm, window, thread, job_type, queue,
                    concurrent, refresh, work_dir, out_dir, split,
                    platform="illumina"):
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    work_dict = {
        "contamination": "01_contamination",
        "gse_scope": "02_gse_scope",
        "kmerfreq": "03_Kmerfreq",
        "denovo": "04_Soapdenovo",
        "gc_depth": "05_GC-depth"
    }
    for k, v in work_dict.items():
        mkdir(os.path.join(work_dir, v))
        if k == "contamination":
            continue
        mkdir(os.path.join(out_dir, v))

    reads, options = run_filter_contamination(
        r1=r1, r2=r2, name=name, kmer_length=kmer_length,
        kmer_depth=kmer_depth, taxid=taxid, kingdom=kingdom, thread=thread,
        job_type=job_type, concurrent=concurrent, refresh=refresh,
        work_dir=os.path.join(work_dir, work_dict["contamination"]),
        out_dir=out_dir, mode=mode, cratio=cratio, split=split)

    dag = DAG("kmer_denovo")
    (jellyfish_task, gse_scope_task, scope_txt, gse_txt, scope_png, gse_png,
     option) = gse_scope(
        reads=" ".join(reads), name=name, kmer_length=kmer_length,
        thread=thread, job_type=job_type,
        work_dir=os.path.join(work_dir, work_dict["gse_scope"]),
        out_dir=os.path.join(out_dir, work_dict["gse_scope"]),
        mode=mode)
    options["software"].update(option)
    dag.add_task(jellyfish_task)
    dag.add_task(gse_scope_task)

    kmerfreq_task, heter_task, stat_heter, heter_png, option = kmerfreq(
        reads=" ".join(reads), name=name, kingdom=kingdom,
        kmer_length=kmer_length, thread=thread, job_type=job_type,
        work_dir=os.path.join(work_dir, work_dict["kmerfreq"]),
        out_dir=os.path.join(out_dir, work_dict["kmerfreq"]))
    options["software"].update(option)
    dag.add_task(kmerfreq_task)
    dag.add_task(heter_task)

    denovo_task, genome, stat_genome, option = create_soapdenovo_task(
        r1=" ".join(r1), r2=" ".join(r2), name=name, thread=thread,
        queue=queue, job_type=job_type,
        work_dir=os.path.join(work_dir, work_dict["denovo"]),
        out_dir=os.path.join(out_dir, work_dict["denovo"]))
    if asm == "true":
        dag.add_task(denovo_task)
    else:
        genome = "false"
        stat_genome = "false"
    do_dag(dag, concurrent, refresh)

    if asm == "true":
        gc_depth = run_gc_depth(
            genome=genome, r1=" ".join(r1), r2=" ".join(r2), name=name,
            platform=platform, split="no_split", window=window,
            thread=thread, job_type=job_type, concurrent=concurrent,
            refresh=refresh,
            work_dir=os.path.join(work_dir, work_dict["gc_depth"]),
            out_dir=os.path.join(out_dir, work_dict["gc_depth"]))
    else:
        gc_depth = heter_png

    with open(os.path.join(out_dir, "kmer_denovo.json"), "w") as fh:
        json.dump(options, fh, indent=2)

    return (stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png,
            stat_genome, gc_depth)
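# Minimal sketch of calling run_kmer_denovo directly with pre-cleaned reads,
# bypassing run_survey; every literal below is an illustrative assumption.
def _example_kmer_denovo():  # hypothetical demo
    results = run_kmer_denovo(
        r1=["sample1.clean.r1.fastq"],  # assumed QC-cleaned reads
        r2=["sample1.clean.r2.fastq"],
        taxid="sample1.taxid",          # assumed taxid file from QC
        name="sample1", mode="fast", cratio=10,
        kmer_length=17, kmer_depth=40, kingdom="animal",
        asm="false",                    # "false": skip assembly; gc_depth falls back to heter_png
        window=5000, thread=8, job_type="local", queue="",
        concurrent=10, refresh=30, work_dir="work", out_dir="out",
        split="no_split")
    return results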