Пример #1
0
def run_gc_depth(genome, fastq_list, name, window, thread, job_type,
                 concurrent, refresh, work_dir, out_dir):
    """Map reads with ``bwa_mem`` and run the GC-depth statistics task.

    ``genome`` and ``fastq_list`` are validated with ``check_paths``, the
    reads are aligned through ``bwa_mem`` (which receives ``work_dir`` as
    both its working and its output directory), and a one-task
    ``gc_depth`` DAG created by ``stat_gc_depth_task`` is executed with
    ``do_dag``.

    Returns:
        The PNG path reported by ``stat_gc_depth_task``.
    """
    genome, fastq_list = check_paths([genome, fastq_list])

    # The alignment writes everything below work_dir; only the statistics
    # task further down is pointed at out_dir.
    align_kwargs = dict(
        fastq_list=fastq_list,
        genome=genome,
        name=name,
        number=5000000,
        data_type='',
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=work_dir,
    )
    sort_bam, genome = bwa_mem(**align_kwargs)
    sort_bam = check_paths(sort_bam)

    pipeline = DAG("gc_depth")
    stat_task, depth_png = stat_gc_depth_task(
        genome=genome,
        bam=sort_bam,
        name=name,
        window=window,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir,
    )
    pipeline.add_task(stat_task)
    do_dag(pipeline, concurrent, refresh)

    return depth_png
Пример #2
0
def stat_gc_depth_task(genome, bam, name, window, job_type, work_dir, out_dir):
    """Build the ``stat_coverage`` task that computes depth/GC statistics.

    The generated shell script runs ``samtools depth`` on ``bam`` and then
    the ``stat_coverage.py``, ``stat_length_gc.py``, ``stat_gc_depth.py``
    and ``draw_depth_gc.py`` helpers found under ``SCRIPTS``.

    Returns:
        A ``(task, png_path)`` tuple where ``png_path`` is
        ``<out_dir>/<name>.gc_depth.png``.
    """
    bam = check_paths(bam)

    # NOTE(review): ``window`` is handed to ``format`` but the template has
    # no ``{window}`` placeholder; stat_gc_depth.py always gets ``-w 5000``.
    # Confirm whether that is intentional.
    template = """
export PATH={samtools}:{python}:$PATH
samtools depth -aa {bam} > {work_dir}/{name}.depth
python {script}/stat_coverage.py -i {work_dir}/{name}.depth -d 1,5,10,20 -o {out_dir}/{name}.coverage.xlsx
python {script}/stat_length_gc.py -d {work_dir}/{name}.depth -g {genome} -n {out_dir}/{name}
python {script}/stat_gc_depth.py -d {work_dir}/{name}.depth -g {genome} -b 1000 -w 5000 -e 100 -n {work_dir}/{name}
python {script}/draw_depth_gc.py -gcd {work_dir}/{name}.stat_gc_depth.tsv -n {out_dir}/{name}
#python {script}/plot_gc_depth.py -gcd {work_dir}/{name}.stat_gc_depth.tsv -n {out_dir}/{name}
"""
    shell_script = template.format(
        samtools=SAMTOOLS_BIN,
        script=SCRIPTS,
        python=PYTHON_BIN,
        genome=genome,
        bam=bam,
        name=name,
        window=window,
        work_dir=work_dir,
        out_dir=out_dir,
    )

    task = Task(
        id="stat_coverage",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 1",
        script=shell_script,
    )
    png_path = os.path.join(out_dir, "%s.gc_depth.png" % name)

    return task, png_path
Пример #3
0
def minimap(r1, r2, genome, name, split, platform, number, thread, job_type,
            concurrent, refresh, work_dir, out_dir):
    """Split the input reads and align them to ``genome`` via ``run_minimap``.

    The option mapping returned by ``run_minimap`` is written to
    ``<out_dir>/minimap2.json`` under the ``"software"`` key, next to an
    empty ``"database"`` entry.

    Returns:
        The ``bam`` value returned by ``run_minimap``.
    """
    genome = check_path(genome)
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)

    # An empty string means "no second read file"; it is passed on as-is.
    if r2 != "":
        r2 = check_paths(r2)

    split_dir = mkdir(os.path.join(work_dir, '00_data'))
    reads = split_data(
        r1=r1,
        r2=r2,
        name=name,
        number=number,
        job_type=job_type,
        work_dir=split_dir,
        out_dir=out_dir,
    )

    bam, software_options = run_minimap(
        reads=reads,
        genome=genome,
        platform=platform,
        name=name,
        split=split,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=out_dir,
    )

    options = {"software": software_options, "database": OrderedDict()}
    json_path = os.path.join(out_dir, "minimap2.json")
    with open(json_path, "w") as handle:
        json.dump(options, handle, indent=2)

    return bam
Пример #4
0
def run_gc_depth(genome, r1, r2, name, platform, split, window, thread,
                 job_type, concurrent, refresh, work_dir, out_dir):
    """Align reads with ``minimap`` and run the GC-depth statistics task.

    After validating the inputs, the reads are aligned to ``genome`` and a
    one-task ``gc_depth`` DAG built by ``stat_gc_depth_task`` is executed
    with ``do_dag``.

    Returns:
        The PNG path reported by ``stat_gc_depth_task``.
    """
    genome = check_path(genome)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    align_kwargs = dict(
        r1=r1,
        r2=r2,
        genome=genome,
        name=name,
        split=split,
        platform=platform,
        number=5000000,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=out_dir,
    )
    sort_bam = check_paths(minimap(**align_kwargs))

    pipeline = DAG("gc_depth")
    stat_task, depth_png = stat_gc_depth_task(
        genome=genome,
        bam=sort_bam,
        name=name,
        window=window,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir,
    )
    pipeline.add_task(stat_task)
    do_dag(pipeline, concurrent, refresh)

    return depth_png
Пример #5
0
def bwa_mem(fastq_list, genome, name, number, data_type, thread, job_type,
            concurrent, refresh, work_dir, out_dir):
    """Split the reads, then index, align and merge them in two DAG runs.

    The first DAG (``split_ngs``) runs the task from ``split_ngs_task``.
    The second DAG (``bwa_mem``) runs the tasks from ``run_bwa_mem`` in the
    order: index task, then the alignment tasks, then the merge task.

    Returns:
        A ``(sorted_bam, genome)`` tuple as produced by ``run_bwa_mem``.
    """
    genome, fastq_list = check_paths([genome, fastq_list])
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)

    # Stage 1: split the input reads.
    split_dag = DAG("split_ngs")
    split_work = mkdir(os.path.join(work_dir, "00_data"))
    split_out = mkdir(os.path.join(out_dir, "00_data"))
    split_task, fq_path, r1_name, r2_name = split_ngs_task(
        fastq_list=fastq_list,
        name=name,
        number=number,
        data_type=data_type,
        job_type=job_type,
        work_dir=split_work,
        out_dir=split_out,
    )
    split_dag.add_task(split_task)
    do_dag(split_dag, concurrent, refresh)

    # Stage 2: index -> per-chunk alignments -> merge.
    align_dag = DAG("bwa_mem")
    index_task, align_tasks, merge_task, sorted_bam, genome = run_bwa_mem(
        fq_path=fq_path,
        r1_name=r1_name,
        r2_name=r2_name,
        genome=genome,
        name=name,
        thread=thread,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir,
    )
    align_dag.add_task(index_task)
    align_dag.add_task(*align_tasks)
    align_dag.add_task(merge_task)
    index_task.set_downstream(*align_tasks)
    merge_task.set_upstream(*align_tasks)
    do_dag(align_dag, concurrent, refresh)

    return sorted_bam, genome
Пример #6
0
def run_survey(r1, r2, name, trim, kingdom, kmer_length, sample_depth, thread,
               asm, window, job_type, queue, concurrent, refresh, work_dir,
               out_dir):
    """Run read QC, k-mer analysis, optional assembly and the final report.

    Steps, as executed below:

    1. Build and run the ``survey_qc`` DAG (QC tasks plus a first k-mer
       frequency task on the cleaned reads).
    2. Read ``kmer_depth`` from the k-mer statistics file and derive the
       sampling ``proportion`` (``sample_depth / kmer_depth``, capped at 1).
    3. Build and run the ``survey`` DAG; the assembly task is only added
       when ``asm == "true"``.
    4. Run ``run_gc_depth`` when an assembly was requested, otherwise reuse
       the heterozygosity PNG in its place.
    5. Call ``run_report`` with all collected result paths.

    Returns:
        A 13-tuple: ``stat_qc, quality, content, gc, poll_png, poll_tsv,
        stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png,
        stat_genome``.

    Raises:
        ValueError: if the k-mer statistics file contains no ``kmer_depth``
            row or the reported depth is zero.
    """
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    dag = DAG("survey_qc")
    merge_task, qc_task, cont_task, result_task, clean1, clean2, quality, content, gc, stat_qc, poll_png, poll_tsv = ngs_qc_tasks(
        name=name,
        r1=r1,
        r2=r2,
        trim=trim,
        thread=thread,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)

    data_work = mkdir(os.path.join(work_dir, "choose_data"))
    freq_task1, histo1, kmer_stat, estimate1 = kmerfreq_task(
        r1=clean1,
        r2=clean2,
        name=name,
        kmer_length=kmer_length,
        thread=thread,
        job_type=job_type,
        work_dir=data_work,
        out_dir=data_work)

    dag.add_task(merge_task)
    dag.add_task(qc_task)
    qc_task.set_upstream(merge_task)
    dag.add_task(cont_task)
    dag.add_task(result_task)
    dag.add_task(freq_task1)
    freq_task1.set_upstream(qc_task)
    cont_task.set_upstream(qc_task)
    result_task.set_upstream(qc_task)

    do_dag(dag, concurrent, refresh)

    # Start from None so a missing row is detected explicitly instead of
    # surfacing later as an UnboundLocalError.
    kmer_depth = None
    for line in read_tsv(kmer_stat):
        if line and line[0] == "kmer_depth":
            kmer_depth = int(line[1])

    # A missing or zero depth would otherwise crash in the division below.
    if not kmer_depth:
        raise ValueError(
            "kmer_depth is missing or zero in %s" % kmer_stat)

    if sample_depth > kmer_depth:
        LOG.debug(
            'The amount of sequencing data may be insufficient. Sequencing depth is only %s X'
            % kmer_depth)
        sample_depth = kmer_depth
    proportion = sample_depth * 1.0 / kmer_depth

    dag = DAG("survey")
    choose_task, freq_task, heter_task, jellyfish_task, gse_scope_task, denovo_task, stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png, stat_genome, genome, ngs_list = kmer_denovo_tasks(
        r1=clean1,
        r2=clean2,
        name=name,
        kmer_length=kmer_length,
        proportion=proportion,
        kingdom=kingdom,
        thread=thread,
        job_type=job_type,
        queue=queue,
        work_dir=work_dir,
        out_dir=out_dir)
    if asm == "true":
        dag.add_task(denovo_task)
    else:
        # The string "false" is the sentinel for "no assembly was made".
        genome = "false"
        stat_genome = "false"
        ngs_list = "false"

    dag.add_task(choose_task)
    dag.add_task(freq_task)
    dag.add_task(heter_task)
    freq_task.set_upstream(choose_task)
    dag.add_task(jellyfish_task)
    jellyfish_task.set_upstream(choose_task)
    dag.add_task(gse_scope_task)
    heter_task.set_upstream(freq_task)
    gse_scope_task.set_upstream(jellyfish_task)
    do_dag(dag, concurrent, refresh)

    if ngs_list == "false":
        print("Genomics are not assembled")
        # Without an assembly the heterozygosity plot is passed to the
        # report in the GC-depth slot.
        gc_depth_png = heter_png
    else:
        depth_work = mkdir(os.path.join(work_dir, "05_GC-depth"))
        depth_out = mkdir(os.path.join(out_dir, "05_GC-depth"))
        gc_depth_png = run_gc_depth(genome=genome,
                                    fastq_list=ngs_list,
                                    name=name,
                                    window=window,
                                    thread=thread,
                                    job_type=job_type,
                                    concurrent=concurrent,
                                    refresh=refresh,
                                    work_dir=depth_work,
                                    out_dir=depth_out)

    run_report(name, asm, kmer_length, stat_qc, quality, content, gc, poll_tsv,
               poll_png, stat_heter, heter_png, scope_txt, gse_txt, scope_png,
               gse_png, stat_genome, gc_depth_png, out_dir)

    return stat_qc, quality, content, gc, poll_png, poll_tsv, stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png, stat_genome
Пример #7
0
def run_survey(r1,
               r2,
               name,
               trim,
               kingdom,
               mode,
               cratio,
               kmer_length,
               kmer_depth,
               thread,
               asm,
               window,
               job_type,
               queue,
               concurrent,
               refresh,
               work_dir,
               out_dir,
               split=""):
    """Run QC, the k-mer/assembly analysis and the report for one sample.

    ``run_ngs_qc`` works inside the ``01_data`` sub-directories of
    ``work_dir`` and ``out_dir``; its cleaned reads are then handed to
    ``run_kmer_denovo`` with ``platform="illumina"``. All collected result
    values are finally passed to ``run_report``. Nothing is returned.
    """
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    qc_work = os.path.join(work_dir, "01_data")
    qc_out = os.path.join(out_dir, "01_data")
    qc_results = run_ngs_qc(
        r1=r1,
        r2=r2,
        name=name,
        trim=trim,
        kingdom=kingdom,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=qc_work,
        out_dir=qc_out,
    )
    (clean1, clean2, taxid, stat_qc, quality, content, gc, cont_tsv,
     cont_png) = qc_results

    # Each cleaned read file is wrapped in a one-element list for the
    # downstream step.
    survey_results = run_kmer_denovo(
        r1=[clean1],
        r2=[clean2],
        taxid=taxid,
        name=name,
        mode=mode,
        cratio=cratio,
        kmer_length=kmer_length,
        kmer_depth=kmer_depth,
        kingdom=kingdom,
        asm=asm,
        window=window,
        thread=thread,
        job_type=job_type,
        queue=queue,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=out_dir,
        split=split,
        platform="illumina",
    )
    (stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png,
     stat_genome, gc_depth) = survey_results

    run_report(name, asm, kmer_length, stat_qc, quality, content, gc,
               cont_tsv, cont_png, stat_heter, heter_png, scope_txt, gse_txt,
               scope_png, gse_png, stat_genome, gc_depth, out_dir)
Пример #8
0
def run_filter_contamination(r1, r2, name, kmer_length, kmer_depth, taxid,
                             kingdom, thread, job_type, concurrent, refresh,
                             work_dir, out_dir, split, mode="fast",
                             cratio=10):
    """Select reads with ``choose_data`` and optionally remove contaminants.

    With ``mode == "fast"`` the reads chosen by ``choose_data`` are returned
    unchanged. For any other mode the reads are split, a contamination
    reference is obtained, and the ``unmap_data`` DAG is run to produce the
    reads returned by ``create_unmap_tasks``.

    Returns:
        A ``(reads, options)`` tuple. ``reads`` is ``[r1, r2]`` in fast mode
        and a one-element list holding the ``create_unmap_tasks`` result
        otherwise; ``options`` collects the software options reported by
        the helper steps.
    """
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)
    taxid = check_path(taxid)
    options = {"software": OrderedDict(), "database": OrderedDict()}

    option, r1, r2 = choose_data(
        r1=r1,
        r2=r2,
        name=name,
        kmer_length=kmer_length,
        kmer_depth=kmer_depth,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=out_dir,
    )
    options["software"].update(option)

    # Fast mode skips the decontamination stage entirely.
    if mode == "fast":
        return [r1, r2], options

    work_dict = {
        "data": "00_data",
        "ref": "01_ref",
        "ump": "02_ump"
    }
    for sub_dir in work_dict.values():
        mkdir(os.path.join(work_dir, sub_dir))

    data_dir = os.path.join(work_dir, work_dict["data"])
    reads = split_data(
        r1=r1,
        r2=r2,
        name=name,
        number=2000000,
        job_type=job_type,
        work_dir=data_dir,
        concurrent=concurrent,
        refresh=refresh,
        out_dir=out_dir,
        platform="illumina",
    )

    dag = DAG("unmap_data")
    ref_dir = os.path.join(work_dir, work_dict["ref"])
    ref_task, ref = obtain_contamination_task(
        taxid=taxid,
        name=name,
        kingdom=kingdom,
        job_type=job_type,
        work_dir=ref_dir,
        out_dir=out_dir,
        mode=mode,
        cratio=cratio,
    )
    dag.add_task(ref_task)

    unmap_dir = os.path.join(work_dir, work_dict["ump"])
    unmap_tasks, reads, option = create_unmap_tasks(
        name=name,
        reference=ref,
        reads=reads,
        thread=thread,
        job_type=job_type,
        work_dir=unmap_dir,
        out_dir=out_dir,
        split=split,
    )
    dag.add_task(*unmap_tasks)
    # Every unmap task waits for the reference task.
    ref_task.set_downstream(*unmap_tasks)
    do_dag(dag, concurrent, refresh)
    options["software"].update(option)

    return [reads], options
Пример #9
0
def run_kmer_denovo(r1, r2, name, kingdom, kmer_length, sample_depth, thread,
                    asm, window, job_type, queue, concurrent, refresh,
                    work_dir, out_dir):
    """Merge raw reads, estimate k-mer depth and run the survey DAG.

    Steps, as executed below:

    1. Merge the raw reads (``zcat`` when the first R1 or R2 file ends with
       ``.gz``, else ``cat``) and run a first k-mer frequency task in the
       ``survey_data`` DAG.
    2. Read ``kmer_depth`` from the k-mer statistics file and derive the
       sampling ``proportion`` (``sample_depth / kmer_depth``, capped at 1).
    3. Build and run the ``survey`` DAG; the assembly task is only added
       when ``asm == "true"``.
    4. Run ``run_gc_depth`` when an assembly was requested.

    Returns:
        A 7-tuple: ``stat_heter, heter_png, scope_txt, gse_txt, scope_png,
        gse_png, stat_genome``.

    Raises:
        ValueError: if the k-mer statistics file contains no ``kmer_depth``
            row or the reported depth is zero.
    """
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    if r1[0].endswith(".gz") or r2[0].endswith(".gz"):
        tools = "zcat"
    else:
        tools = "cat"

    dag_data = DAG("survey_data")

    data_work = mkdir(os.path.join(work_dir, "choose_data"))
    cat_data_task, clean1, clean2 = merge_raw_data_task(name=name,
                                                        r1=" ".join(r1),
                                                        r2=" ".join(r2),
                                                        tools=tools,
                                                        job_type=job_type,
                                                        work_dir=data_work,
                                                        out_dir=data_work)

    freq_task1, histo1, kmer_stat, estimate1 = kmerfreq_task(
        r1=clean1,
        r2=clean2,
        name=name,
        kmer_length=17,
        thread=thread,
        job_type=job_type,
        work_dir=data_work,
        out_dir=data_work)

    dag_data.add_task(cat_data_task)
    dag_data.add_task(freq_task1)
    freq_task1.set_upstream(cat_data_task)
    do_dag(dag_data, concurrent, refresh)

    # Start from None so a missing row is detected explicitly instead of
    # surfacing later as an UnboundLocalError.
    kmer_depth = None
    for line in read_tsv(kmer_stat):
        if line and line[0] == "kmer_depth":
            kmer_depth = int(line[1])

    # A missing or zero depth would otherwise crash in the division below.
    if not kmer_depth:
        raise ValueError(
            "kmer_depth is missing or zero in %s" % kmer_stat)

    if sample_depth > kmer_depth:
        LOG.debug(
            'The amount of sequencing data may be insufficient. Sequencing depth is only %s X'
            % kmer_depth)
        sample_depth = kmer_depth
    proportion = sample_depth * 1.0 / kmer_depth

    dag = DAG("survey")

    choose_task, freq_task, heter_task, jellyfish_task, gse_scope_task, denovo_task, stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png, stat_genome, genome, ngs_list = kmer_denovo_tasks(
        r1=clean1,
        r2=clean2,
        name=name,
        kmer_length=kmer_length,
        proportion=proportion,
        kingdom=kingdom,
        thread=thread,
        job_type=job_type,
        queue=queue,
        work_dir=work_dir,
        out_dir=out_dir)
    if asm == "true":
        dag.add_task(denovo_task)
    else:
        # The string "false" is the sentinel for "no assembly was made".
        genome = "false"
        stat_genome = "false"
        ngs_list = "false"

    dag.add_task(choose_task)
    dag.add_task(freq_task)
    dag.add_task(heter_task)
    freq_task.set_upstream(choose_task)
    dag.add_task(jellyfish_task)
    jellyfish_task.set_upstream(choose_task)
    dag.add_task(gse_scope_task)
    heter_task.set_upstream(freq_task)
    gse_scope_task.set_upstream(jellyfish_task)
    do_dag(dag, concurrent, refresh)

    # NOTE(review): gc_depth_png is computed here but is not part of the
    # returned tuple; the return shape is kept as-is for existing callers.
    if ngs_list == "false":
        print("Genomics are not assembled")
        gc_depth_png = heter_png
    else:
        depth_work = mkdir(os.path.join(work_dir, "05_GC-depth"))
        depth_out = mkdir(os.path.join(out_dir, "05_GC-depth"))
        gc_depth_png = run_gc_depth(genome=genome,
                                    fastq_list=ngs_list,
                                    name=name,
                                    window=window,
                                    thread=thread,
                                    job_type=job_type,
                                    concurrent=concurrent,
                                    refresh=refresh,
                                    work_dir=depth_work,
                                    out_dir=depth_out)
    return stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png, stat_genome
Пример #10
0
def kmer_denovo_tasks(r1, r2, name, kmer_length, proportion, kingdom, thread,
                      job_type, queue, work_dir, out_dir):
    """Create (but do not run) the tasks of the k-mer survey and assembly.

    Builds, in order: the read-sampling task, the k-mer frequency and
    heterozygosity tasks, the jellyfish and GSE/scope tasks, and the
    SOAPdenovo task. The per-stage sub-directories are created on the way.

    Returns:
        A 15-tuple: the six tasks (``choose``, ``freq``, ``heter``,
        ``jellyfish``, ``gse_scope``, ``denovo``) followed by
        ``stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png,
        stat_genome, genome, ngs_list``.
    """
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    clean1 = check_paths(r1)
    clean2 = check_paths(r2)
    work_dict = {
        "choose": "choose_data",
        "gse_scope": "02_gse_scope",
        "kmerfreq": "03_Kmerfreq",
        "denovo": "04_Soapdenovo",
        "gc_depth": "05_GC-depth"
    }

    def _stage_dirs(stage):
        # Create the work directory first, then the output directory.
        sub_dir = work_dict[stage]
        return (mkdir(os.path.join(work_dir, sub_dir)),
                mkdir(os.path.join(out_dir, sub_dir)))

    # Sampling only needs a work directory.
    choose_work = mkdir(os.path.join(work_dir, work_dict["choose"]))
    choose_task, choose_r1, choose_r2, choose_r = sample_fastq_task(
        r1=clean1,
        r2=clean2,
        proportion=proportion,
        name=name,
        job_type=job_type,
        work_dir=choose_work,
    )

    heter_work, heter_out = _stage_dirs("kmerfreq")
    # NOTE(review): this step always uses k=17 regardless of kmer_length;
    # confirm that is intended.
    freq_task, histo, kmer_depth, estimate = kmerfreq_task(
        r1=choose_r1,
        r2=choose_r2,
        name=name,
        kmer_length=17,
        thread=thread,
        job_type=job_type,
        work_dir=heter_work,
        out_dir=heter_out,
    )
    heter_task, stat_heter, heter_png = get_heterozygosity_task(
        histo=histo,
        estimate=estimate,
        kingdom=kingdom,
        name=name,
        job_type=job_type,
        work_dir=heter_work,
        out_dir=heter_out,
    )

    scope_work, scope_out = _stage_dirs("gse_scope")
    jellyfish_task, histogram = get_jellyfish_task(
        fastq=choose_r,
        name=name,
        depth=40 * 100,
        thread=thread,
        job_type=job_type,
        work_dir=scope_work,
        out_dir=scope_out,
    )
    gse_scope_task, scope_txt, gse_txt, scope_png, gse_png = get_gse_scope_task(
        histogram=histogram,
        name=name,
        kmer_length=kmer_length,
        job_type=job_type,
        work_dir=scope_work,
        out_dir=scope_out,
    )

    denovo_work, denovo_out = _stage_dirs("denovo")
    # The assembly uses the full cleaned reads and twice the thread count.
    denovo_task, genome, stat_genome, ngs_list = soapdenovo_task(
        r1=clean1,
        r2=clean2,
        name=name,
        thread=thread * 2,
        queue=queue,
        job_type=job_type,
        work_dir=denovo_work,
        out_dir=denovo_out,
    )

    tasks = (choose_task, freq_task, heter_task, jellyfish_task,
             gse_scope_task, denovo_task)
    outputs = (stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png,
               stat_genome, genome, ngs_list)
    return tasks + outputs
Пример #11
0
def run_kmer_denovo(r1,
                    r2,
                    taxid,
                    name,
                    mode,
                    cratio,
                    kmer_length,
                    kmer_depth,
                    kingdom,
                    asm,
                    window,
                    thread,
                    job_type,
                    queue,
                    concurrent,
                    refresh,
                    work_dir,
                    out_dir,
                    split,
                    platform="illumina"):
    """Filter contamination, run the k-mer survey and optionally assemble.

    After ``run_filter_contamination`` the ``kmer_denovo`` DAG is filled
    with the jellyfish/GSE-scope and k-mer-frequency/heterozygosity tasks;
    the SOAPdenovo task is only added when ``asm == "true"``, in which case
    ``run_gc_depth`` is executed afterwards as well. The collected software
    options are written to ``<out_dir>/kmer_denovo.json``.

    Returns:
        An 8-tuple: ``stat_heter, heter_png, scope_txt, gse_txt, scope_png,
        gse_png, stat_genome, gc_depth``. Without an assembly
        ``stat_genome`` is the string ``"false"`` and ``gc_depth`` is
        ``heter_png``.
    """
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    work_dict = {
        "contamination": "01_contamination",
        "gse_scope": "02_gse_scope",
        "kmerfreq": "03_Kmerfreq",
        "denovo": "04_Soapdenovo",
        "gc_depth": "05_GC-depth"
    }

    # Every stage gets a work directory; all but the contamination stage
    # also get an output directory.
    for stage, sub_dir in work_dict.items():
        mkdir(os.path.join(work_dir, sub_dir))
        if stage != "contamination":
            mkdir(os.path.join(out_dir, sub_dir))

    def _work(stage):
        return os.path.join(work_dir, work_dict[stage])

    def _out(stage):
        return os.path.join(out_dir, work_dict[stage])

    reads, options = run_filter_contamination(
        r1=r1,
        r2=r2,
        name=name,
        kmer_length=kmer_length,
        kmer_depth=kmer_depth,
        taxid=taxid,
        kingdom=kingdom,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=_work("contamination"),
        out_dir=out_dir,
        mode=mode,
        cratio=cratio,
        split=split,
    )

    # The helper steps take their inputs as space-separated strings.
    joined_reads = " ".join(reads)
    joined_r1 = " ".join(r1)
    joined_r2 = " ".join(r2)
    assemble = asm == "true"

    dag = DAG("kmer_denovo")
    jellyfish_task, gse_scope_task, scope_txt, gse_txt, scope_png, gse_png, option = gse_scope(
        reads=joined_reads,
        name=name,
        kmer_length=kmer_length,
        thread=thread,
        job_type=job_type,
        work_dir=_work("gse_scope"),
        out_dir=_out("gse_scope"),
        mode=mode,
    )
    options["software"].update(option)
    dag.add_task(jellyfish_task)
    dag.add_task(gse_scope_task)

    kmerfreq_task, heter_task, stat_heter, heter_png, option = kmerfreq(
        reads=joined_reads,
        name=name,
        kingdom=kingdom,
        kmer_length=kmer_length,
        thread=thread,
        job_type=job_type,
        work_dir=_work("kmerfreq"),
        out_dir=_out("kmerfreq"),
    )
    options["software"].update(option)
    dag.add_task(kmerfreq_task)
    dag.add_task(heter_task)

    denovo_task, genome, stat_genome, option = create_soapdenovo_task(
        r1=joined_r1,
        r2=joined_r2,
        name=name,
        thread=thread,
        queue=queue,
        job_type=job_type,
        work_dir=_work("denovo"),
        out_dir=_out("denovo"),
    )
    if assemble:
        dag.add_task(denovo_task)
    else:
        # The string "false" is the sentinel for "no assembly was made".
        genome = "false"
        stat_genome = "false"
    do_dag(dag, concurrent, refresh)

    if assemble:
        gc_depth = run_gc_depth(
            genome=genome,
            r1=joined_r1,
            r2=joined_r2,
            name=name,
            platform=platform,
            split="no_split",
            window=window,
            thread=thread,
            job_type=job_type,
            concurrent=concurrent,
            refresh=refresh,
            work_dir=_work("gc_depth"),
            out_dir=_out("gc_depth"),
        )
    else:
        # Without an assembly the heterozygosity plot fills the GC-depth slot.
        gc_depth = heter_png

    json_path = os.path.join(out_dir, "kmer_denovo.json")
    with open(json_path, "w") as handle:
        json.dump(options, handle, indent=2)

    return (stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png,
            stat_genome, gc_depth)