Python get_config示例，baseq.mgt.get_config Python示例

示例#1

0

显示文件

文件： gatk.py 项目： basedata10/baseq

def selectvar(rawvcf, selectvcf, filtervcf, genome, run=True):
    """
    Select SNPs from a raw VCF (usually HaplotypeCaller output) and filter them.

    The shell command is rendered from the module-level ``selectvar_cmd_script``
    template. Per the pipeline documentation, the filter criteria are
    "QD < 2.0 || FS > 60.0 || MQ < 40.0", based on INFO and/or FORMAT annotations.
    More details about SelectVariants_ and VariantFiltration_

    .. _SelectVariants: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_variantutils_SelectVariants.php
    .. _VariantFiltration: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_filters_VariantFiltration.php

    Args:
        rawvcf: input VCF containing the raw calls.
        selectvcf: output VCF for the selected SNPs.
        filtervcf: output VCF for the filtered SNPs.
        genome: genome name; the reference is looked up under "SNV_ref_<genome>".
        run: when true the command is executed, otherwise it is only rendered.

    Returns:
        The rendered command string.

    Usage:
    ::
      baseq-SNV run_selectvar -r Test.raw.indel.snp.vcf -s Test.raw.snp.vcf -f Test.filtered.snp.vcf -g hg38

    Return:
    ::
      Test.raw.snp.vcf
      Test.filtered.snp.vcf
    """
    template_values = {
        "gatk": get_config("SNV", "GATK"),
        "index": get_config("SNV_ref_" + genome, "bwa_index"),
        "rawvcf": rawvcf,
        "selectvcf": selectvcf,
        "filtervcf": filtervcf,
    }
    command = selectvar_cmd_script.format(**template_values)
    if not run:
        return command
    run_cmd("SelectVariants", command)
    return command

示例#2

0

显示文件

文件： gatk.py 项目： basedata10/baseq

def run_callvar(bqsrbam, rawvcf, genome, disable_dup_filter=False):
    """
    Call germline SNPs and indels via local re-assembly of haplotypes.

    A BAM file recalibrated by BQSR is the recommended input. This function only
    runs single-sample calling. More details see also HaplotypeCaller_

    .. _HaplotypeCaller: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_haplotypecaller_HaplotypeCaller.php

    Args:
        bqsrbam: input BAM file.
        rawvcf: output VCF file.
        genome: genome name; reference and interval are looked up under
            "SNV_ref_<genome>".
        disable_dup_filter: when true, "--disable-read-filter NotDuplicateReadFilter"
            is passed to the command template as extra options.

    Returns:
        The rendered command string (the command is always executed).

    Usage:
    ::
      baseq-SNV run_callvar -q Test.marked.bqsr.bam -r Test.raw.indel.snp.vcf -g hg38

    Return:
    ::
      Test.raw.indel.snp.vcf

    """
    gatk_path = get_config("SNV", "GATK")
    reference = get_config("SNV_ref_" + genome, "bwa_index")
    regions = get_config("SNV_ref_" + genome, "interval")
    read_filter_opt = (
        "--disable-read-filter NotDuplicateReadFilter" if disable_dup_filter else ""
    )
    command = callvar_cmd_script.format(
        gatk=gatk_path,
        index=reference,
        interval=regions,
        bqsrbam=bqsrbam,
        rawvcf=rawvcf,
        extrainfos=read_filter_opt,
    )
    run_cmd("call variants", command)
    return command

示例#3

0

显示文件

文件： bowtie2.py 项目： basedata10/baseq

def bowtie2_sort(fq1, fq2, bamfile, genome, reads=5*1000*1000, thread=8):
    """
    Align reads with bowtie2, then sort, index and summarise the result with samtools.

    Args:
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file; when falsy, fq1 is aligned as unpaired reads (-U).
        bamfile: path of the sorted BAM to produce. The intermediate SAM is
            written to ``bamfile + ".sam"`` and removed after sorting; flagstat
            output goes to ``bamfile + ".stat"``.
        genome: passed directly to bowtie2 as the ``-x`` index argument.
        reads: value for bowtie2 ``-u``.
        thread: thread count for bowtie2 ``-p`` and samtools sort ``-@``.

    Returns:
        ``bamfile`` unchanged.
    """
    bowtie2 = get_config("CNV", "bowtie2")
    samtools = get_config("CNV", "samtools")

    sam_path = bamfile + ".sam"
    stats_path = bamfile + ".stat"

    print("[info] Bamfile Path : {}".format(bamfile))

    # Alignment: options shared by both modes, then the mode-specific inputs.
    align_args = [bowtie2, '-p', str(thread), '-x', genome, '-u', str(reads)]
    if fq1 and fq2:
        align_args += ['-1', fq1, '-2', fq2]
    else:
        align_args += ['-U', fq1]
    align_args += ['>', sam_path]
    run_cmd("bowtie alignment", " ".join(align_args))

    # Sort, index, then drop the intermediate SAM in one shell line.
    sort_step = [samtools, 'sort -@ ', str(thread), '-o', bamfile, sam_path]
    index_step = [";", samtools, "index", bamfile]
    cleanup_step = ["; rm", sam_path]
    run_cmd("samtools sort", " ".join(sort_step + index_step + cleanup_step))

    # Alignment statistics.
    run_cmd("samtools stats",
            " ".join([samtools, "flagstat", bamfile, ">", stats_path]))

    return bamfile

示例#4

0

显示文件

文件： gatk.py 项目： basedata10/baseq

def run_markdup(bamfile, markedbam):
    """
    Run Picard MarkDuplicates on a BAM file.

    Per the pipeline documentation this tags duplicate reads in the BAM file.
    The command is rendered from the module-level ``markdup_cmd_script`` template.
    See also MarkDuplicates_ in GATK.


    .. _MarkDuplicates: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_markduplicates_MarkDuplicates.php

    Args:
        bamfile: input BAM file.
        markedbam: output BAM file with duplicates marked.

    Returns:
        The rendered command string (the command is always executed).

    Usage:
    ::
      baseq-SNV run_markdup -b Test.bam -m Test.marked.bam

    Return:
    The metrics file reports the number of duplicates for both single- and paired-end reads
    ::
      Test.marked.bam
      Test.marked.bam.bai
      Test.marked.bam.metrics
    """
    tools = {
        "java": get_config("SNV", "java"),
        "picard": get_config("SNV", "picard"),
        # NOTE(review): samtools is read from the "RNA" section here while the
        # other SNV steps read it from "SNV" -- confirm this is intentional.
        "samtools": get_config("RNA", "samtools"),
    }
    command = markdup_cmd_script.format(markedbam=markedbam,
                                        bamfile=bamfile,
                                        **tools)
    run_cmd("Mark duplicates", command)
    return command

示例#5

0

显示文件

def run_cufflinks(genome, method):
    """
    Build the cufflinks command line for a given aligner's output BAM.

    Args:
        genome: genome name; the annotation is looked up under
            "RNA_ref_<genome>" / "cufflinks_anno".
        method: aligner that produced the BAM, "star" or "hisat". It selects
            the BAM file name cufflinks is pointed at.

    Returns:
        The cufflinks command string (it is not executed here).

    Raises:
        ValueError: if ``method`` is not "star" or "hisat". Previously this
            case fell through and failed with an UnboundLocalError.
    """
    # BAM file name each supported aligner writes into the working directory.
    bam_by_method = {
        "star": "Aligned.sortedByCoord.out.bam",
        "hisat": "hisat2_sorted.bam",
    }
    if method not in bam_by_method:
        raise ValueError(
            "Unsupported alignment method {!r}; expected one of {}".format(
                method, sorted(bam_by_method)))

    cufflinks = get_config("RNA", "cufflinks")
    cufflinks_anno = get_config("RNA_ref_" + genome, "cufflinks_anno")
    return "{0} -q -o ./ -p 8 -G {1} {2}".format(
        cufflinks, cufflinks_anno, bam_by_method[method])

示例#6

0

显示文件

文件： annovar.py 项目： basedata10/baseq

def run_annovar(filtervcf, annovarfile, name, genome, run=True):
    """
    Build (and optionally run) the ANNOVAR annotation command for a VCF file.

    Args:
        filtervcf: input VCF file.
        annovarfile: passed to the command template as ``annovarfile``.
        name: passed to the command template as ``name``.
        genome: "hg38", "hg19" or "hg37". "hg37" is treated as "hg19" for both
            the database lookup and the value passed to the template.
        run: when true the command is executed, otherwise it is only rendered.

    Returns:
        The rendered command string.

    Raises:
        ValueError: if ``genome`` is not one of the supported builds.
            Previously this failed later with an UnboundLocalError on ``ref``.
    """
    annovar = get_config("Annovar", "annovar")
    if genome == "hg38":
        ref = get_config("Annovar", "annovar_db_hg38")
    elif genome in ("hg37", "hg19"):
        ref = get_config("Annovar", "annovar_db_hg19")
        genome = "hg19"
    else:
        raise ValueError(
            "Unsupported genome {!r}; expected 'hg38', 'hg19' or 'hg37'".format(
                genome))
    annovar_cmd = annovar_cmd_script.format(annovar=annovar,
                                            filtervcf=filtervcf,
                                            annovarfile=annovarfile,
                                            ref_annovar=ref,
                                            name=name,
                                            genome=genome)
    if run:
        run_cmd("convert vcf file to aninput format", annovar_cmd)
    return annovar_cmd

示例#7

0

显示文件

文件： deseq2.py 项目： basedata10/baseq

def deseq2(config, tpmfile, countfile, groupfile, comparefile, outpath):
    """Run the DESeq2.R script on TPM/count tables.

    Args:
        config: optional Excel workbook with "sample" and "compare" sheets.
            When given, both sheets are read and printed; they are not yet
            converted into group/compare files.
        tpmfile: TPM table passed to the R script.
        countfile: count table passed to the R script.
        groupfile: group file giving the group name of each sample
            (samplename/groups/).
        comparefile: compare file listing which groups should be compared
            (compare_name/group1/group2).
        outpath: output directory; created if it does not exist.

    Exits the process with an error message if the R script fails.
    """
    if config:
        df_cfg = pd.read_excel(config, sheet_name=["sample", "compare"])
        print(df_cfg["sample"])
        print(df_cfg["compare"])
        # TODO: write the sample file and group compare file from the sheets.

    Rscript = get_config("RNA", "deseq")
    script = os.path.join(r_script_dir, "DESeq2.R")
    cmd = "{} {} {} {} {} {} {}".format(Rscript, script, tpmfile, countfile,
                                        groupfile, comparefile, outpath)

    if not os.path.exists(outpath):
        os.mkdir(outpath)
        print("[info] Create OutDir {}".format(outpath))
    try:
        run_cmd("DESeq2", cmd)
    except Exception as err:
        # Catch Exception rather than everything so that Ctrl-C and
        # SystemExit are not turned into a misleading error message.
        sys.exit("[error] Failed to run the DESeq2 Rscript: {}".format(err))

示例#8

0

显示文件

文件： gatk.py 项目： basedata10/baseq

def run_createsomatic_pon(path_pon, path):
    """
    Combine the VCFs found under ``path_pon`` into a panel of normals.

    Args:
        path_pon: location handed to ``listofvcf`` to collect the input VCF
            arguments.
        path: directory in which "pon.vcf.gz" is written.
    """
    gatk_path = get_config("SNV", "GATK")
    command = pon_cmd_script.format(
        gatk=gatk_path,
        normalvcfs=listofvcf(path_pon),
        ponvcf=os.path.join(path, "pon.vcf.gz"),
    )
    run_cmd("create panel of normals", command)

示例#9

0

显示文件

文件： gatk.py 项目： basedata10/baseq

def alignment(fq1, fq2, sample, genome, thread=8):
    """
    Map low-divergent sequences against a reference genome using BWA.

    Per the pipeline documentation, a ReadGroup (more details about ReadGroup_ )
    is added to the BAM file using the input sample name, and the output is a
    sorted, indexed BAM for downstream analysis.

    .. _ReadGroup: https://software.broadinstitute.org/gatk/documentation/article.php?id=6472

    Args:
        fq1: first (or only) FASTQ file; must exist.
        fq2: second FASTQ file. Paired-end mode is used only when both files
            exist; otherwise fq1 is aligned as single-end.
        sample: sample name, also used as the prefix of intermediate files
            (``<sample>.sam``, ``<sample>.view.bam``).
        genome: genome name; the BWA index is looked up under "SNV_ref_<genome>".
        thread: thread count passed to the BWA command template.

    Returns:
        The BWA command and the samtools command, joined by a newline.

    Raises:
        FileNotFoundError: if fq1 is missing or does not exist. Previously this
            case failed with an UnboundLocalError on ``bwa_cmd``.

    Usage:
    ::
      baseq-SNV run_bwa -1 Reads.1.fq.gz -2 Read.2.fq.gz -g hg38 -n Test

    Return:
    ::
      Test.bam
      Test.bam.bai

    """
    bwa = get_config("SNV", "bwa")
    samtools = get_config("SNV", "samtools")
    bwa_index = get_config("SNV_ref_" + genome, "bwa_index")
    viewedbam = sample + ".view.bam"
    samfile = sample + ".sam"
    if fq1 and fq2 and os.path.exists(fq1) and os.path.exists(fq2):
        bwa_cmd = bwa_cmd_script_p.format(bwa=bwa,
                                          sample=sample,
                                          genome=bwa_index,
                                          fq1=fq1,
                                          fq2=fq2,
                                          samfile=samfile,
                                          thread=thread)
    elif fq1 and os.path.exists(fq1):
        bwa_cmd = bwa_cmd_script_s.format(bwa=bwa,
                                          sample=sample,
                                          genome=bwa_index,
                                          fq1=fq1,
                                          samfile=samfile,
                                          thread=thread)
    else:
        raise FileNotFoundError(
            "[error] FASTQ input not found: fq1={!r}, fq2={!r}".format(fq1, fq2))
    sort_index_cmd = sort_index_cmd_script.format(samtools=samtools,
                                                  sample=sample,
                                                  samfile=samfile,
                                                  viewedbam=viewedbam)
    run_cmd("bwa alignment", bwa_cmd)
    run_cmd("samtools sort", sort_index_cmd)
    return bwa_cmd + "\n" + sort_index_cmd

示例#10

0

显示文件

文件： gatk.py 项目： basedata10/baseq

def filter_mutect_vcf(somaticvcf, calcontam_table, filter_call):
    """
    Filter Mutect calls using a contamination table.

    The command is rendered from the module-level ``filtercall_cmd_script``
    template and executed immediately.

    Args:
        somaticvcf: passed to the template as ``somaticvcf``.
        calcontam_table: passed to the template as ``calcontam_table``.
        filter_call: passed to the template as ``filter_call``.
    """
    template_values = {
        "gatk": get_config("SNV", "GATK"),
        "somaticvcf": somaticvcf,
        "calcontam_table": calcontam_table,
        "filter_call": filter_call,
    }
    command = filtercall_cmd_script.format(**template_values)
    run_cmd("filter mutect calls using contamination table", command)

示例#11

0

显示文件

def run_salmon(fq1, fq2, genome, outdir):
    """
    Quantify reads with ``salmon quant``.

    Paired-end mode is used when both FASTQ files exist; single-end mode is
    used when only fq1 exists. The process exits if fq1 is unusable.

    Args:
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file, may be falsy for single-end data.
        genome: genome name; index and gene map are looked up under
            "RNA_ref_<genome>".
        outdir: output directory passed to salmon via ``-o``.

    Returns:
        The salmon argument list (the executed command is its space-joined form).
    """
    salmon = get_config("RNA", "salmon")
    salmon_ref = get_config("RNA_ref_" + genome, "salmon_index")
    gene_map = get_config("RNA_ref_" + genome, "gene_map")

    # Neither mode can run without a readable first FASTQ file.
    if not (fq1 and os.path.exists(fq1)):
        sys.exit("[error]")

    if fq2 and os.path.exists(fq2):
        read_args = ['-1', fq1, '-2', fq2]
    else:
        read_args = ['-r', fq1]

    head = [salmon, 'quant', '-i', salmon_ref, '-l A']
    tail = ['-p 8', '-g', gene_map, '-o', outdir]
    salmon_cmd = head + read_args + tail
    run_cmd("Salmon Quantification", " ".join(salmon_cmd))
    return salmon_cmd

示例#12

0

显示文件

文件： gatk.py 项目： basedata10/baseq

def get_filter_table(tumorbam, resource, gps_table, calcontam_table):
    """
    Produce the filter table used for Mutect call filtering.

    The command is rendered from the module-level ``filtertable_cmd_script``
    template and executed immediately.

    Args:
        tumorbam: passed to the template as ``tumorbam``.
        resource: passed to the template as ``resource``.
        gps_table: passed to the template as ``gps_table``.
        calcontam_table: passed to the template as ``calcontam_table``.
    """
    template_values = {
        "gatk": get_config("SNV", "GATK"),
        "tumorbam": tumorbam,
        "resource": resource,
        "gps_table": gps_table,
        "calcontam_table": calcontam_table,
    }
    command = filtertable_cmd_script.format(**template_values)
    run_cmd("obatin filter table for mutect calls", command)

示例#13

0

显示文件

文件： star.py 项目： basedata10/baseq

def run_star(fq1, fq2, genome, outdir, run=True):
    """
    Build (and optionally run) the STAR alignment and cufflinks commands.

    Paired-end mode is used when both FASTQ files exist; single-end mode is
    used when only fq1 exists.

    Args:
        fq1: first (or only) FASTQ file; must exist.
        fq2: second FASTQ file, may be falsy for single-end data.
        genome: genome name; the STAR index is looked up under "RNA_ref_<genome>".
        outdir: output directory; created if it does not exist.
        run: when true both commands are executed, otherwise only rendered.

    Returns:
        The STAR command and the cufflinks command, joined by a newline.

    Raises:
        FileNotFoundError: if fq1 is missing or does not exist. Previously this
            case failed with an UnboundLocalError on ``star_cmd``.
    """
    star = get_config("RNA", "star")
    star_index = get_config("RNA_ref_" + genome, "star_index")
    samtools = get_config("RNA", "samtools")
    if not os.path.exists(outdir):
        os.mkdir(outdir)
        print("[info] Create outdir in: {}".format(outdir))
    # Pick the paired-end or single-end command template.
    if fq1 and fq2 and os.path.exists(fq1) and os.path.exists(fq2):
        star_cmd = script.format(outdir, star, star_index, fq1, fq2, samtools)
    elif fq1 and os.path.exists(fq1):
        star_cmd = script1.format(outdir, star, star_index, fq1, samtools)
    else:
        raise FileNotFoundError(
            "[error] FASTQ input not found: fq1={!r}, fq2={!r}".format(fq1, fq2))
    cufflinks_cmd = run_cufflinks(genome, method="star")
    if run:
        run_cmd("star analysis", star_cmd)
        run_cmd("cufflinks analysis", cufflinks_cmd)
    return star_cmd + "\n" + cufflinks_cmd

示例#14

0

显示文件

文件： gatk.py 项目： basedata10/baseq

def bqsr(markedbam, bqsrbam, genome, disable_dup_filter=False):
    """
    Run BQSR_ (base quality score recalibration).

    Per the pipeline documentation this is a two-step process: the first step
    builds a recalibration table from various covariates, and the second step
    uses it to correct systematic bias in the input BAM file. More details
    about BaseRecalibrator_ and ApplyBQSR_ .


    .. _BQSR: https://gatkforums.broadinstitute.org/gatk/discussion/44/base-quality-score-recalibration-bqsr
    .. _BaseRecalibrator: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_BaseRecalibrator.php
    .. _ApplyBQSR: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_ApplyBQSR.php

    Args:
        markedbam: input BAM file.
        bqsrbam: output recalibrated BAM file.
        genome: genome name; reference, known-sites files and interval are
            looked up under "SNV_ref_<genome>".
        disable_dup_filter: selects the ``bqsr_cmd_script_DRF`` template instead
            of ``bqsr_cmd_script``.

    Usage:

    * Default mode filters duplicate reads (reads with "markduplicate" tags) before applying BQSR
      ::
         baseq-SNV run_bqsr -m Test.marked.bam -g hg38 -q Test.marked.bqsr.bam

    * Disable reads filter before analysis.
      ::
        baseq-SNV run_bqsr -m Test.marked.bam -g hg38 -q Test.marked.bqsr.bam -f Yes

    Return:
    ::
      Test.marked.bam.table
      Test.marked.bqsr.bai
      Test.marked.bqsr.bam
    """
    ref_section = "SNV_ref_" + genome
    template_values = {
        "gatk": get_config("SNV", "GATK"),
        "index": get_config(ref_section, "bwa_index"),
        "dbsnp": get_config(ref_section, "DBSNP"),
        "snp": get_config(ref_section, "SNP"),
        "indel": get_config(ref_section, "INDEL"),
        "interval": get_config(ref_section, "interval"),
        "markedbam": markedbam,
        "bqsrbam": bqsrbam,
    }

    # Both templates take the same placeholders; only the template differs.
    template = bqsr_cmd_script_DRF if disable_dup_filter else bqsr_cmd_script
    command = template.format(**template_values)
    run_cmd("BaseRecalibrator", command)

示例#15

0

显示文件

def bin_counting(genome, bamfile, out):
    """
    Count reads per dynamic bin, locating each read's bin with bisect.

    The bin table is read from the "dynamic_bin" entry of "CNV_ref_<genome>"
    and must provide "chr", "start" and "end" columns. Reads are streamed from
    ``samtools view``; reads with mapping quality below 40 or on chromosomes
    absent from the bin table are skipped.

    A read at position ``pos`` is counted in the bin whose start is the largest
    start <= pos, provided that bin's end is >= pos. (The previous code used
    the index returned by ``bisect_right`` directly, which is the *next* bin,
    so reads were credited one bin too far and the end check never rejected
    anything.)

    Assumes bin starts are sorted ascending within each chromosome, which
    bisect requires -- TODO confirm against the bin file.

    Args:
        genome: genome name used for the config lookup.
        bamfile: BAM file to read.
        out: output path; the "counts" column is written tab-separated with a header.
    """
    import bisect
    dynamic_bin = get_config("CNV_ref_" + genome, "dynamic_bin")
    df_bin = pd.read_table(dynamic_bin, sep="\t")

    # Per-chromosome bin starts/ends plus each bin's row index in df_bin.
    chr_bin_starts = {}
    chr_bin_ends = {}
    chr_bin_idx = {}
    bin_rows = zip(df_bin["chr"].tolist(),
                   df_bin["start"].tolist(),
                   df_bin["end"].tolist())
    for idx, (chrom, bin_start, bin_end) in enumerate(bin_rows):
        chr_bin_starts.setdefault(chrom, []).append(bin_start)
        chr_bin_ends.setdefault(chrom, []).append(bin_end)
        chr_bin_idx.setdefault(chrom, []).append(idx)

    counts = [0] * len(df_bin)

    # Stream alignments; the context manager closes the pipe and reaps samtools.
    with Popen(["samtools", "view", bamfile],
               stdout=PIPE,
               bufsize=1000000) as reading:
        for raw in reading.stdout:
            infos = raw.decode('utf8').split()
            if len(infos) < 5:
                # Blank or truncated line: nothing to count.
                continue

            # Filter on quality
            if int(infos[4]) < 40:
                continue

            chrom = infos[2]
            pos = int(infos[3])
            starts = chr_bin_starts.get(chrom)
            if starts is None:
                continue

            # bisect_right gives the number of starts <= pos, so the
            # candidate bin is the one just before that insertion point.
            idx_chr = bisect.bisect_right(starts, pos) - 1
            if idx_chr >= 0 and chr_bin_ends[chrom][idx_chr] >= pos:
                counts[chr_bin_idx[chrom][idx_chr]] += 1

    df_bin["counts"] = counts
    print("[info] Bin Count File: {}".format(out))
    df_bin['counts'].to_csv(out, header=True, sep="\t")

示例#16

0

显示文件

文件： gatk.py 项目： basedata10/baseq

def create_pon(genome, list, path, interval):
    """
    Create the PoN (panel of normals) file needed for Mutect2 calling. Per the
    pipeline documentation, the PoN captures common artifactual and germline
    variant sites, and Mutect2 then uses it to filter variants at the site level.

    One VCF per normal sample is produced under ``<path>/pon`` (up to 6 samples
    in parallel), then all of them are combined into ``<path>/pon.vcf.gz``.

    Example of samples list (whitespace delimited):

    * Columns are normal sample name, normal BAM file, tumor sample name, tumor BAM file (the order cannot be changed).
      Only the first two columns are used here. Blank lines are ignored.
      Absolute paths of all BAM files should be given if the BAM directory and the analysis directory differ.
      ::
        N504    N504_marked_bqsr.bam   T504    T504_marked_bqsr.bam
        N505    N505_marked_bqsr.bam   T505    T505_marked_bqsr.bam
        N506    N506_marked_bqsr.bam   T506    T506_marked_bqsr.bam
        N509    N509_marked_bqsr.bam   T509    T509_marked_bqsr.bam
        N510    N510_marked_bqsr.bam   T510    T510_marked_bqsr.bam

    Usage:

    * The interval list defines the genomic regions to which analysis is restricted. For the interval list format and its function, please see here_.
      ::
        # designate an intervals.list
        baseq-SNV create_pon -g hg37 -l sample_list.txt -p ./ -L interval.list
        # Using the default intervals.list
        baseq-SNV create_pon -g hg37 -l sample_list.txt -p ./

    .. _here: https://software.broadinstitute.org/gatk/documentation/article?id=11009

    Args:
        genome: genome name; the reference is looked up under "SNV_ref_<genome>".
        list: path of the sample list file (the name shadows the builtin but is
            kept for caller compatibility).
        path: existing output directory.
        interval: interval list file; when falsy the template without an
            interval placeholder is used.

    Raises:
        FileNotFoundError: if ``path`` does not exist. Previously only a message
            was printed and the function crashed later on an unbound variable.
        ValueError: if a sample line has fewer than two columns.
    """
    import multiprocessing as mp

    index = get_config("SNV_ref_" + genome, "bwa_index")
    gatk = get_config("SNV", "GATK")
    if not os.path.exists(path):
        raise FileNotFoundError(
            "[ERROR] No such file or directory: {}".format(path))
    path_pon = os.path.join(path, "pon")
    if not os.path.exists(path_pon):
        os.mkdir(path_pon)

    with open(list, "r") as handle:
        sample_info = [line.split() for line in handle if line.strip()]

    # Render every per-sample command first so bad input fails before any
    # worker process is started.
    commands = []
    for sample in sample_info:
        if len(sample) < 2:
            raise ValueError(
                "Sample line needs at least a name and a BAM file: {!r}".format(
                    sample))
        normalvcf = os.path.join(path_pon,
                                 "{}_tumor-only.vcf.gz".format(sample[0]))
        if interval:
            normalvcf_cmd = normalvcf_cmd_script.format(gatk=gatk,
                                                        index=index,
                                                        normalbam=sample[1],
                                                        samplename=sample[0],
                                                        normalvcf=normalvcf,
                                                        interval=interval)
        else:
            normalvcf_cmd = normalvcf_cmd_script1.format(gatk=gatk,
                                                         index=index,
                                                         normalbam=sample[1],
                                                         samplename=sample[0],
                                                         normalvcf=normalvcf)
        commands.append(normalvcf_cmd)

    pool = mp.Pool(processes=6)
    try:
        results = [
            pool.apply_async(run_cmd, ("creat normal vcf file", command))
            for command in commands
        ]
        pool.close()
        pool.join()
    finally:
        # No-op after a clean close/join; stops the workers if setup failed.
        pool.terminate()
    # Surface any exception raised inside a worker.
    for result in results:
        result.get()

    normalargs = listofvcf(path_pon)
    ponvcf = os.path.join(path, "pon.vcf.gz")
    pon_cmd = pon_cmd_script.format(gatk=gatk,
                                    normalvcfs=normalargs,
                                    ponvcf=ponvcf)
    run_cmd("create panel of normals", pon_cmd)

示例#17

0

显示文件

文件： gatk.py 项目： basedata10/baseq

def mutect2(genome, normalname, normalbam, tumorname, tumorbam, vcffile, pon,
            germline):
    """
    Run Mutect2 to call somatic SNVs and indels via local assembly of haplotypes.

    Both a tumor BAM file and its matched normal BAM file are required.
    tumorname and normalname should match the ReadGroup(ID) of the tumor and
    normal BAM files respectively. PoN refers to the panel of normals callset
    (for more information about PoN and how to create it, please see PoN_ ).
    The germline resource, also in VCF format, is used to annotate variant
    alleles. The default germline resource is downloaded from here_ .

    When ``pon`` is given the standard template is used, and ``germline`` falls
    back to the "germline" entry of "SNV_ref_<genome>" if not supplied. Without
    ``pon`` the simplified template is used and ``germline`` is ignored.

    .. _here: https://software.broadinstitute.org/gatk/download/bundle
    .. _PoN: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_mutect_CreateSomaticPanelOfNormals.php

    Usage:

    * Simplified Mutect2 command line
      ::
        # single sample
        baseq-SNV run_mutect2 -g hg37 -n normal -N normal_marked_bqsr.bam \\
                                      -t tumor -T tumor_marked_bqsr.bam -o ./
        # multiple samples
        baseq-SNV run_mutect2 -g hg37 -l sample_list.txt -o ./

    * Specify PoN(panels of normals) VCF file and germline VCF file
      The default germline VCF file comes from the GATK resource bundle and is used if germline isn't designated.
      ::
        # single sample
        baseq-SNV run_mutect2 -g hg37 -n normal -N normal_marked_bqsr.bam \\
                                      -t tumor -T tumor_marked_bqsr.bam -o ./ \\
                                      -p pon.vcf.gz -G af-only-gnomad.raw.sites.b37.vcf.gz
        # multiple samples
        baseq-SNV run_mutect2 -g hg37 -l sample_list.txt -o ./ \\
                                      -p pon.vcf.gz -G af-only-gnomad.raw.sites.b37.vcf.gz

    """
    # Placeholders common to both command templates.
    template_values = {
        "gatk": get_config("SNV", "GATK"),
        "index": get_config("SNV_ref_" + genome, "bwa_index"),
        "normalbam": normalbam,
        "normalname": normalname,
        "tumorbam": tumorbam,
        "tumorname": tumorname,
        "vcffile": vcffile,
    }

    if pon:
        template = mutect2_cmd_stardand_script
        template_values["germline"] = germline or get_config(
            "SNV_ref_" + genome, "germline")
        template_values["pon"] = pon
    else:
        template = mutect2_cmd_simplified_script

    run_cmd("mutect annlysis", template.format(**template_values))

示例#18

0

显示文件

def Normalize_GC_py(genome, bincount, outpath):
    """Normalize raw bin counts by bin length and GC content, then estimate ploidy.

    Args:
        genome : genome name; the bin table is read from the "dynamic_bin"
            entry of "CNV_ref_<genome>" and must provide the columns
            chr, start, absstart, GC and length.
        bincount : tab-delimited file with a 'counts' column, row-aligned with
            the bin table.
        outpath : path of the tab-separated output table.
    Process:
        Every 10 consecutive bins are aggregated (labelled "500kb" in the
        log and plot titles);
        Counts are divided by bin length and scaled to mean 1;
        The log of the scaled counts is corrected by a Lowess fit against GC;
        Ploidy is estimated as the multiplier in [1.5, 6) (step 1/40) that
        minimizes the squared distance of the scaled values to integers.
    Output:
        ``outpath``: chr, start, absstart, norm_by_GC, norm_by_GC_Ploidy
        "Normalize.png" in the current working directory: diagnostic plots
    """
    dynamic_bin = get_config("CNV_ref_"+genome, "dynamic_bin")
    df_counts = pd.read_table(bincount)
    df = pd.read_table(dynamic_bin)
    df['counts'] = df_counts['counts']
    #First, Aggregate the Bins into 500kb...
    print("[info] Aggregate the bins into 500kb...")
    df = df.groupby(df.index // 10).agg({"chr":"first", "start": "mean", "absstart": "mean", "GC": "mean", "length" : "sum", "counts" : "sum"})

    #Do the normalization on Length
    df['norm_counts'] = df['counts']/df['length']
    df['norm_counts'] = df['norm_counts']/df['norm_counts'].mean()
    # The 0.01 offset keeps the log finite for empty bins.
    df['norm_counts_log'] = np.log(df.norm_counts + 0.01)

    #Do Norm on GC...
    lowess = sm.nonparametric.lowess
    z = lowess(df.norm_counts_log, df.GC)
    # z holds (GC, fitted value) pairs; interpolate the fit at each bin's GC.
    f = interp1d(list(zip(*z))[0], list(zip(*z))[1], bounds_error = False)
    df['norm_by_GC'] = np.exp(df.norm_counts_log - f(df['GC']))-0.01
    df['norm_by_GC'] = df['norm_by_GC']/df['norm_by_GC'].mean()

    #plot GC correction...
    plt.figure(figsize=(10, 10))
    plt.subplot(2, 2, 1)
    plt.title('Raw Normalized Reads (500Kb)')
    plt.scatter(df.GC, df.norm_counts, facecolors='none', edgecolors='r')

    plt.subplot(2, 2, 2)
    plt.title('GC corrected (500Kb)')
    plt.scatter(df.GC, df.norm_by_GC, facecolors='none', edgecolors='b')

    #Distribution of the corrected values
    plt.subplot(2, 2, 3)
    plt.title('Peaks')
    plt.hist(df.norm_by_GC, bins=300, density=True)

    #Ploidy estimate: pick the multiplier that best maps values onto integers
    Ploidy_Lists = [x/40 for x in range(60, 240, 1)]
    SoS = []
    for ploidy in Ploidy_Lists:
        errors = round(df['norm_by_GC']*ploidy)-df['norm_by_GC']*ploidy
        SoS.append(sum([x*x for x in errors]))
    estimate_ploidy = Ploidy_Lists[SoS.index(min(SoS))]

    print("[info] The estimated plodity is {}".format(estimate_ploidy))
    plt.subplot(2, 2, 4)
    plt.title('Plodity Estimate: {}'.format(estimate_ploidy))
    plt.ylabel("Errors (Should be Minimized)")
    plt.xlabel("Ploidy")
    plt.plot(Ploidy_Lists, SoS)
    df['norm_by_GC_Ploidy'] =  df['norm_by_GC']*estimate_ploidy

    df_export = df[['chr', 'start', 'absstart', 'norm_by_GC', 'norm_by_GC_Ploidy']]
    print("[info] Write to {}".format(outpath))
    df_export.to_csv(outpath, sep="\t", float_format='%.2f')

    plt.savefig("Normalize.png")
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()
    print(df)