Пример #1
0
def main():
    """Evaluate Canvas results on simulated data.

    Exactly one of the following command-line options must be given:

      -s/--canvas_sim_id   prefix selecting the result directories found under
                           the hard-coded Canvas results root
      -d/--canvas_sim_dir  a single Canvas results directory

    For each selected results directory the simulation directory is derived by
    replacing "/Canvas/" with "/simulation/" in its path, and the evaluation id
    by replacing "sim" with "ev" in the directory name. Directories that do not
    exist, whose simulation directory does not exist, whose name starts with
    "simNorm" or ends with "_purity100" are skipped.

    For the remaining directories a truth BED file and a variant percentage
    BED file are created, the simulation parameters are written, and
    evaluate_CNV is run twice: once with the default exclusion filter and once
    with a filter that additionally excludes every variant outside
    chrX/chrY/chrM whose percentage is >= 0.8 (the "only heterogeneous
    variants" run). Output files that already exist are not regenerated.
    """

    parser = OptionParser()
    parser.add_option("-s", "--canvas_sim_id", dest="canvas_sim_id", type="string", help="simulation id (find Canvas results directories strating with sim_id")
    parser.add_option("-d", "--canvas_sim_dir", dest="canvas_sim_dir", type="string", help="Canvas results directory")
    options, _ = parser.parse_args()

    # parser.error() prints the message and exits, so past these checks exactly
    # one of the two options is set.
    if options.canvas_sim_dir is None and options.canvas_sim_id is None:
        parser.error("Specify Canvas results simulation id or directory")
    if options.canvas_sim_dir is not None and options.canvas_sim_id is not None:
        parser.error("Canvas results simulation id and directory are mutually exclusive options")
    if options.canvas_sim_dir is not None:
        res_dirs = [options.canvas_sim_dir]
    else:
        canvas_dir = "/illumina/scratch/tmp/users/ccolombo/Canvas/"
        res_dirs = [canvas_dir + cdir for cdir in os.listdir(canvas_dir) if os.path.isdir(canvas_dir + cdir) and cdir.startswith(options.canvas_sim_id)]
        if not res_dirs:
            print("No Canvas results directories found")
    hg_file = "/home/ccolombo/filtered_human.hg19.genome"

    for res_dir in res_dirs:

        print(res_dir)

        # Drop a trailing slash before taking the directory name, otherwise the
        # name (and therefore res_id) would be empty.
        res_name = os.path.basename(res_dir.rstrip("/"))
        sim_dir = re.sub("/Canvas/", "/simulation/", res_dir)
        res_id = re.sub("sim", "ev", res_name)

        if not os.path.exists(res_dir):
            print("\nCanvas results directory %s does not exist\n\n" % res_dir)
            continue
        # The original tested res_dir a second time here, so a missing
        # simulation directory was never caught.
        if not os.path.exists(sim_dir):
            print("\nSimulation directory %s does not exist\n\n" % sim_dir)
            continue
        if res_name.startswith("simNorm") or res_name.endswith("_purity100"):
            continue

        out_dir = os.path.join("/illumina/scratch/tmp/users/ccolombo/evaluation/EvaluateCNV", res_id)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        truth_file = create_het_cn_file(sim_dir, os.path.join(out_dir, "het_truth_file.bed"), no_dipl=True, hap_split=True, round=True)
        truth_file = add_BED_complement(truth_file, hg_file, sort=False, out_dir=out_dir, hap_split=True)
        perc_file = create_var_perc_file(sim_dir, os.path.join(out_dir, "var_perc.bed"), no_dipl=True)

        # Write simulation parameters to file
        out_file = os.path.join(out_dir, res_id + "_par.txt")
        if not os.path.exists(out_file):
            get_sim_params(sim_dir, truth_file, perc_file, out_file, hg_file)

        # Run EvaluateCNV
        out_file = os.path.join(out_dir, res_id + ".txt")
        excl_file = "/illumina/scratch/tmp/users/ccolombo/evaluation/sim_filter.bed"
        if not os.path.exists(out_file):
            evaluate_CNV(res_dir, truth_file, excl_file, out_file)

        # Run EvaluateCNV only on heterogeneous variants
        out_file = os.path.join(out_dir, res_id + "_onlyhet.txt")
        if not os.path.exists(out_file):
            with open(excl_file, "r") as ef:
                excl_vars = ef.readlines()
            # Make sure appended regions start on their own line even when the
            # base filter file lacks a final newline.
            if excl_vars and not excl_vars[-1].endswith("\n"):
                excl_vars[-1] += "\n"
            with open(perc_file, "r") as pf:
                for line in pf:
                    (chr_id, start, end, _, _, perc) = line.strip().split("\t")
                    if float(perc) >= 0.8 and chr_id not in ["chrX", "chrY", "chrM"]:
                        excl_vars.append("\t".join([chr_id, start, end]) + "\n")
            os.remove(perc_file)
            excl_file = os.path.join(out_dir, "filter_onlyhet.bed")
            with open(excl_file, "w") as wf:
                wf.writelines(excl_vars)

            evaluate_CNV(res_dir, truth_file, excl_file, out_file)
Пример #2
0
def create_ev_cov_files(cnv_file, cnv_type, part_file, out_dir, out_filename,
                        tumor_purity, excl_file, save_tmp, add_perc, add_shift,
                        hg_file):
    """Write observed vs. expected coverage BED files for copy-number segments.

    The segments in cnv_file are optionally stripped of the regions listed in
    excl_file, intersected with the bins of the gzipped partitioned file and
    written both per bin ("*_bin.bed") and per segment (out_filename), each
    line carrying the expected coverage

        (CN * tumor_purity + 2 * (1 - tumor_purity)) * hap_coverage

    rounded to three decimals. When hap_coverage is not provided by
    init_param_canvas_file it is estimated as half the median bin coverage of
    the segments with CN == 2.

    Args:
        cnv_file: copy-number input; pre-processed by init_param_canvas_file
            ("canvas") or init_param_truth_file ("truth").
        cnv_type: "canvas" or "truth"; any other value uses cnv_file as is.
        part_file: gzip-compressed partitioned file, extracted with gunzip.
        out_dir: output directory.
        out_filename: name of the per-segment output file; the derived file
            names assume it ends in ".bed".
        tumor_purity: tumor purity; replaced by the value returned by
            init_param_canvas_file when cnv_type is "canvas".
        excl_file: BED file of regions to subtract from cnv_file, or None.
        save_tmp: when false, intermediate files are deleted.
        add_perc: when true, also write the "_perc" and "_perc_all" files
            based on the variant percentage file of the simulation directory.
        add_shift: when true, also write per-segment files with the copy
            number shifted by -2, -1, +1 and +2.
        hg_file: genome file passed to init_param_truth_file and to
            `bedtools complement`.
    """

    bedtools = "/illumina/thirdparty/bedtools/bedtools2-2.22.1/bin/bedtools"
    header = "#chr\tstart\tend\tCN\tobservedCoverage\texpectedCoverage"

    out_file = os.path.join(out_dir, out_filename)
    # The dot is escaped so that only a real ".bed" extension is rewritten.
    out_file_bin = re.sub(r"\.bed$", "_bin.bed", out_file)
    hap_coverage = 0
    if cnv_type == "canvas":
        cnv_file, tumor_purity, hap_coverage = init_param_canvas_file(
            cnv_file, out_file)
    elif cnv_type == "truth":
        cnv_file = init_param_truth_file(cnv_file, out_dir, hg_file)
    # NOTE(review): for any other cnv_type this queues the caller's own input
    # file for deletion - confirm that is intended.
    rm_files = [cnv_file]
    print("Tumor purity: %f" % tumor_purity)

    # Exclude regions
    if excl_file is not None:
        if not os.path.exists(excl_file):
            print("File for excluded regions %s does not exist\n" % excl_file)
        else:
            cmd = "%s subtract -a %s -b %s" % (bedtools, cnv_file, excl_file)
            cnv_file = run_bedtools_cmd(cmd, cnv_file, "_excl")
            rm_files.append(cnv_file)

    # Extract partitioned file
    extr_part_file = re.sub(r"\.bed$", "_partitioned.bed", out_file)
    subprocess.call("gunzip -c %s > %s" % (part_file, extr_part_file),
                    shell=True)

    # Intersect variants file with bin file
    cmd = "%s intersect -a %s -b %s -wa -wb" % (bedtools, cnv_file,
                                                extr_part_file)
    intersect_file = run_bedtools_cmd(cmd, out_file, "_int")
    rm_files.extend([intersect_file, extr_part_file])

    with open(intersect_file, "r") as ifile:
        int_rows = [int_line.strip().split("\t") for int_line in ifile]

    if hap_coverage == 0:
        # Column 3 is the segment CN, column 7 the bin coverage.
        hap_coverage = numpy.median(
            [float(row[7]) for row in int_rows if float(row[3]) == 2]) / 2
        print("Haplotype coverage: %f (median)" % hap_coverage)

    def expected_cov(copy_number):
        """Expected coverage for a copy number at the current purity."""
        return round((copy_number * tumor_purity + 2 *
                      (1 - tumor_purity)) * hap_coverage, 3)

    wlines = [header]
    for row in int_rows:
        (_, _, _, cn_segm, chr_bin, start_bin, end_bin, cov_bin, _) = row
        wlines.append("\t".join([
            chr_bin, start_bin, end_bin, cn_segm, cov_bin,
            str(expected_cov(float(cn_segm)))
        ]))
    with open(out_file_bin, "w") as ofile:
        ofile.write("\n".join(wlines))

    # Sort chromosome intersect file
    cmd = "sort -k 1,1 -k2,2n %s" % intersect_file
    sorted_file = run_bedtools_cmd(cmd, out_file, "_sort")
    rm_files.append(sorted_file)

    # Merge intersect file (median observed coverage). bedtools merge needs
    # sorted input, so it must read the sorted file; the original passed the
    # unsorted intersect file and never used the sorted one.
    cmd = "%s merge -i %s -d -1 -c 4,8 -o mean,median" % (bedtools,
                                                          sorted_file)
    merged_file = run_bedtools_cmd(cmd, intersect_file, "_merg")
    rm_files.append(merged_file)

    with open(merged_file, "r") as mfile:
        merged_rows = [cnv_line.strip() for cnv_line in mfile]

    def segment_lines(cn_shift):
        """Per-segment output lines with the copy number shifted by cn_shift."""
        lines = [header]
        for merged_row in merged_rows:
            (_, _, _, cn, _) = merged_row.split("\t")
            lines.append(merged_row + "\t" +
                         str(expected_cov(float(cn) + cn_shift)))
        return lines

    # Calculate expected coverage for each segment
    with open(out_file, "w") as ofile:
        ofile.write("\n".join(segment_lines(0)))

    # Calculate expected coverage for each segment with shifted copy number
    if add_shift:
        for cn_shift in [-2, -1, 1, 2]:
            shift_file = re.sub(r"\.bed$",
                                "_CNshift_" + str(cn_shift) + ".bed", out_file)
            with open(shift_file, "w") as ofile:
                ofile.write("\n".join(segment_lines(cn_shift)))

    # Remove temporary files (synchronously, so nothing is left running)
    if not save_tmp:
        for rmf in rm_files:
            rm_path = os.path.join(out_dir, rmf)
            print("rm %s" % rm_path)
            try:
                os.remove(rm_path)
            except OSError as err:
                print("Could not remove %s: %s" % (rm_path, err))

    print("Output bed file written")

    if add_perc:
        # The simulation directory name is cut out of the output file name:
        # the first 4 characters and the ".bed" / "_truth.bed" tail are dropped.
        sim_root = "/illumina/scratch/tmp/users/ccolombo/simulation/"
        out_base = os.path.basename(out_file)
        if out_base[:-4].endswith("_truth"):
            sim_dir = sim_root + out_base[4:-10]
        else:
            sim_dir = sim_root + out_base[4:-4]
        if os.path.exists(sim_dir):

            # For each variant: clonal percentage, coverage of overlapping segments
            perc_file = create_var_perc_file(sim_dir, None, True)
            cmd = "%s intersect -a %s -b %s -wa -wb" % (bedtools, perc_file,
                                                        out_file)
            run_bedtools_cmd(cmd, out_file, "_perc")

            # For the whole genome: clonal percentage, coverage of overlapping bins
            cmd = "%s complement -i %s -g %s" % (bedtools, perc_file, hg_file)
            pc = subprocess.Popen(cmd,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  shell=True,
                                  universal_newlines=True)
            # communicate() drains both pipes, so a chatty stderr cannot block
            # the child while stdout is being read.
            compl_out, _ = pc.communicate()
            with open(perc_file, "r") as pbed:
                lines = pbed.readlines()
            for line in compl_out.splitlines():
                lines.append("\n" + line.rstrip() + "\t1\t1\t1.0")
            with open(perc_file, "w") as pbed:
                pbed.writelines(lines)
            cmd = "%s intersect -a %s -b %s -wa -wb" % (bedtools, perc_file,
                                                        out_file_bin)
            run_bedtools_cmd(cmd, out_file_bin, "_perc_all")
            print("rm %s" % perc_file)
            try:
                os.remove(perc_file)
            except OSError as err:
                print("Could not remove %s: %s" % (perc_file, err))
            print("Output bed file with variants percentages written")

        else:
            print("Cannot find simulation directory")
Пример #3
0
def create_ev_cov_files(cnv_file, cnv_type, part_file, out_dir, out_filename, tumor_purity, excl_file, save_tmp, add_perc, add_shift, hg_file):
    """Write observed vs. expected coverage BED files for copy-number segments.

    The segments in cnv_file are optionally stripped of the regions listed in
    excl_file, intersected with the bins of the gzipped partitioned file and
    written both per bin ("*_bin.bed") and per segment (out_filename), each
    line carrying the expected coverage

        (CN * tumor_purity + 2 * (1 - tumor_purity)) * hap_coverage

    rounded to three decimals. When hap_coverage is not provided by
    init_param_canvas_file it is estimated as half the median bin coverage of
    the segments with CN == 2.

    Args:
        cnv_file: copy-number input; pre-processed by init_param_canvas_file
            ("canvas") or init_param_truth_file ("truth").
        cnv_type: "canvas" or "truth"; any other value uses cnv_file as is.
        part_file: gzip-compressed partitioned file, extracted with gunzip.
        out_dir: output directory.
        out_filename: name of the per-segment output file; the derived file
            names assume it ends in ".bed".
        tumor_purity: tumor purity; replaced by the value returned by
            init_param_canvas_file when cnv_type is "canvas".
        excl_file: BED file of regions to subtract from cnv_file, or None.
        save_tmp: when false, intermediate files are deleted.
        add_perc: when true, also write the "_perc" and "_perc_all" files
            based on the variant percentage file of the simulation directory.
        add_shift: when true, also write per-segment files with the copy
            number shifted by -2, -1, +1 and +2.
        hg_file: genome file passed to init_param_truth_file and to
            `bedtools complement`.
    """

    bedtools = "/illumina/thirdparty/bedtools/bedtools2-2.22.1/bin/bedtools"
    header = "#chr\tstart\tend\tCN\tobservedCoverage\texpectedCoverage"

    out_file = os.path.join(out_dir, out_filename)
    # The dot is escaped so that only a real ".bed" extension is rewritten.
    out_file_bin = re.sub(r"\.bed$", "_bin.bed", out_file)
    hap_coverage = 0
    if cnv_type == "canvas":
        cnv_file, tumor_purity, hap_coverage = init_param_canvas_file(cnv_file, out_file)
    elif cnv_type == "truth":
        cnv_file = init_param_truth_file(cnv_file, out_dir, hg_file)
    # NOTE(review): for any other cnv_type this queues the caller's own input
    # file for deletion - confirm that is intended.
    rm_files = [cnv_file]
    print("Tumor purity: %f" % tumor_purity)

    # Exclude regions
    if excl_file is not None:
        if not os.path.exists(excl_file):
            print("File for excluded regions %s does not exist\n" % excl_file)
        else:
            cmd = "%s subtract -a %s -b %s" % (bedtools, cnv_file, excl_file)
            cnv_file = run_bedtools_cmd(cmd, cnv_file, "_excl")
            rm_files.append(cnv_file)

    # Extract partitioned file
    extr_part_file = re.sub(r"\.bed$", "_partitioned.bed", out_file)
    subprocess.call("gunzip -c %s > %s" % (part_file, extr_part_file), shell=True)

    # Intersect variants file with bin file
    cmd = "%s intersect -a %s -b %s -wa -wb" % (bedtools, cnv_file, extr_part_file)
    intersect_file = run_bedtools_cmd(cmd, out_file, "_int")
    rm_files.extend([intersect_file, extr_part_file])

    with open(intersect_file, "r") as ifile:
        int_rows = [int_line.strip().split("\t") for int_line in ifile]

    if hap_coverage == 0:
        # Column 3 is the segment CN, column 7 the bin coverage.
        hap_coverage = numpy.median([float(row[7]) for row in int_rows if float(row[3]) == 2]) / 2
        print("Haplotype coverage: %f (median)" % hap_coverage)

    def expected_cov(copy_number):
        """Expected coverage for a copy number at the current purity."""
        return round((copy_number * tumor_purity + 2 * (1 - tumor_purity)) * hap_coverage, 3)

    wlines = [header]
    for row in int_rows:
        (_, _, _, cn_segm, chr_bin, start_bin, end_bin, cov_bin, _) = row
        wlines.append("\t".join([chr_bin, start_bin, end_bin, cn_segm, cov_bin, str(expected_cov(float(cn_segm)))]))
    with open(out_file_bin, "w") as ofile:
        ofile.write("\n".join(wlines))

    # Sort chromosome intersect file
    cmd = "sort -k 1,1 -k2,2n %s" % intersect_file
    sorted_file = run_bedtools_cmd(cmd, out_file, "_sort")
    rm_files.append(sorted_file)

    # Merge intersect file (median observed coverage). bedtools merge needs
    # sorted input, so it must read the sorted file; the original passed the
    # unsorted intersect file and never used the sorted one.
    cmd = "%s merge -i %s -d -1 -c 4,8 -o mean,median" % (bedtools, sorted_file)
    merged_file = run_bedtools_cmd(cmd, intersect_file, "_merg")
    rm_files.append(merged_file)

    with open(merged_file, "r") as mfile:
        merged_rows = [cnv_line.strip() for cnv_line in mfile]

    def segment_lines(cn_shift):
        """Per-segment output lines with the copy number shifted by cn_shift."""
        lines = [header]
        for merged_row in merged_rows:
            (_, _, _, cn, _) = merged_row.split("\t")
            lines.append(merged_row + "\t" + str(expected_cov(float(cn) + cn_shift)))
        return lines

    # Calculate expected coverage for each segment
    with open(out_file, "w") as ofile:
        ofile.write("\n".join(segment_lines(0)))

    # Calculate expected coverage for each segment with shifted copy number
    if add_shift:
        for cn_shift in [-2, -1, 1, 2]:
            shift_file = re.sub(r"\.bed$", "_CNshift_" + str(cn_shift) + ".bed", out_file)
            with open(shift_file, "w") as ofile:
                ofile.write("\n".join(segment_lines(cn_shift)))

    # Remove temporary files (synchronously, so nothing is left running)
    if not save_tmp:
        for rmf in rm_files:
            rm_path = os.path.join(out_dir, rmf)
            print("rm %s" % rm_path)
            try:
                os.remove(rm_path)
            except OSError as err:
                print("Could not remove %s: %s" % (rm_path, err))

    print("Output bed file written")

    if add_perc:
        # The simulation directory name is cut out of the output file name:
        # the first 4 characters and the ".bed" / "_truth.bed" tail are dropped.
        sim_root = "/illumina/scratch/tmp/users/ccolombo/simulation/"
        out_base = os.path.basename(out_file)
        if out_base[:-4].endswith("_truth"):
            sim_dir = sim_root + out_base[4:-10]
        else:
            sim_dir = sim_root + out_base[4:-4]
        if os.path.exists(sim_dir):

            # For each variant: clonal percentage, coverage of overlapping segments
            perc_file = create_var_perc_file(sim_dir, None, True)
            cmd = "%s intersect -a %s -b %s -wa -wb" % (bedtools, perc_file, out_file)
            run_bedtools_cmd(cmd, out_file, "_perc")

            # For the whole genome: clonal percentage, coverage of overlapping bins
            cmd = "%s complement -i %s -g %s" % (bedtools, perc_file, hg_file)
            pc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, universal_newlines=True)
            # communicate() drains both pipes, so a chatty stderr cannot block
            # the child while stdout is being read.
            compl_out, _ = pc.communicate()
            with open(perc_file, "r") as pbed:
                lines = pbed.readlines()
            for line in compl_out.splitlines():
                lines.append("\n" + line.rstrip() + "\t1\t1\t1.0")
            with open(perc_file, "w") as pbed:
                pbed.writelines(lines)
            cmd = "%s intersect -a %s -b %s -wa -wb" % (bedtools, perc_file, out_file_bin)
            run_bedtools_cmd(cmd, out_file_bin, "_perc_all")
            print("rm %s" % perc_file)
            try:
                os.remove(perc_file)
            except OSError as err:
                print("Could not remove %s: %s" % (perc_file, err))
            print("Output bed file with variants percentages written")

        else:
            print("Cannot find simulation directory")
Пример #4
0
def main():
    """Evaluate Canvas results on simulated data.

    Exactly one of the following command-line options must be given:

      -s/--canvas_sim_id   prefix selecting the result directories found under
                           the hard-coded Canvas results root
      -d/--canvas_sim_dir  a single Canvas results directory

    For each selected results directory the simulation directory is derived by
    replacing "/Canvas/" with "/simulation/" in its path, and the evaluation id
    by replacing "sim" with "ev" in the directory name. Directories that do not
    exist, whose simulation directory does not exist, whose name starts with
    "simNorm" or ends with "_purity100" are skipped.

    For the remaining directories a truth BED file and a variant percentage
    BED file are created, the simulation parameters are written, and
    evaluate_CNV is run twice: once with the default exclusion filter and once
    with a filter that additionally excludes every variant outside
    chrX/chrY/chrM whose percentage is >= 0.8 (the "only heterogeneous
    variants" run). Output files that already exist are not regenerated.
    """

    parser = OptionParser()
    parser.add_option(
        "-s",
        "--canvas_sim_id",
        dest="canvas_sim_id",
        type="string",
        help=
        "simulation id (find Canvas results directories strating with sim_id")
    parser.add_option("-d",
                      "--canvas_sim_dir",
                      dest="canvas_sim_dir",
                      type="string",
                      help="Canvas results directory")
    options, _ = parser.parse_args()

    # parser.error() prints the message and exits, so past these checks exactly
    # one of the two options is set.
    if options.canvas_sim_dir is None and options.canvas_sim_id is None:
        parser.error("Specify Canvas results simulation id or directory")
    if options.canvas_sim_dir is not None and options.canvas_sim_id is not None:
        parser.error(
            "Canvas results simulation id and directory are mutually exclusive options"
        )
    if options.canvas_sim_dir is not None:
        res_dirs = [options.canvas_sim_dir]
    else:
        canvas_dir = "/illumina/scratch/tmp/users/ccolombo/Canvas/"
        res_dirs = [
            canvas_dir + cdir for cdir in os.listdir(canvas_dir)
            if os.path.isdir(canvas_dir + cdir)
            and cdir.startswith(options.canvas_sim_id)
        ]
        if not res_dirs:
            print("No Canvas results directories found")
    hg_file = "/home/ccolombo/filtered_human.hg19.genome"

    for res_dir in res_dirs:

        print(res_dir)

        # Drop a trailing slash before taking the directory name, otherwise the
        # name (and therefore res_id) would be empty.
        res_name = os.path.basename(res_dir.rstrip("/"))
        sim_dir = re.sub("/Canvas/", "/simulation/", res_dir)
        res_id = re.sub("sim", "ev", res_name)

        if not os.path.exists(res_dir):
            print("\nCanvas results directory %s does not exist\n\n" % res_dir)
            continue
        # The original tested res_dir a second time here, so a missing
        # simulation directory was never caught.
        if not os.path.exists(sim_dir):
            print("\nSimulation directory %s does not exist\n\n" % sim_dir)
            continue
        if res_name.startswith("simNorm") or res_name.endswith("_purity100"):
            continue

        out_dir = os.path.join(
            "/illumina/scratch/tmp/users/ccolombo/evaluation/EvaluateCNV",
            res_id)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        truth_file = create_het_cn_file(sim_dir,
                                        os.path.join(out_dir,
                                                     "het_truth_file.bed"),
                                        no_dipl=True,
                                        hap_split=True,
                                        round=True)
        truth_file = add_BED_complement(truth_file,
                                        hg_file,
                                        sort=False,
                                        out_dir=out_dir,
                                        hap_split=True)
        perc_file = create_var_perc_file(sim_dir,
                                         os.path.join(out_dir, "var_perc.bed"),
                                         no_dipl=True)

        # Write simulation parameters to file
        out_file = os.path.join(out_dir, res_id + "_par.txt")
        if not os.path.exists(out_file):
            get_sim_params(sim_dir, truth_file, perc_file, out_file, hg_file)

        # Run EvaluateCNV
        out_file = os.path.join(out_dir, res_id + ".txt")
        excl_file = "/illumina/scratch/tmp/users/ccolombo/evaluation/sim_filter.bed"
        if not os.path.exists(out_file):
            evaluate_CNV(res_dir, truth_file, excl_file, out_file)

        # Run EvaluateCNV only on heterogeneous variants
        out_file = os.path.join(out_dir, res_id + "_onlyhet.txt")
        if not os.path.exists(out_file):
            with open(excl_file, "r") as ef:
                excl_vars = ef.readlines()
            # Make sure appended regions start on their own line even when the
            # base filter file lacks a final newline.
            if excl_vars and not excl_vars[-1].endswith("\n"):
                excl_vars[-1] += "\n"
            with open(perc_file, "r") as pf:
                for line in pf:
                    (chr_id, start, end, _, _,
                     perc) = line.strip().split("\t")
                    if float(perc) >= 0.8 and chr_id not in [
                            "chrX", "chrY", "chrM"
                    ]:
                        excl_vars.append("\t".join([chr_id, start, end]) +
                                         "\n")
            os.remove(perc_file)
            excl_file = os.path.join(out_dir, "filter_onlyhet.bed")
            with open(excl_file, "w") as wf:
                wf.writelines(excl_vars)

            evaluate_CNV(res_dir, truth_file, excl_file, out_file)