def main():
    """Evaluate Canvas results on simulated data.

    Command line options (mutually exclusive, exactly one is required):

    * ``-s`` / ``--canvas_sim_id``: evaluate every directory below the Canvas
      results root whose name starts with this id.
    * ``-d`` / ``--canvas_sim_dir``: evaluate this single Canvas results
      directory.

    For each results directory the matching simulation directory is derived
    by replacing ``/Canvas/`` with ``/simulation/`` in its path.  Truth and
    variant-percentage BED files are built from the simulation, the
    simulation parameters are written once, and EvaluateCNV is run twice:
    on all variants and on heterogeneous variants only.  Result files that
    already exist are not recomputed.
    """
    parser = OptionParser()
    parser.add_option(
        "-s", "--canvas_sim_id", dest="canvas_sim_id", type="string",
        help="simulation id (find Canvas results directories strating with sim_id")
    parser.add_option(
        "-d", "--canvas_sim_dir", dest="canvas_sim_dir", type="string",
        help="Canvas results directory")
    options, _ = parser.parse_args()

    sim_id_opt = options.canvas_sim_id
    sim_dir_opt = options.canvas_sim_dir
    if sim_dir_opt is None and sim_id_opt is None:
        parser.error("Specify Canvas results simulation id or directory")
    if sim_dir_opt is not None and sim_id_opt is not None:
        parser.error("Canvas results simulation id and directory are mutually exclusive options")

    if sim_dir_opt is not None:
        # A trailing slash (e.g. from shell completion) would make the last
        # path component empty, so the result id would be "" and the outputs
        # would be written straight into the evaluation root directory.
        res_dirs = [sim_dir_opt.rstrip("/") or sim_dir_opt]
    else:
        canvas_dir = "/illumina/scratch/tmp/users/ccolombo/Canvas/"
        res_dirs = [
            os.path.join(canvas_dir, cdir)
            for cdir in os.listdir(canvas_dir)
            if cdir.startswith(sim_id_opt)
            and os.path.isdir(os.path.join(canvas_dir, cdir))
        ]
    if not res_dirs:
        print("No Canvas results directories found")

    hg_file = "/home/ccolombo/filtered_human.hg19.genome"
    eval_root = "/illumina/scratch/tmp/users/ccolombo/evaluation/EvaluateCNV"
    base_excl_file = "/illumina/scratch/tmp/users/ccolombo/evaluation/sim_filter.bed"
    # Chromosomes that are never added to the "only heterogeneous" filter.
    skipped_chroms = ("chrX", "chrY", "chrM")

    for res_dir in res_dirs:
        print(res_dir)
        res_name = res_dir.split("/")[-1]
        sim_dir = re.sub("/Canvas/", "/simulation/", res_dir)
        res_id = re.sub("sim", "ev", res_name)

        if not os.path.exists(res_dir):
            print("\nCanvas results directory %s does not exist\n\n" % res_dir)
            continue
        # Fixed: this guard used to re-test res_dir, so a missing simulation
        # directory was never detected here.
        if not os.path.exists(sim_dir):
            print("\nSimulation directory %s does not exist\n\n" % sim_dir)
            continue
        # Results named simNorm* or ending in _purity100 are not evaluated.
        if os.path.basename(res_dir).startswith("simNorm") or res_dir.endswith("_purity100"):
            continue

        out_dir = os.path.join(eval_root, res_id)
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        truth_file = create_het_cn_file(
            sim_dir, os.path.join(out_dir, "het_truth_file.bed"),
            no_dipl=True, hap_split=True, round=True)
        truth_file = add_BED_complement(
            truth_file, hg_file, sort=False, out_dir=out_dir, hap_split=True)
        perc_file = create_var_perc_file(
            sim_dir, os.path.join(out_dir, "var_perc.bed"), no_dipl=True)

        # Write simulation parameters to file
        par_file = os.path.join(out_dir, res_id + "_par.txt")
        if not os.path.exists(par_file):
            get_sim_params(sim_dir, truth_file, perc_file, par_file, hg_file)

        # Run EvaluateCNV
        all_out_file = os.path.join(out_dir, res_id + ".txt")
        if not os.path.exists(all_out_file):
            evaluate_CNV(res_dir, truth_file, base_excl_file, all_out_file)

        # Run EvaluateCNV only on heterogeneous variants
        het_out_file = os.path.join(out_dir, res_id + "_onlyhet.txt")
        if os.path.exists(het_out_file):
            continue

        with open(base_excl_file, "r") as ef:
            excl_vars = ef.readlines()
        with open(perc_file, "r") as pf:
            for line in pf:
                chr_id, start, end, _cn_a, _cn_b, perc = line.strip().split("\t")
                # Variants with percentage >= 0.8 are added to the exclusion
                # list so that only the remaining ones are evaluated.
                if float(perc) >= 0.8 and chr_id not in skipped_chroms:
                    excl_vars.append("\t".join([chr_id, start, end]) + "\n")
        try:
            os.remove(perc_file)
        except OSError as err:
            # Best-effort cleanup, as with the former shell "rm".
            print("Could not remove %s: %s" % (perc_file, err))

        het_excl_file = os.path.join(out_dir, "filter_onlyhet.bed")
        with open(het_excl_file, "w") as wf:
            wf.writelines(excl_vars)
        evaluate_CNV(res_dir, truth_file, het_excl_file, het_out_file)
def _expected_coverage(copy_number, tumor_purity, hap_coverage):
    """Return the expected coverage for a copy number, rounded to 3 decimals.

    The tumor fraction contributes ``copy_number`` copies, the remaining
    fraction two copies; the mix is scaled by the per-haplotype coverage.
    """
    mixed_cn = float(copy_number) * tumor_purity + 2 * (1 - tumor_purity)
    return round(mixed_cn * hap_coverage, 3)


def _segment_table(merged_rows, header, tumor_purity, hap_coverage, cn_shift=0):
    """Return header plus one line per merged segment with expected coverage appended."""
    table = [header]
    for row in merged_rows:
        # Unpacking enforces the five-column layout (chr, start, end, CN, cov).
        _chrid, _start, _end, cn, _cov = row.split("\t")
        expected = _expected_coverage(float(cn) + cn_shift, tumor_purity, hap_coverage)
        table.append(row + "\t" + str(expected))
    return table


def _remove_tmp_file(path):
    """Log and delete a temporary file; a failed removal is reported, not raised."""
    print("rm %s" % path)
    try:
        os.remove(path)
    except OSError as err:
        print("Could not remove %s: %s" % (path, err))


def create_ev_cov_files(cnv_file, cnv_type, part_file, out_dir, out_filename,
                        tumor_purity, excl_file, save_tmp, add_perc, add_shift,
                        hg_file):
    """Write BED files with observed and expected coverage per bin and segment.

    Parameters:
        cnv_file: copy number input; for ``cnv_type`` "canvas" or "truth" it
            is first converted by ``init_param_canvas_file`` /
            ``init_param_truth_file``.
        cnv_type: "canvas", "truth", or anything else to use ``cnv_file``
            unchanged.
        part_file: gzip-compressed partitioned (binned coverage) file.
        out_dir: output directory.
        out_filename: name of the per-segment output file inside ``out_dir``.
        tumor_purity: tumor purity; replaced by the value read from the
            Canvas file when ``cnv_type`` is "canvas".
        excl_file: BED file of regions to subtract, or None.
        save_tmp: when false, intermediate files are deleted.
        add_perc: when true, also intersect the outputs with the variants
            percentage file of the matching simulation directory.
        add_shift: when true, also write per-segment files with the copy
            number shifted by -2, -1, 1 and 2.
        hg_file: genome file (used for the truth file and bedtools complement).

    Outputs: ``<out_file>`` (segments), ``<out_file>_bin.bed`` (bins), and the
    optional ``_CNshift_*`` / ``_perc`` / ``_perc_all`` files.
    """
    bedtools = "/illumina/thirdparty/bedtools/bedtools2-2.22.1/bin/bedtools"
    header = "#chr\tstart\tend\tCN\tobservedCoverage\texpectedCoverage"
    # NOTE(review): the last column name is misspelled in the shifted files;
    # kept byte-identical because downstream readers are not visible here.
    shifted_header = "#chr\tstart\tend\tCN\tobservedCoverage\texpectedCovearge"

    out_file = os.path.join(out_dir, out_filename)
    out_file_bin = re.sub(".bed$", "_bin.bed", out_file)
    hap_coverage = 0
    if cnv_type == "canvas":
        cnv_file, tumor_purity, hap_coverage = init_param_canvas_file(cnv_file, out_file)
    elif cnv_type == "truth":
        cnv_file = init_param_truth_file(cnv_file, out_dir, hg_file)
    rm_files = [cnv_file]
    print("Tumor purity: %f" % tumor_purity)

    # Exclude regions
    if excl_file is not None:
        if not os.path.exists(excl_file):
            print("File for excluded regions %s does not exist\n" % excl_file)
        else:
            cmd = "%s subtract -a %s -b %s" % (bedtools, cnv_file, excl_file)
            cnv_file = run_bedtools_cmd(cmd, cnv_file, "_excl")
            rm_files.append(cnv_file)

    # Extract partitioned file
    extr_part_file = re.sub(".bed$", "_partitioned.bed", out_file)
    gunzip = subprocess.Popen("gunzip -c %s > %s" % (part_file, extr_part_file), shell=True)
    gunzip.wait()

    # Intersect variants file with bin file
    cmd = "%s intersect -a %s -b %s -wa -wb" % (bedtools, cnv_file, extr_part_file)
    intersect_file = run_bedtools_cmd(cmd, out_file, "_int")
    rm_files.extend([intersect_file, extr_part_file])

    with open(intersect_file, "r") as ifile:
        int_rows = [line.strip().split("\t") for line in ifile]

    if hap_coverage == 0:
        # No coverage supplied: use half the median bin coverage (column 8)
        # over bins whose segment copy number (column 4) equals 2.
        diploid_cov = [float(row[7]) for row in int_rows if float(row[3]) == 2]
        hap_coverage = numpy.median(diploid_cov) / 2
        print("Haplotype coverage: %f (median)" % hap_coverage)

    # Per-bin table
    wlines = [header]
    for row in int_rows:
        (_chr_segm, _start_segm, _end_segm, cn_segm,
         chr_bin, start_bin, end_bin, cov_bin, _segm_bin) = row
        expected = _expected_coverage(cn_segm, tumor_purity, hap_coverage)
        wlines.append("\t".join([chr_bin, start_bin, end_bin, cn_segm, cov_bin, str(expected)]))
    with open(out_file_bin, "w") as ofile:
        ofile.write("\n".join(wlines))

    # Sort chromosome intersect file
    cmd = "sort -k 1,1 -k2,2n %s" % intersect_file
    sorted_file = run_bedtools_cmd(cmd, out_file, "_sort")
    rm_files.append(sorted_file)

    # Merge intersect file (median observed coverage).  bedtools merge needs
    # position-sorted input, so feed it the sorted file (previously the
    # unsorted intersect file was passed and the sorted one went unused).
    # The output name is still derived from intersect_file.
    cmd = "%s merge -i %s -d -1 -c 4,8 -o mean,median" % (bedtools, sorted_file)
    merged_file = run_bedtools_cmd(cmd, intersect_file, "_merg")
    rm_files.append(merged_file)

    # Read the merged segments once; reused for the shifted tables below.
    with open(merged_file, "r") as mfile:
        merged_rows = [line.strip() for line in mfile]

    # Calculate expected coverage for each segment
    with open(out_file, "w") as ofile:
        ofile.write("\n".join(_segment_table(merged_rows, header, tumor_purity, hap_coverage)))

    # Calculate expected coverage for each segment with shifted copy number
    if add_shift:
        for cn_shift in [-2, -1, 1, 2]:
            shift_file = re.sub(".bed$", "_CNshift_" + str(cn_shift) + ".bed", out_file)
            shifted = _segment_table(merged_rows, shifted_header, tumor_purity,
                                     hap_coverage, cn_shift)
            with open(shift_file, "w") as ofile:
                ofile.write("\n".join(shifted))

    # Remove temporary files (synchronously, instead of un-waited "rm" children)
    if not save_tmp:
        for rmf in rm_files:
            _remove_tmp_file(os.path.join(out_dir, rmf))
    print("Output bed file written")

    if not add_perc:
        return

    # The simulation directory name is embedded in the output file name:
    # drop the first 4 characters and the ".bed" (or "_truth.bed") ending.
    base_name = os.path.basename(out_file)
    sim_root = "/illumina/scratch/tmp/users/ccolombo/simulation/"
    if base_name[:-4].endswith("_truth"):
        sim_dir = sim_root + base_name[4:-10]
    else:
        sim_dir = sim_root + base_name[4:-4]
    if not os.path.exists(sim_dir):
        print("Cannot find simulation directory")
        return

    # For each variant: clonal percentage, coverage of overlapping segments
    perc_file = create_var_perc_file(sim_dir, None, True)
    cmd = "%s intersect -a %s -b %s -wa -wb" % (bedtools, perc_file, out_file)
    run_bedtools_cmd(cmd, out_file, "_perc")

    # For the whole genome: clonal percentage, coverage of overlapping bins
    cmd = "%s complement -i %s -g %s" % (bedtools, perc_file, hg_file)
    pc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                          shell=True, universal_newlines=True)
    # communicate() drains both pipes and reaps the child process.
    compl_out, _ = pc.communicate()
    with open(perc_file, "r") as pbed:
        lines = pbed.readlines()
    for line in compl_out.splitlines():
        # Regions without variants get three constant columns: 1, 1, 1.0.
        lines.append("\n" + line.rstrip() + "\t1\t1\t1.0")
    with open(perc_file, "w") as pbed:
        pbed.writelines(lines)

    cmd = "%s intersect -a %s -b %s -wa -wb" % (bedtools, perc_file, out_file_bin)
    run_bedtools_cmd(cmd, out_file_bin, "_perc_all")
    _remove_tmp_file(perc_file)
    print("Output bed file with variants percentages written")
def _expected_coverage(copy_number, tumor_purity, hap_coverage):
    """Return the expected coverage for a copy number, rounded to 3 decimals.

    The tumor fraction contributes ``copy_number`` copies, the remaining
    fraction two copies; the mix is scaled by the per-haplotype coverage.
    """
    mixed_cn = float(copy_number) * tumor_purity + 2 * (1 - tumor_purity)
    return round(mixed_cn * hap_coverage, 3)


def _segment_table(merged_rows, header, tumor_purity, hap_coverage, cn_shift=0):
    """Return header plus one line per merged segment with expected coverage appended."""
    table = [header]
    for row in merged_rows:
        # Unpacking enforces the five-column layout (chr, start, end, CN, cov).
        _chrid, _start, _end, cn, _cov = row.split("\t")
        expected = _expected_coverage(float(cn) + cn_shift, tumor_purity, hap_coverage)
        table.append(row + "\t" + str(expected))
    return table


def _remove_tmp_file(path):
    """Log and delete a temporary file; a failed removal is reported, not raised."""
    print("rm %s" % path)
    try:
        os.remove(path)
    except OSError as err:
        print("Could not remove %s: %s" % (path, err))


def create_ev_cov_files(cnv_file, cnv_type, part_file, out_dir, out_filename,
                        tumor_purity, excl_file, save_tmp, add_perc, add_shift,
                        hg_file):
    """Write BED files with observed and expected coverage per bin and segment.

    Parameters:
        cnv_file: copy number input; for ``cnv_type`` "canvas" or "truth" it
            is first converted by ``init_param_canvas_file`` /
            ``init_param_truth_file``.
        cnv_type: "canvas", "truth", or anything else to use ``cnv_file``
            unchanged.
        part_file: gzip-compressed partitioned (binned coverage) file.
        out_dir: output directory.
        out_filename: name of the per-segment output file inside ``out_dir``.
        tumor_purity: tumor purity; replaced by the value read from the
            Canvas file when ``cnv_type`` is "canvas".
        excl_file: BED file of regions to subtract, or None.
        save_tmp: when false, intermediate files are deleted.
        add_perc: when true, also intersect the outputs with the variants
            percentage file of the matching simulation directory.
        add_shift: when true, also write per-segment files with the copy
            number shifted by -2, -1, 1 and 2.
        hg_file: genome file (used for the truth file and bedtools complement).

    Outputs: ``<out_file>`` (segments), ``<out_file>_bin.bed`` (bins), and the
    optional ``_CNshift_*`` / ``_perc`` / ``_perc_all`` files.
    """
    bedtools = "/illumina/thirdparty/bedtools/bedtools2-2.22.1/bin/bedtools"
    header = "#chr\tstart\tend\tCN\tobservedCoverage\texpectedCoverage"
    # NOTE(review): the last column name is misspelled in the shifted files;
    # kept byte-identical because downstream readers are not visible here.
    shifted_header = "#chr\tstart\tend\tCN\tobservedCoverage\texpectedCovearge"

    out_file = os.path.join(out_dir, out_filename)
    out_file_bin = re.sub(".bed$", "_bin.bed", out_file)
    hap_coverage = 0
    if cnv_type == "canvas":
        cnv_file, tumor_purity, hap_coverage = init_param_canvas_file(cnv_file, out_file)
    elif cnv_type == "truth":
        cnv_file = init_param_truth_file(cnv_file, out_dir, hg_file)
    rm_files = [cnv_file]
    print("Tumor purity: %f" % tumor_purity)

    # Exclude regions
    if excl_file is not None:
        if not os.path.exists(excl_file):
            print("File for excluded regions %s does not exist\n" % excl_file)
        else:
            cmd = "%s subtract -a %s -b %s" % (bedtools, cnv_file, excl_file)
            cnv_file = run_bedtools_cmd(cmd, cnv_file, "_excl")
            rm_files.append(cnv_file)

    # Extract partitioned file
    extr_part_file = re.sub(".bed$", "_partitioned.bed", out_file)
    gunzip = subprocess.Popen("gunzip -c %s > %s" % (part_file, extr_part_file), shell=True)
    gunzip.wait()

    # Intersect variants file with bin file
    cmd = "%s intersect -a %s -b %s -wa -wb" % (bedtools, cnv_file, extr_part_file)
    intersect_file = run_bedtools_cmd(cmd, out_file, "_int")
    rm_files.extend([intersect_file, extr_part_file])

    with open(intersect_file, "r") as ifile:
        int_rows = [line.strip().split("\t") for line in ifile]

    if hap_coverage == 0:
        # No coverage supplied: use half the median bin coverage (column 8)
        # over bins whose segment copy number (column 4) equals 2.
        diploid_cov = [float(row[7]) for row in int_rows if float(row[3]) == 2]
        hap_coverage = numpy.median(diploid_cov) / 2
        print("Haplotype coverage: %f (median)" % hap_coverage)

    # Per-bin table
    wlines = [header]
    for row in int_rows:
        (_chr_segm, _start_segm, _end_segm, cn_segm,
         chr_bin, start_bin, end_bin, cov_bin, _segm_bin) = row
        expected = _expected_coverage(cn_segm, tumor_purity, hap_coverage)
        wlines.append("\t".join([chr_bin, start_bin, end_bin, cn_segm, cov_bin, str(expected)]))
    with open(out_file_bin, "w") as ofile:
        ofile.write("\n".join(wlines))

    # Sort chromosome intersect file
    cmd = "sort -k 1,1 -k2,2n %s" % intersect_file
    sorted_file = run_bedtools_cmd(cmd, out_file, "_sort")
    rm_files.append(sorted_file)

    # Merge intersect file (median observed coverage).  bedtools merge needs
    # position-sorted input, so feed it the sorted file (previously the
    # unsorted intersect file was passed and the sorted one went unused).
    # The output name is still derived from intersect_file.
    cmd = "%s merge -i %s -d -1 -c 4,8 -o mean,median" % (bedtools, sorted_file)
    merged_file = run_bedtools_cmd(cmd, intersect_file, "_merg")
    rm_files.append(merged_file)

    # Read the merged segments once; reused for the shifted tables below.
    with open(merged_file, "r") as mfile:
        merged_rows = [line.strip() for line in mfile]

    # Calculate expected coverage for each segment
    with open(out_file, "w") as ofile:
        ofile.write("\n".join(_segment_table(merged_rows, header, tumor_purity, hap_coverage)))

    # Calculate expected coverage for each segment with shifted copy number
    if add_shift:
        for cn_shift in [-2, -1, 1, 2]:
            shift_file = re.sub(".bed$", "_CNshift_" + str(cn_shift) + ".bed", out_file)
            shifted = _segment_table(merged_rows, shifted_header, tumor_purity,
                                     hap_coverage, cn_shift)
            with open(shift_file, "w") as ofile:
                ofile.write("\n".join(shifted))

    # Remove temporary files (synchronously, instead of un-waited "rm" children)
    if not save_tmp:
        for rmf in rm_files:
            _remove_tmp_file(os.path.join(out_dir, rmf))
    print("Output bed file written")

    if not add_perc:
        return

    # The simulation directory name is embedded in the output file name:
    # drop the first 4 characters and the ".bed" (or "_truth.bed") ending.
    base_name = os.path.basename(out_file)
    sim_root = "/illumina/scratch/tmp/users/ccolombo/simulation/"
    if base_name[:-4].endswith("_truth"):
        sim_dir = sim_root + base_name[4:-10]
    else:
        sim_dir = sim_root + base_name[4:-4]
    if not os.path.exists(sim_dir):
        print("Cannot find simulation directory")
        return

    # For each variant: clonal percentage, coverage of overlapping segments
    perc_file = create_var_perc_file(sim_dir, None, True)
    cmd = "%s intersect -a %s -b %s -wa -wb" % (bedtools, perc_file, out_file)
    run_bedtools_cmd(cmd, out_file, "_perc")

    # For the whole genome: clonal percentage, coverage of overlapping bins
    cmd = "%s complement -i %s -g %s" % (bedtools, perc_file, hg_file)
    pc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                          shell=True, universal_newlines=True)
    # communicate() drains both pipes and reaps the child process.
    compl_out, _ = pc.communicate()
    with open(perc_file, "r") as pbed:
        lines = pbed.readlines()
    for line in compl_out.splitlines():
        # Regions without variants get three constant columns: 1, 1, 1.0.
        lines.append("\n" + line.rstrip() + "\t1\t1\t1.0")
    with open(perc_file, "w") as pbed:
        pbed.writelines(lines)

    cmd = "%s intersect -a %s -b %s -wa -wb" % (bedtools, perc_file, out_file_bin)
    run_bedtools_cmd(cmd, out_file_bin, "_perc_all")
    _remove_tmp_file(perc_file)
    print("Output bed file with variants percentages written")
def main():
    """Evaluate Canvas results on simulated data.

    Command line options (mutually exclusive, exactly one is required):

    * ``-s`` / ``--canvas_sim_id``: evaluate every directory below the Canvas
      results root whose name starts with this id.
    * ``-d`` / ``--canvas_sim_dir``: evaluate this single Canvas results
      directory.

    For each results directory the matching simulation directory is derived
    by replacing ``/Canvas/`` with ``/simulation/`` in its path.  Truth and
    variant-percentage BED files are built from the simulation, the
    simulation parameters are written once, and EvaluateCNV is run twice:
    on all variants and on heterogeneous variants only.  Result files that
    already exist are not recomputed.
    """
    parser = OptionParser()
    parser.add_option(
        "-s", "--canvas_sim_id", dest="canvas_sim_id", type="string",
        help="simulation id (find Canvas results directories strating with sim_id")
    parser.add_option(
        "-d", "--canvas_sim_dir", dest="canvas_sim_dir", type="string",
        help="Canvas results directory")
    options, _ = parser.parse_args()

    sim_id_opt = options.canvas_sim_id
    sim_dir_opt = options.canvas_sim_dir
    if sim_dir_opt is None and sim_id_opt is None:
        parser.error("Specify Canvas results simulation id or directory")
    if sim_dir_opt is not None and sim_id_opt is not None:
        parser.error("Canvas results simulation id and directory are mutually exclusive options")

    if sim_dir_opt is not None:
        # A trailing slash (e.g. from shell completion) would make the last
        # path component empty, so the result id would be "" and the outputs
        # would be written straight into the evaluation root directory.
        res_dirs = [sim_dir_opt.rstrip("/") or sim_dir_opt]
    else:
        canvas_dir = "/illumina/scratch/tmp/users/ccolombo/Canvas/"
        res_dirs = [
            os.path.join(canvas_dir, cdir)
            for cdir in os.listdir(canvas_dir)
            if cdir.startswith(sim_id_opt)
            and os.path.isdir(os.path.join(canvas_dir, cdir))
        ]
    if not res_dirs:
        print("No Canvas results directories found")

    hg_file = "/home/ccolombo/filtered_human.hg19.genome"
    eval_root = "/illumina/scratch/tmp/users/ccolombo/evaluation/EvaluateCNV"
    base_excl_file = "/illumina/scratch/tmp/users/ccolombo/evaluation/sim_filter.bed"
    # Chromosomes that are never added to the "only heterogeneous" filter.
    skipped_chroms = ("chrX", "chrY", "chrM")

    for res_dir in res_dirs:
        print(res_dir)
        res_name = res_dir.split("/")[-1]
        sim_dir = re.sub("/Canvas/", "/simulation/", res_dir)
        res_id = re.sub("sim", "ev", res_name)

        if not os.path.exists(res_dir):
            print("\nCanvas results directory %s does not exist\n\n" % res_dir)
            continue
        # Fixed: this guard used to re-test res_dir, so a missing simulation
        # directory was never detected here.
        if not os.path.exists(sim_dir):
            print("\nSimulation directory %s does not exist\n\n" % sim_dir)
            continue
        # Results named simNorm* or ending in _purity100 are not evaluated.
        if os.path.basename(res_dir).startswith("simNorm") or res_dir.endswith("_purity100"):
            continue

        out_dir = os.path.join(eval_root, res_id)
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        truth_file = create_het_cn_file(
            sim_dir, os.path.join(out_dir, "het_truth_file.bed"),
            no_dipl=True, hap_split=True, round=True)
        truth_file = add_BED_complement(
            truth_file, hg_file, sort=False, out_dir=out_dir, hap_split=True)
        perc_file = create_var_perc_file(
            sim_dir, os.path.join(out_dir, "var_perc.bed"), no_dipl=True)

        # Write simulation parameters to file
        par_file = os.path.join(out_dir, res_id + "_par.txt")
        if not os.path.exists(par_file):
            get_sim_params(sim_dir, truth_file, perc_file, par_file, hg_file)

        # Run EvaluateCNV
        all_out_file = os.path.join(out_dir, res_id + ".txt")
        if not os.path.exists(all_out_file):
            evaluate_CNV(res_dir, truth_file, base_excl_file, all_out_file)

        # Run EvaluateCNV only on heterogeneous variants
        het_out_file = os.path.join(out_dir, res_id + "_onlyhet.txt")
        if os.path.exists(het_out_file):
            continue

        with open(base_excl_file, "r") as ef:
            excl_vars = ef.readlines()
        with open(perc_file, "r") as pf:
            for line in pf:
                chr_id, start, end, _cn_a, _cn_b, perc = line.strip().split("\t")
                # Variants with percentage >= 0.8 are added to the exclusion
                # list so that only the remaining ones are evaluated.
                if float(perc) >= 0.8 and chr_id not in skipped_chroms:
                    excl_vars.append("\t".join([chr_id, start, end]) + "\n")
        try:
            os.remove(perc_file)
        except OSError as err:
            # Best-effort cleanup, as with the former shell "rm".
            print("Could not remove %s: %s" % (perc_file, err))

        het_excl_file = os.path.join(out_dir, "filter_onlyhet.bed")
        with open(het_excl_file, "w") as wf:
            wf.writelines(excl_vars)
        evaluate_CNV(res_dir, truth_file, het_excl_file, het_out_file)