def readBafSamples(baffile, bafrawdata):
    """Fill ``bafrawdata`` with per-sample BAF values for the current patient.

    The file that is read is ``BAF_dir + patient + "_BAF.txt"`` (passed through
    ``readlink`` first when ``BAF_links`` is true).  ``patient``, ``BAF_dir``,
    ``BAF_links`` and ``showgraphs`` are module-level names, not parameters.

    File layout as parsed here (whitespace separated):
      * any line containing "Chr" is the header; sample names are the text
        between the first pair of double quotes of its fields 2 onward;
      * on every other line, field 1 is the quoted chromosome, field 2 the
        integer position, and field ``p`` (p >= 3) is the value for the sample
        named by header field ``p - 1``.

    Args:
        baffile: Not read; the path is always rebuilt from the module-level
            names above.  Kept so existing callers continue to work.
        bafrawdata: ``{chr: {pos: {}}}`` mapping, updated in place.  Only
            chr/pos pairs already present receive ``sample -> float`` entries.

    Returns:
        ``(allbafs, allsamples)`` where ``allbafs`` has the keys "cnvi"
        (values from lines containing "cnvi") and "normal" (all other stored
        values), and ``allsamples`` lists the sample names from the header.
        Returns ``None`` when the BAF file does not exist.
    """
    if BAF_links:
        path = readlink(BAF_dir + patient + "_BAF.txt")
    else:
        path = BAF_dir + patient + "_BAF.txt"
    if not isfile(path):
        print("ERROR: no BAF file found for patient", patient)
        # The caller's bafrawdata is left untouched: the old
        # ``bafrawdata = {}`` here only rebound the local name.
        return
    print("Reading BAF sample data for patient", patient)

    labels = []
    allsamples = []
    allbafs = {"cnvi": [], "normal": []}
    # The context manager closes the file even if a row fails to parse.
    with open(path, "r") as infile:
        for line in infile:
            lvec = line.split()
            if not lvec:
                # Blank lines carry no data and would otherwise crash below.
                continue
            if "Chr" in line:
                labels = lvec
                allsamples.extend(label.split('"')[1] for label in labels[2:])
                continue
            chrom = lvec[1].split('"')[1]
            pos = int(lvec[2])
            if chrom not in bafrawdata or pos not in bafrawdata[chrom]:
                continue
            # The whole row is classified once; it does not change per column.
            category = "cnvi" if "cnvi" in line else "normal"
            for p in range(3, len(lvec)):
                sample = labels[p - 1].split('"')[1]
                try:
                    value = float(lvec[p])
                except ValueError:
                    # Non-numeric entries (e.g. "NA") are skipped.
                    continue
                bafrawdata[chrom][pos][sample] = value
                allbafs[category].append(value)

    if showgraphs:
        print("Sample CNVI bafs for normal CNVIs that were 0.5 in wt:")
        lsl.createPrintAndSaveHistogram(allbafs["cnvi"], "", .01)
    return allbafs, allsamples
loss_data.append(avg_log2r) #loss_data.append(avg_log2r) elif (call == "wt"): wt_data.append(avg_log2r) elif (call == "Gain"): gain_data.append(avg_log2r) elif (call == "Balanced_gain"): balanced_gain_data.append(avg_log2r) else: print "Unknown call ", call binwidth = 0.001 print "Double-loss from doubled genomes histogram:" lsl.createPrintAndSaveHistogram( double_loss_from_doubled_data, "CN_rejoined_histograms/double_loss_from_doubled_hist.txt", binwidth) print "Loss from doubled genomes histogram:" lsl.createPrintAndSaveHistogram( loss_from_doubled_data, "CN_rejoined_histograms/loss_from_doubled_hist.txt", binwidth) print "Double-loss histogram:" lsl.createPrintAndSaveHistogram(double_loss_data, "CN_rejoined_histograms/double_loss.txt", binwidth) print "Loss histogram:" lsl.createPrintAndSaveHistogram(loss_data, "CN_rejoined_histograms/loss.txt", binwidth)
if lvec[n] != "": if call not in data[group]: data[group][call] = [] data[group][call].append(float(lvec[n])) for label in data: for call in data[label]: if len(data[label][call]) == 0: continue if len(data[label][call]) < 100: #Skip groups with fewer than 100 VAFs. continue filename = patient + "_" + sample + "_" + makeFilename( label) + "_" + str(call[0]) + "_" + str(call[1]) + "_hist.png" hist = lsl.createPrintAndSaveHistogram(data[label][call], filename, 0.001, xdata="VAF", savefig=False, show=False) mean = numpy.mean(data[label][call]) stdev = numpy.std(data[label][call]) histmaxes = getHistMaxes(hist) print(patient, sample, label, call) ###THIS IS WHERE YOU FIND THE HISTOGRAM PEAKS### ##Data: data[label][call] ##Peaks: histmaxes ##Peak heights: hist[histmaxes[n]] ##Stdev: stdev emdata = mixture.DataSet() emdata.fromList(data[label][call]) numpeaks = len(histmaxes)
elif (nmarkers < 21): data13_20.append(meanlog2r) elif (nmarkers < 51): data21_50.append(meanlog2r) elif (nmarkers < 501): data51_500.append(meanlog2r) elif (nmarkers < 5001): data501_5000.append(meanlog2r) elif (nmarkers < 50001): data5001_50000.append(meanlog2r) elif (nmarkers < 500001): data50001_plus.append(meanlog2r) binwidth = 0.001 lsl.createPrintAndSaveHistogram(data10_12, "full_segmentation_histograms/data10_12.txt", binwidth) lsl.createPrintAndSaveHistogram(data13_20, "full_segmentation_histograms/data13_20.txt", binwidth) lsl.createPrintAndSaveHistogram(data21_50, "full_segmentation_histograms/data21_50.txt", binwidth) lsl.createPrintAndSaveHistogram(data51_500, "full_segmentation_histograms/data51_500.txt", binwidth) lsl.createPrintAndSaveHistogram( data501_5000, "full_segmentation_histograms/data501_5000.txt", binwidth) lsl.createPrintAndSaveHistogram( data5001_50000, "full_segmentation_histograms/data5001_50000.txt", binwidth)
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 29 16:18:22 2016

@author: lpsmith

Read one integer per line from prunebreaks.txt, print the largest value and
histogram all of them into mary_out.txt.
"""

import lucianSNPLibrary as lsl

# The context manager closes the input file even if a line fails to parse;
# blank lines (e.g. a trailing newline) are skipped instead of crashing int().
with open("prunebreaks.txt", "r") as prune:
    data = [int(line) for line in prune if line.strip()]

# Parenthesised form prints the same text under Python 2 and Python 3.
print(max(data))

binwidth = 1
label = "position"
lsl.createPrintAndSaveHistogram(data, "mary_out.txt", binwidth, xdata=label)
for validity in valid_out: allinvalid.write(str(validity) + "\t") length = int(segid[2]) - int(segid[1]) invalidlengths[int(segid[3])].append(length) allinvalid.write(str(length) + "\n") outfile.close() allvalid.close() allinvalid.close() for n in range(8): if n==1: continue if len(validlengths[n]) > 10: print("Valid lengths for copy number call", str(n)) print(" number of calls:", str(len(validlengths[n]))) print(" mean: ", str(np.mean(validlengths[n]))) print(" median: ", str(np.median(validlengths[n]))) print(" stdev: ", str(np.std(validlengths[n]))) if len(validlengths[n]) > 30: x = lsl.createPrintAndSaveHistogram(validlengths[n], "", 10000) if len(invalidlengths[n]) > 10: print("Invalid lengths for copy number call", str(n)) print(" number of calls:", str(len(invalidlengths[n]))) print(" mean: ", str(np.mean(invalidlengths[n]))) print(" median: ", str(np.median(invalidlengths[n]))) print(" stdev: ", str(np.std(invalidlengths[n]))) if len(invalidlengths[n]) > 30: y = lsl.createPrintAndSaveHistogram(invalidlengths[n], "", 10000)
if bal_calls[chr][b_seg] == "Balanced": allbal.append(rawA) # print("Balanced:", chr, ub_seg) else: allunbal.append(rawA) # print("Unbalanced:", chr, ub_seg) allbal = [] allunbal = [] files = [] for (__, __, f) in walk(CNdir): files += f for f in files: if "nonint" not in f: continue (patient, sample, ploidy) = f.split("_")[0:3] if onlysomepatients and patient not in somepatients: continue unbal_calls = readAmbiguousCallsFromASCAT(f) bal_calls = lsl.readBalancedCalls(patient, sample) print("Comparing", patient, sample, ploidy) compareAndReport(unbal_calls, bal_calls, patient, sample, ploidy, allbal, allunbal) print("All balanced values:") lsl.createPrintAndSaveHistogram(allbal, "", 0.001) print("All unbalanced values:") x = lsl.createPrintAndSaveHistogram(allunbal, "", 0.001, axis=[0, 5, 0])
def combineTwoBafs(patient1, patient2):
    """Merge the BAF data of two patients at positions both have in common.

    Step 1 reads ``<patient>_Normal_BAF.txt`` for each patient.  Values outside
    [0.1, 0.9] are ignored; the rest are collected in ``allwtbafs``; those also
    inside [``bafWtLow``, ``bafWtHigh``] mark the chr/pos as retained.
    Step 2 keeps only patient 1's retained positions that patient 2 retained
    as well.
    Step 3 reads ``<patient>_BAF.txt`` for each patient and stores every
    parseable sample value at the surviving positions.
    Step 4 writes the combined table to ``outdir + "two_patients_input.txt"``
    (tab separated, "NA" where a sample has no value).

    ``BAF_dir``, ``BAF_links``, ``bafWtLow``, ``bafWtHigh``, ``showgraphs`` and
    ``outdir`` are module-level names.

    Args:
        patient1: Identifier used to build the first patient's file names.
        patient2: Identifier used to build the second patient's file names.

    Returns:
        ``(bafrawdata, bafwt, allbafs, allwtbafs, allsamples)`` on success,
        where ``bafrawdata`` is ``{chr: {pos: {sample: value}}}``, ``bafwt`` is
        patient 1's ``{chr: {pos: normal value}}`` (before the step-2
        filtering), and ``allsamples`` lists sample names of both patients.
        Returns ``({}, {})`` when a normal BAF file is missing and ``None``
        when a sample BAF file is missing (both unchanged from before).
    """
    brd = {}
    bafwt = {}
    allwtbafs = []
    for patient in (patient1, patient2):
        brd[patient] = {}
        bafwt[patient] = {}
        if BAF_links:
            normalpath = readlink(BAF_dir + patient + "_Normal_BAF.txt")
        else:
            normalpath = BAF_dir + patient + "_Normal_BAF.txt"
        if not isfile(normalpath):
            print("ERROR: no Normal BAF file found for patient", patient)
            return ({}, {})
        print("Reading BAF normal data for patient", patient)
        with open(normalpath, "r") as bafnormal:
            for line in bafnormal:
                if "Chr" in line:
                    continue
                lvec = line.split()
                try:
                    value = float(lvec[3])
                except (ValueError, IndexError):
                    # Non-numeric or short rows carry no usable value.
                    continue
                if value < 0.1 or value > 0.9:
                    continue
                allwtbafs.append(value)
                if value < bafWtLow or value > bafWtHigh:
                    continue
                chrom = lvec[1].split('"')[1]
                pos = int(lvec[2])
                brd[patient].setdefault(chrom, {})[pos] = {}
                bafwt[patient].setdefault(chrom, {})[pos] = value

    if showgraphs:
        compareNormalBafs(bafwt, patient1, patient2)
        lsl.createPrintAndSaveHistogram(allwtbafs, "", 0.01)

    # Drop patient 1 positions that patient 2 did not retain.  ``.get`` covers
    # a chromosome that patient 2 has no retained position on at all, which
    # used to raise KeyError.
    for chrom, positions in brd[patient1].items():
        shared = brd[patient2].get(chrom, {})
        for pos in list(positions):
            if pos not in shared:
                del positions[pos]
    bafrawdata = brd[patient1]

    allsamples = []
    for patient in (patient1, patient2):
        labels = []
        if BAF_links:
            samplepath = readlink(BAF_dir + patient + "_BAF.txt")
        else:
            samplepath = BAF_dir + patient + "_BAF.txt"
        if not isfile(samplepath):
            print("ERROR: no BAF file found for patient", patient)
            return
        print("Reading BAF sample data for patient", patient)
        # NOTE(review): allbafs is recreated for every patient, so the value
        # returned below only holds the second patient's numbers.  Behaviour
        # kept as is - confirm whether it should accumulate across patients.
        allbafs = {"cnvi": [], "normal": []}
        with open(samplepath, "r") as baffile:
            for line in baffile:
                lvec = line.split()
                if "Chr" in line:
                    # Header: sample names sit between double quotes.
                    labels = lvec
                    allsamples.extend(field.split('"')[1] for field in lvec[2:])
                    continue
                chrom = lvec[1].split('"')[1]
                pos = int(lvec[2])
                if chrom not in bafrawdata or pos not in bafrawdata[chrom]:
                    continue
                category = "cnvi" if "cnvi" in line else "normal"
                for p in range(3, len(lvec)):
                    # Data rows have one more leading field than the header.
                    sample = labels[p - 1].split('"')[1]
                    try:
                        value = float(lvec[p])
                    except ValueError:
                        continue
                    bafrawdata[chrom][pos][sample] = value
                    allbafs[category].append(value)

    # The context manager guarantees the table is flushed and closed.
    with open(outdir + "two_patients_input.txt", "w") as outfile:
        outfile.write("Chr\tpos")
        for sample in allsamples:
            outfile.write("\t" + sample)
        outfile.write("\n")
        for chrom in bafrawdata:
            for pos in bafrawdata[chrom]:
                values = bafrawdata[chrom][pos]
                outfile.write(chrom + "\t" + str(pos))
                for sample in allsamples:
                    outfile.write("\t")
                    outfile.write(str(values[sample]) if sample in values else "NA")
                outfile.write("\n")

    return bafrawdata, bafwt[patient1], allbafs, allwtbafs, allsamples
Created on Thu Jul 7 10:33:59 2016 @author: lpsmith """ from __future__ import division from os import walk import lucianSNPLibrary as lsl import numpy # read the filtered data that compares Xiaohong's segmentation data with raw SNP data #filenames = ["1049_20780_avglog2rs.txt", "1049_20782_avglog2rs.txt"] filename = "diseqs.txt" all_data = [] file = open(filename, "r") id = 0 all_data = [] for line in file: id += 1 all_data = numpy.array(map(float, line.rstrip().split())) #binwidth = (max(all_data) - min(all_data))/100 #binwidth = pow(10,int(numpy.floor(numpy.log10(abs(binwidth))))) binwidth = 0.001 lsl.createPrintAndSaveHistogram(all_data, "diseq_" + str(id) + ".txt", binwidth, xdata="diseq")
for line in segmented_file: if (line.find("chr") != -1): continue this_line = patient + "\t" + sample + "\t" + line (chr, start, end, xLog2r, call, nlog2r, log2r, stdev) = line.rstrip().split() chr = int(chr) if (chr != 9): continue start = int(start) if (start > 21995301): continue if (end=="inf"): end = 30000000 #Greater than the end of the gene end = int(end) if (end < 21967752): continue all_data.append(float(log2r)) if call == "Loss": loss_data.append(float(log2r)) elif call == "Double_d": double_loss_data.append(float(log2r)) elif call == "wt": wt_data.append(float(log2r)) outfile.write(this_line) outfile.close() lsl.createPrintAndSaveHistogram(double_loss_data, "short_segments/p16_double" + rejoin + ".txt", 0.001, axis=(-3.5, 1.5, 0)) lsl.createPrintAndSaveHistogram(loss_data, "short_segments/p16_loss" + rejoin + ".txt", 0.001, axis=(-3.5, 1.5, 0)) lsl.createPrintAndSaveHistogram(wt_data, "short_segments/p16_wt" + rejoin + ".txt", 0.001, axis=(-3.5, 1.5, 0)) lsl.createPrintAndSaveHistogram(all_data, "short_segments/p16_all" + rejoin + ".txt", 0.001, axis=(-3.5, 1.5, 0))
for f in flist: if f.find(".spstats") == -1: continue (patient, sample, tag) = f.split("_") statfile = open(directory + f, "r") for line in statfile: if line.find("expands") != -1: continue if line.find("Mean") != -1: continue splitline = line.split() if len(splitline) < 2: continue noisevals.append(float(splitline[1])) statfile.close() if (len(noisevals) > 0): outfile.write(directory + "\t") outfile.write(str(numpy.average(noisevals)) + "\t") outfile.write(str(numpy.std(noisevals)) + "\t") outfile.write(str(numpy.median(noisevals)) + "\t") outfile.write(str(numpy.max(noisevals)) + "\t") outfile.write(str(numpy.min(noisevals)) + "\n") lsl.createPrintAndSaveHistogram(noisevals, "noiseout.txt", 0.01, xdata="noise") outfile.close() of = open("noisevals.txt", "w") of.write(str(noisevals)) of.close()
else: print "Unknown call ", call rangestr = "_" if (use_max): rangestr += "only_" + str(nsamples_min) + "-" + str(nsamples_max) + "_" if (use_length): rangestr = "_only_" + str(length_min) + "-" + str(length_max) + "_" if (use_baf): print "Double-loss histograms:" index = 0 combined_data = [] for dataset in double_loss_data: lsl.createPrintAndSaveHistogram( dataset, output_directory + "double_loss_hist" + rangestr + str(index) + ".txt", g_binwidth) combined_data += dataset index += 1 lsl.createPrintAndSaveHistogram( combined_data, output_directory + "double_loss_hist" + rangestr + "all.txt", g_binwidth) combined_data = [] print "Loss histograms:" index = 0 for dataset in loss_data: lsl.createPrintAndSaveHistogram( dataset, output_directory + "loss_hist" + rangestr + str(index) + ".txt",
for call in calls: overview_out.write("\t" + str(overview_bases[sample][analysis][call])) writeDerivedStatistic(overview_out, overview, sample, analysis, ["TP", "TN"], ["FP", "FN"]) writeDerivedStatistic(overview_out, overview_bases, sample, analysis, ["TP", "TN"], ["FP", "FN"]) overview_out.write("\n") overview_out.close() (Xiaohong_segments, X_totsca) = readAllXiaohongSegmentation() files = [] for (__, __, f) in walk(BAF_dir): files += f for f in files: if f.find("_Normal_BAF.txt") == -1: continue patient = f.split("_")[0] if (onlysomepatients and patient not in somepatients): continue bafrawdata, bafwt = readBafNormal(patient) if (len(bafrawdata) == 0): continue readBafSamples(patient, bafrawdata) lsl.createPrintAndSaveHistogram(allbafs['1m'], "1M BAFs", .01) lsl.createPrintAndSaveHistogram(allbafs['25m'], "2.5M BAFs", .01)
if intA == 2: data_22.append(log2r) elif intA == 3: data_23.append(log2r) elif intB == 3: if intA == 3: data_33.append(log2r) rangestr = "_" + str(nsamples_min) + "-" + str(nsamples_max) + "_" thisaxis = [-3.5, 1.5, 0] print "Double-loss histograms:" lsl.createPrintAndSaveHistogram( double_loss_data, "ASCAT_smoothed_histograms/double_loss_hist_a" + rangestr + ".txt", g_binwidth, axis=[-3.5, 1.5, 0]) print "Loss histograms:" lsl.createPrintAndSaveHistogram(loss_data, "ASCAT_smoothed_histograms/loss_hist_a" + rangestr + "all.txt", g_binwidth, axis=thisaxis) print "WT histograms:" lsl.createPrintAndSaveHistogram(wt_data, "ASCAT_smoothed_histograms/wt_hist_a" + rangestr + "all.txt", g_binwidth,
if f.find("LOH") != -1: readXiaohongWGSLOHFile(Xdir_WGS + f, Xiaohong_segments, totsca) else: readXiaohongCopynumFile(Xdir_WGS + f, Xiaohong_segments, totsca) files = [] for (__, __, f) in walk(Xdir_1M): files += f for f in files: if f.find("read") != -1: continue if f.find("LOH") != -1: readXiaohong1MLOHFile(Xdir_1M + f, Xiaohong_segments, totsca) else: readXiaohongCopynumFile(Xdir_1M + f, Xiaohong_segments, totsca) return Xiaohong_segments, totsca Xsegs, totsca = readAllXiaohongSegmentation() alldiffs = [] for patient in Xsegs: for sample in Xsegs[patient]: for chr in Xsegs[patient][sample]: Xsegs[patient][sample][chr].sort() for i in range(1, len(Xsegs[patient][sample][chr])): endlast = Xsegs[patient][sample][chr][i - 1][1] startnext = Xsegs[patient][sample][chr][i][0] if endlast < startnext: alldiffs.append(numpy.log10(startnext - endlast)) lsl.createPrintAndSaveHistogram(alldiffs, "", 0.01)
loss_from_doubled_data.append(avg_log2r) else: loss_data.append(avg_log2r) #loss_data.append(avg_log2r) elif (call == "wt"): wt_data.append(avg_log2r) elif (call == "Gain"): gain_data.append(avg_log2r) elif (call == "Balanced_gain"): balanced_gain_data.append(avg_log2r) else: print "Unknown call ", call lsl.createPrintAndSaveHistogram(double_loss_from_doubled_data, outdir + str(patient) + "_" + str(sample) + "_smoothhist.txt", binwidth, show=False) print "Double-loss from doubled genomes histogram:" lsl.createPrintAndSaveHistogram(double_loss_from_doubled_data, outdir + "double_loss_from_doubled_hist" + srange + ".txt", binwidth, axis=(-3.5, 1.5, 0)) print "Loss from doubled genomes histogram:" lsl.createPrintAndSaveHistogram(loss_from_doubled_data, outdir + "loss_from_doubled_hist" + srange + ".txt", binwidth,
intersection_file.write(patient) intersection_file.write("\t" + sample) intersection_file.write("\t" + chrom) intersection_file.write("\t" + str(pos)) intersection_file.write("\n") elif compare == "dip_CNLOH": dip_CNLOH_VAFs.append(VAF) intersection_file.close() if just_intersection: continue if not justdip: #print("All VAFs for patient", patient, "sample", sample, ":") hist = lsl.createPrintAndSaveHistogram(allVAFs, VAFpngdir + patient + "_" + sample + "_VAF_hist", 0.001, xdata="VAF", show=runLocally, savefig=True, axis=(0, 1.1, 0)) #print("VAFs for positions called 1/1 in diploid and 2/2 in tetraploid,", patient, "sample", sample, ":") hist = lsl.createPrintAndSaveHistogram(twovfour_VAFs, VAF2v4dir + patient + "_" + sample + "_2v4_VAF_hist", 0.001, xdata="VAF", show=runLocally, savefig=True, axis=(0, 1.1, 0)) #print("VAFs for positions called 01 in diploid but more in tetraploid,", patient, "sample", sample, ":") hist = lsl.createPrintAndSaveHistogram(onevtwo_VAFs, VAF1v2dir + patient + "_" +
mismatchkeys.append(key) else: matchkeys.append(key) for key in matchkeys: outfile.write("\t" + str(key)) for key in mismatchkeys: outfile.write("\t" + str(key)) outfile.write("\n") for (patient, sample) in allcomparisons: comparison = allcomparisons[(patient, sample)] outfile.write(patient + "\t" + sample) for key in matchkeys: if key in comparison: outfile.write("\t" + str(comparison[key])) else: outfile.write("\t0") for key in mismatchkeys: if key in comparison: outfile.write("\t" + str(comparison[key])) else: outfile.write("\t0") outfile.write("\n") outfile.close() # lengthvec = [20, 50, 100, 1000, 10000, 100000, 1000000] # seg1binnedlengths = binLengths(lengthvec, seg1lengths) # seg2binnedlengths = binLengths(lengthvec, seg2lengths) lsl.createPrintAndSaveHistogram(numpy.log10(seg1lengths), comparison_dir + file1 + "_seglengths.txt", 0.01, xdata="Segment lengths", axis=(), show=True) lsl.createPrintAndSaveHistogram(numpy.log10(seg2lengths), comparison_dir + file2 + "_seglengths.txt", 0.01, xdata="Segment lengths", axis=(), show=True)
def _recordPair(counts, sca, segpair, chrsegrange):
    """Count ``chrsegrange`` once for each sample of ``segpair`` and for "overall"."""
    for key in (segpair[0], segpair[1], "overall"):
        counts[key] += 1
        sca[key].add(chrsegrange)


def processEvidence(evidence, balanced_evidence, osegs, allsamples):
    """Classify sample-pair match evidence per segment as good, bad or missed.

    ``evidence`` and ``balanced_evidence`` are indexed as
    ``[chr][(start, end)][segpair] -> (match, antimatch)``.  For each pair the
    ratio ``match / (match + antimatch)`` is computed (replaced by
    ``antimatch / total`` when the module-level ``mirror_percentages`` is true
    and antimatch is larger).

    * ``evidence`` pairs with fewer than 10 total observations are skipped.
      A ratio strictly between 0.05 and 0.95 is tallied as "bad", anything
      at or beyond either tail as "good".
    * ``balanced_evidence`` pairs with fewer than 20 total observations are
      skipped.  A ratio >= 0.95 or <= 0.05 is tallied as "missed".

    Module-level names read here: ``mirror_percentages``,
    ``onepatientsamples``, ``isegs``, ``subdir``, ``showgraphs``,
    ``twopatientcompare``.

    Args:
        evidence: Match counts, indexed as described above.
        balanced_evidence: Same layout; must contain every chr/segment key
            present in ``evidence``.
        osegs: Not read by this function.  NOTE(review): the segment lookup
            uses the module-level ``isegs`` - confirm that is intended.
        allsamples: Sample names that receive a tally entry.

    Returns:
        ``(good_samples, bad_samples, missed_samples, good_sca, bad_sca,
        missed_sca)``: three ``{sample: count}`` dicts and three
        ``{sample: set of (chr, start, end)}`` dicts, each with an extra
        "overall" key.
    """
    keys = list(allsamples) + ["overall"]
    good_samples = {key: 0 for key in keys}
    bad_samples = {key: 0 for key in keys}
    missed_samples = {key: 0 for key in keys}
    good_sca = {key: set() for key in keys}
    bad_sca = {key: set() for key in keys}
    missed_sca = {key: set() for key in keys}

    # Collected only for the optional histograms below.
    balanced_percs = []
    unbalanced_percs = []
    crosscheck_percs = []
    allbal_percs = []
    balpercs_crosspatient = []
    balpercs_inpatient = []
    unbalpercs_crosspatient = []
    unbalpercs_inpatient = []

    for chrom in evidence:
        for isegrange in evidence[chrom]:
            chrsegrange = (chrom, isegrange[0], isegrange[1])

            for segpair, (match, antimatch) in evidence[chrom][isegrange].items():
                tot = match + antimatch
                if tot < 10:
                    continue
                perc = match / tot
                if mirror_percentages and antimatch > match:
                    perc = antimatch / tot
                unbalanced_percs.append(perc)
                # In-patient: both samples on the same side of onepatientsamples.
                if (segpair[0] in onepatientsamples) == (segpair[1] in onepatientsamples):
                    unbalpercs_inpatient.append(perc)
                else:
                    unbalpercs_crosspatient.append(perc)
                # Was ``perc<0.95 or perc<0.05``: the second clause was
                # redundant, so with mirroring off a near-0 ratio was tallied
                # "bad" although the balanced branch treats both tails as
                # strong evidence.
                if 0.05 < perc < 0.95:
                    _recordPair(bad_samples, bad_sca, segpair, chrsegrange)
                else:
                    _recordPair(good_samples, good_sca, segpair, chrsegrange)

            for segpair, (match, antimatch) in balanced_evidence[chrom][isegrange].items():
                tot = match + antimatch
                if tot < 20:
                    continue
                perc = match / tot
                if mirror_percentages and antimatch > match:
                    perc = antimatch / tot
                balanced_percs.append(perc)
                if (segpair[0] in onepatientsamples) == (segpair[1] in onepatientsamples):
                    balpercs_inpatient.append(perc)
                else:
                    balpercs_crosspatient.append(perc)
                # Samples listed under ``subdir`` for the segment starting at
                # this position; the last matching segment wins.
                unbal_samples = []
                for iseg in isegs[chrom]:
                    if iseg[0] == isegrange[0] and subdir in iseg[2]:
                        unbal_samples = iseg[2][subdir]
                if segpair[0] in unbal_samples or segpair[1] in unbal_samples:
                    crosscheck_percs.append(perc)
                else:
                    allbal_percs.append(perc)
                if perc >= 0.95 or perc <= 0.05:
                    _recordPair(missed_samples, missed_sca, segpair, chrsegrange)

    if showgraphs:
        print("All percent matches for all balanced-to-anything segments:")
        lsl.createPrintAndSaveHistogram(balanced_percs, "", .001)
        if twopatientcompare:
            print("Only in-patient percent matches for all balanced-to-anything segments:")
            lsl.createPrintAndSaveHistogram(balpercs_inpatient, "", .001)
            print("Only cross-patient percent matches for all balanced-to-anything segments:")
            lsl.createPrintAndSaveHistogram(balpercs_crosspatient, "", .001)
        print("Percent matches for all balanced-to-unbalanced checks:")
        lsl.createPrintAndSaveHistogram(crosscheck_percs, "", .001)
        print("Percent matches for all balanced-to-balanced checks:")
        lsl.createPrintAndSaveHistogram(allbal_percs, "", .001)
        print("All percent matches for unbalanced-to-unbalanced segments:")
        lsl.createPrintAndSaveHistogram(unbalanced_percs, "", .001)
        if twopatientcompare:
            print("Only in-patient percent matches for unbalanced-to-unbalanced segments:")
            lsl.createPrintAndSaveHistogram(unbalpercs_inpatient, "", .001)
            print("Only cross-patient percent matches for unbalanced-to-unbalanced segments:")
            lsl.createPrintAndSaveHistogram(unbalpercs_crosspatient, "", .001)

    return (good_samples, bad_samples, missed_samples, good_sca, bad_sca, missed_sca)
scoreAnalysis(isegs, Xiaohong_segments[patient], sample, "Xiaohong") #writeSummary(isegs, patient, all_samples, all_analyses) #writeBalanceFiles(isegs, patient, all_samples) unique_ratios = [] used_samples = [] for segpair in allratios: if segpair[0] in used_samples: continue if segpair[1] in used_samples: continue used_samples.append(segpair[0]) used_samples.append(segpair[1]) unique_ratios.extend(allratios[segpair]) lsl.createPrintAndSaveHistogram(unique_ratios, ratiodir + "unique_ratios_why95_01", .01) for segpair in allratios: outfile = open(ratiodir + segpair[0] + "_" + segpair[1], "w") for entry in allratios[segpair]: outfile.write(str(entry) + "\n") oldratios = [] for segpair in allratios: if ("360" in segpair[0] or "672_" in segpair[0]): oldratios.extend(allratios[segpair]) lsl.createPrintAndSaveHistogram(oldratios, ratiodir + "oldratios", .01) v1Mratios = [] v25Mratios = [] cross_ratios = []
continue if line.find("Mean") != -1: continue splitline = line.split() if len(splitline) < 2: continue if seglengths[(patient, sample)] < avgseg: lownoisevals.append(float(splitline[1])) else: highnoisevals.append(float(splitline[1])) statfile.close() if (len(highnoisevals) > 0): print directory, "High:", numpy.average(highnoisevals), numpy.std(highnoisevals), numpy.median(highnoisevals), numpy.max(highnoisevals), numpy.min(highnoisevals) print directory, "Low:", numpy.average(lownoisevals), numpy.std(lownoisevals), numpy.median(lownoisevals), numpy.max(lownoisevals), numpy.min(lownoisevals) lsl.createPrintAndSaveHistogram(highnoisevals, "highnoiseout.txt", 0.01, xdata="noise") lsl.createPrintAndSaveHistogram(lownoisevals, "lownoiseout.txt", 0.01, xdata="noise")
datum = [] for entry in line: entry = int(entry) datum.append(entry) data.append(datum) # classify by distances ############################################################ # write reports results = summarize(data, cisdists, transdists, nesteddists) lsl.createPrintAndSaveHistogram(cisdists, "Cis Distances", 0.5, xdata="distance", axis=(-20, 500, 0)) lsl.createPrintAndSaveHistogram(transdists, "Trans Distances", 0.5, xdata="distance", axis=(-20, 500, 0)) lsl.createPrintAndSaveHistogram(nesteddists, "Nested Distances", 0.5, xdata="distance", axis=(-20, 500, 0)) plt.ylim(0, 400) plt.hist(cisdists, 100)