def merge_calls_across_SD(calls, pipeline, max_distance_to_merge=50, min_sd_percent=0.5): out_calls = [] #pd.DataFrame(columns=list(calls.calls.columns) + ["merge_count"]) chrom_probes = {chrom: pd.DataFrame(p.r.h5file.root.probes._f_getChild("probes_chr%d" % chrom).read()) for chrom in range(1,24)} for sampleID in set(calls.calls["sampleID"]): print sampleID sample_calls = CallTable(calls.calls[calls.calls["sampleID"] == sampleID]) for chrom in set(sample_calls.calls.chromosome): chr_probes = chrom_probes[chrom] calls_to_merge = sample_calls.filter(lambda x: x["chromosome"] == chrom).calls.sort("start") if len(calls_to_merge) <= 1: # no calls to merge out_calls.append(calls_to_merge.ix[calls_to_merge.index[0]])#, ignore_index=True) else: # for each call, first check it is within the max_distance_to_merge # then check if the SD content between them is more than the min_sd_percent first_call = calls_to_merge.ix[calls_to_merge.index[0]] # start iterating on the second call for ix, second_call in calls_to_merge.ix[calls_to_merge.index[1:]].iterrows(): delta = second_call["start_exon"] - first_call["stop_exon"] if (second_call["state"] == first_call["state"]) and (delta < max_distance_to_merge): gap_sd_count = chr_probes.ix[xrange(first_call["stop_exon"],second_call["start_exon"])].isSegDup.sum() gap_sd_percent = float(gap_sd_count)/delta if gap_sd_percent >= min_sd_percent: merged = merge_calls(first_call, second_call) first_call = merged.copy() else: out_calls.append(first_call)#, ignore_index=True) first_call = second_call else: # too far apart, do not merge out_calls.append(first_call)#, ignore_index=True) first_call = second_call #if not last_call_was_merged: out_calls.append(second_call)#, ignore_index=True) out_calls = pd.DataFrame(out_calls) return CallTable(out_calls)
import argparse

if __name__ == "__main__":
    # Cluster a cohort's calls together with the ESP reference calls, then
    # keep only the original cohort's calls for output. CallTable is imported
    # earlier in this file.
    cli = argparse.ArgumentParser()
    cli.add_argument("--infile", "-i", action="store", required=True)
    cli.add_argument("--cohort", action="store", required=True)
    cli.add_argument("--esp_infile", action="store", required=True)
    cli.add_argument("--outfile", "-o", action="store", required=True)
    cli.add_argument("--gamma", type=float, action="store", required=False, default=0.9)
    cli.add_argument("--cophenetic_cutoff", type=float, action="store", required=False, default=0.85)
    opts = cli.parse_args()

    assert opts.gamma <= 1, "Gamma must be <= 1.00"
    assert opts.cophenetic_cutoff <= 1, "Cophenetic cutoffs must be <= 1.00"

    # Tag each table with its cohort of origin, then pool them.
    cohort_calls = CallTable(opts.infile)
    cohort_calls.calls["cohort"] = opts.cohort
    esp = CallTable(opts.esp_infile)
    esp.calls["cohort"] = "ESP"
    cohort_calls.appendCalls(esp)

    clustered = cohort_calls.clusterCallsByCohort(gamma=opts.gamma,
                                                  cohort_field="cohort",
                                                  cophenetic_cutoff=opts.cophenetic_cutoff)

    # Tidy the clustered table: drop the ESP CNVR ids and strip the
    # cohort suffix from the remaining CNVR columns.
    del clustered.calls["cnvrID_ESP"]
    renamed = clustered.calls.rename(columns={'cnvr_frequency_HSCR': 'cnvr_frequency',
                                              'cnvrID_HSCR': 'cnvrID'})
    final = CallTable(renamed)

    # Keep only the original cohort's calls and write them out.
    final.filter(lambda x: x["cohort"] == opts.cohort).save(opts.outfile)
# Reference-track filters/annotations. `p`, `args`, SDFilter, PPGFilter,
# SDCount and PPG_probe_count come from earlier in this file (not shown).
OtherDupFilter = CallFilterTemplate(
    p,
    "/net/eichler/vol8/home/nkrumm/REFERENCE_INFO/3copiesin27of34.bed",
    name="Dup_overlap",
    filter_type="overlap",
    func=lambda x: x < 0.5)

GeneAnnotation = CallFilterTemplate(
    p,
    "/net/eichler/vol8/home/nkrumm/REFERENCE_INFO/hg19.refGene.bed",
    name="RefSeq",
    filter_type="name")


def signalFilter(x):
    # Calls supported by fewer probes must show a stronger median signal.
    strength = np.abs(x["median_svdzrpkm"])
    if x["num_probes"] <= 2:
        return strength >= 1.5
    if x["num_probes"] <= 5:
        return strength >= 1
    return strength >= 0.5


# Apply signal/probability thresholds and reference filters, annotate, save.
calls = CallTable(args.call_file)
calls = (calls
         .filter(signalFilter)
         .filter(lambda x: x["probability"] > 0.99)
         .filter(SDFilter)
         .filter(PPGFilter)
         .filter(OtherDupFilter)
         .annotate(SDCount)
         .annotate(PPG_probe_count))
calls.save(args.outfile)
from conifertools import ConiferPipeline, CallTable, CallFilterTemplate
import numpy as np
import argparse
import pandas as pd
import os

if __name__ == "__main__":
    # Attach per-sample annotations from the epi4k overlap BED to a call
    # table, keeping only rows present in both (pandas inner join).
    cli = argparse.ArgumentParser()
    cli.add_argument("--call_file", action="store", required=True)
    cli.add_argument("--outfile", "-o", action="store", required=True)
    options = cli.parse_args()

    call_table = CallTable(options.call_file)
    sample_info = pd.read_csv(
        "/net/eichler/vol20/projects/epi4k/nobackups/araja/epi4k_exome/xhmm/DATA/conifer_xhmm_overlap_epp.bed",
        sep="\t")
    # pd.merge with no explicit keys joins on the columns the frames share.
    call_table.calls = pd.merge(sample_info, call_table.calls)
    call_table.save(options.outfile)
from conifertools import CallTable
import argparse

if __name__ == "__main__":
    # Cluster calls into CNVRs with the given gamma and cophenetic cutoff.
    cli = argparse.ArgumentParser()
    cli.add_argument("--infile", "-i", action="store", required=True)
    cli.add_argument("--outfile", "-o", action="store", required=True)
    cli.add_argument("--gamma", type=float, action="store", required=False, default=0.9)
    cli.add_argument("--cophenetic_cutoff", type=float, action="store", required=False, default=0.85)
    options = cli.parse_args()

    # Both tuning parameters are fractions; reject values above 1.
    assert options.gamma <= 1, "Gamma must be <= 1.00"
    assert options.cophenetic_cutoff <= 1, "All cophenetic cutoffs must be <= 1.00"

    table = CallTable(options.infile)
    clustered = table.clusterCalls(gamma=options.gamma,
                                   cophenetic_cutoff=options.cophenetic_cutoff)
    clustered.save(options.outfile)
# Chunk of a family-plotting script: imports (argparse, CallTable, ...) are
# earlier in the file, and the final `for` loop's body continues past this
# chunk -- code below is left untouched, comments only.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--conifer_file", action="store", required=True)
    parser.add_argument("--call_file", action="store", required=True)
    parser.add_argument("--out_dir", action="store", required=True)
    parser.add_argument("--min_freq", type=int, action="store", required=False, default=0)
    parser.add_argument("--max_freq", type=int, action="store", required=False, default=30)
    # --cnvrID: optional explicit list of CNVR ids to plot (None = all)
    parser.add_argument("--cnvrID", type=int, nargs="*", action="store", required=False, default=None)
    parser.add_argument("--cohort", action="store", required=False, default = "SSC")
    args = parser.parse_args()
    INHERITED_CODES = ['fa_to_both', 'fa_to_pro', 'fa_to_sib', 'mo_to_both', 'mo_to_pro', 'mo_to_sib']
    calls = CallTable(args.call_file)
    # Drop the generic cnvrID column; the cohort-specific cnvrID_<cohort>
    # column is what gets filtered on below.
    del calls.calls["cnvrID"]# = calls.calls["cnvrID_%s" % args.cohort]
    # --cnvrID is a list only when supplied on the command line (default None).
    if isinstance(args.cnvrID, list):
        calls = calls.filter(lambda x: x["cnvrID_%s" % args.cohort] in args.cnvrID)
    # Keep calls whose cohort CNVR frequency lies in [min_freq, max_freq).
    calls = calls.filter(lambda x: (x["cnvr_frequency_%s" % args.cohort] >= args.min_freq) & (x["cnvr_frequency_%s" % args.cohort] < args.max_freq))
    # NOTE(review): split(".")[:-1] yields a *list* of leading components, not
    # a string; the sibling inheritance script uses split(".")[0] -- confirm
    # which is intended.
    calls.calls["familyID"] = map(lambda x: x.split(".")[:-1], calls.calls["sampleID"])
    # Partition by the relationship suffix of the sample ID
    # (p1 = proband, s1-s3 = siblings, mo/fa = parents).
    offspring_calls = calls.filter(lambda x: x["sampleID"].endswith(("p1","s1","s2","s3")))
    parent_calls = calls.filter(lambda x: x["sampleID"].endswith(("mo","fa")))
    sibling_calls = calls.filter(lambda x: x["sampleID"].endswith(("s1", "s2", "s3")))
    plotters = {}
    colors = {"fa": "b", "mo": "b", "sib": "g", "pro": "r"}
    for rel, codes in zip(["pro", "sib", "mo", "fa"], [["p1", "p2"], ["s1", "s2", "s3"], ["mo"], ["fa"]]):
        # create a plotter for each family member
from conifertools import ConiferPipeline, CallTable
import argparse
import numpy as np

if __name__ == "__main__":
    # Concatenate any number of call files into a single saved CallTable.
    cli = argparse.ArgumentParser()
    cli.add_argument("input_call_files", nargs="+")
    cli.add_argument("--outfile", "-o", action="store", required=True)
    options = cli.parse_args()

    combined = CallTable()
    for call_path in options.input_call_files:
        combined.appendCalls(CallTable(call_path))
    combined.save(options.outfile)
from conifertools import CallTable
import argparse

if __name__ == "__main__":
    # Cluster calls into CNVRs with the given gamma and cophenetic cutoff.
    parser = argparse.ArgumentParser()
    parser.add_argument("--infile", "-i", action="store", required=True)
    parser.add_argument("--outfile", "-o", action="store", required=True)
    parser.add_argument("--gamma", type=float, action="store", required=False, default=0.9)
    parser.add_argument("--cophenetic_cutoff", type=float, action="store", required=False, default=0.85)
    args = parser.parse_args()
    assert args.gamma <= 1, "Gamma must be <= 1.00"
    # BUG FIX: --cophenetic_cutoff is a scalar float (type=float, no nargs),
    # so max(args.cophenetic_cutoff) raised TypeError; compare the value
    # directly, as the sibling clustering script does.
    assert args.cophenetic_cutoff <= 1, "All cophenetic cutoffs must be <= 1.00"
    calls = CallTable(args.infile)
    calls = calls.clusterCalls(gamma=args.gamma, cophenetic_cutoff=args.cophenetic_cutoff)
    calls.save(args.outfile)
# Chunk of an inheritance-testing script: the function whose `return` opens
# this chunk begins on an earlier (unseen) line, and the final loop body
# continues past the end -- code is untouched, comments only.

    # For each parent: (CDF of the transmission statistic, median parent RPKM
    # over the call's exons). `tdist`, `p_from_mother`/`p_from_father`,
    # `data`, `exon_start`/`exon_stop` belong to the enclosing (unseen)
    # function scope.
    return {"mo": (tdist.cdf(p_from_mother), np.median(data["mo"].rpkm[exon_start:exon_stop+1])),
            "fa": (tdist.cdf(p_from_father), np.median(data["fa"].rpkm[exon_start:exon_stop+1]))}

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--conifer_file", action="store", required=True)
    parser.add_argument("--call_file", action="store", required=True)
    parser.add_argument("--outfile", "-o", action="store", required=True)
    # NOTE(review): no type= given, so these stay strings unless cast by the
    # consumer (sample_size is cast below) -- confirm threshold's use.
    parser.add_argument("--threshold", default=0.99)
    parser.add_argument("--sample_size", default=None)
    args = parser.parse_args()
    p = ConiferPipeline(args.conifer_file)
    calls = CallTable(args.call_file)
    # familyID = text before the first "." of the sample ID
    calls.calls["familyID"] = map(lambda x: x.split('.')[0], calls.calls["sampleID"])
    new_calls = CallTable()
    # Split offspring from parents by the sample-ID suffix.
    offspring_calls = calls.filter(lambda x: x["sampleID"].endswith(("p", "s", "p1"))).calls
    #offspring_calls = calls.filter(lambda x: x["sampleID"][6] in ["p","s"]).calls
    parent_calls = calls.filter(lambda x: x["sampleID"].endswith(("m", "f", "mo", "fa"))).calls
    #parent_calls = calls.filter(lambda x: x["sampleID"][6] in ["m","f"]).calls
    # Effective cohort size: CLI override or every sample in the pipeline.
    if args.sample_size:
        sample_size = int(args.sample_size)
    else:
        sample_size = len(p.samples)
    print sample_size
    total_calls = len(offspring_calls)
    cnt = 0
    for ix, c in offspring_calls.iterrows():
from conifertools import ConiferPipeline, CallTable, CallFilterTemplate
import argparse

if __name__ == "__main__":
    # Annotate calls with overlapping RefSeq gene names and save a
    # fixed column subset.
    cli = argparse.ArgumentParser()
    cli.add_argument("conifer_file")
    cli.add_argument("call_file")
    cli.add_argument("out_file")
    options = cli.parse_args()

    pipeline = ConiferPipeline(options.conifer_file)
    refseq_annotation = CallFilterTemplate(
        pipeline,
        "/net/eichler/vol8/home/nkrumm/REFERENCE_INFO/hg19.refGene.bed",
        name="RefSeq",
        filter_type="name")

    table = CallTable(options.call_file)
    table = table.annotate(refseq_annotation)

    output_columns = ["cnvrID_SSC", "sampleID", "chromosome", "start",
                      "stop", "state", "size_bp", "cnvr_frequency_SSC",
                      "cohort", "median_svdzrpkm", "num_probes", "RefSeq"]
    table.save(options.out_file, cols=output_columns)
from conifertools import CallTable
import argparse

if __name__ == "__main__":
    # Export calls (optionally a column subset) sorted by genomic
    # position as a tab-separated file.
    cli = argparse.ArgumentParser()
    cli.add_argument("--call_files", nargs="+", action="store", required=True)
    cli.add_argument("--outfile", action="store", required=True)
    cli.add_argument("--cols", nargs="+", action="store", default=[], required=False)
    options = cli.parse_args()

    table = CallTable(options.call_files)
    # Subset columns only when --cols was given on the command line.
    frame = table.calls[options.cols] if len(options.cols) > 0 else table.calls
    frame.sort(["chromosome", "start"]).to_csv(options.outfile, sep="\t")
# Chunk of a filtering script: the CallFilterTemplate(...) call that opens
# this chunk begins on an earlier (unseen) line; `p`, `args`, SDFilter,
# PPGFilter, SDCount and PPG_probe_count are also defined earlier in the
# file. Code is untouched, comments only.
    p, "/net/eichler/vol8/home/nkrumm/REFERENCE_INFO/3copiesin27of34.bed",
    name="Dup_overlap",
    filter_type="overlap",
    # keep calls whose overlap with this duplication track is < 50%
    func=lambda x: x < 0.5)

GeneAnnotation = CallFilterTemplate(
    p,
    "/net/eichler/vol8/home/nkrumm/REFERENCE_INFO/hg19.refGene.bed",
    name="RefSeq",
    filter_type="name")


def signalFilter(x):
    # Calls supported by fewer probes must show a stronger median signal.
    if x["num_probes"] <= 2:
        return np.abs(x["median_svdzrpkm"]) >= 1.5
    elif x["num_probes"] <= 5:
        return np.abs(x["median_svdzrpkm"]) >= 1
    else:
        return np.abs(x["median_svdzrpkm"]) >= 0.5


# Apply signal/probability thresholds plus reference-track filters, then
# annotate and save.
calls = CallTable(args.call_file)
calls = calls.filter(signalFilter)\
    .filter(lambda x: x["probability"] > 0.99)\
    .filter(SDFilter)\
    .filter(PPGFilter)\
    .filter(OtherDupFilter)\
    .annotate(SDCount)\
    .annotate(PPG_probe_count)
calls.save(args.outfile)