def genotype_DTS_regions(dts_list_file, regions_file, contigs, window):
    """
    Get copy number from each DenseTrackSet for each region in the given file.

    This is a generator. The first item yielded is a header row; every
    following item is one row per region, grouped by chromosome.

    Parameters:
        dts_list_file: path to a text file listing one DTS path per line.
            Blank lines are ignored.
        regions_file: path handed to ``get_regions`` together with the set
            of supported contig names.
        contigs: path to a tab-delimited contigs file whose first column is
            the contig name. It is also passed through to ``wnd_cp_indiv``.
        window: window size passed through to ``wnd_cp_indiv``.

    Yields:
        list: ``["chromosome", "start", "end"] + sample_names`` first, then
        ``[chromosome, start, end, cp_sample_1, ...]`` with every field
        converted to ``str``. Sample columns follow the sorted order of the
        DTS paths.
    """
    # Load supported contigs (first tab-delimited column of every line).
    with open(contigs, "r") as fh:
        supported_contigs = {line.strip().split("\t")[0] for line in fh}

    regions_by_chromosome = get_regions(regions_file, supported_contigs)

    # Load the DTS file list, skipping blank lines so that a trailing
    # newline does not turn into an empty path.
    with open(dts_list_file, "r") as fh:
        dts_list = sorted(path for path in (line.strip() for line in fh) if path)

    # Sample names are the basenames of the sorted DTS paths.
    sample_names = [os.path.basename(dts) for dts in dts_list]

    yield ["chromosome", "start", "end"] + sample_names

    # .items() instead of .iteritems() keeps this working on Python 2 and 3.
    for chromosome, regions in regions_by_chromosome.items():
        starts = regions["starts"]
        ends = regions["ends"]

        # Keep the results in a list parallel to dts_list / sample_names
        # rather than in a dict keyed by basename: two DTS files that share
        # a basename would otherwise overwrite each other and both columns
        # would report the values of the later file.
        # NOTE(review): each DTS is re-opened once per chromosome, as in the
        # original; hoisting would keep every DTS open at once - confirm
        # memory/file-handle budget before changing.
        copies_per_sample = [
            wnd_cp_indiv(dts, contigs, window).get_cp_by_regions(
                chromosome, starts, ends
            )
            for dts in dts_list
        ]

        for i in range(len(starts)):
            yield [chromosome, str(starts[i]), str(ends[i])] + [
                str(copies[i]) for copies in copies_per_sample
            ]
parser.add_argument('--window_size', type=int, default=None,
                    help='Size of SUNK/wssd sliding windows')
parser.add_argument('--min_ref_cp_delta', dest='min_d', type=float, default=0,
                    help='Smallest difference in cp between ref and sample to consider (Default: %(default)s)')
parser.add_argument('--no_P_value_adjust', dest='P_adjust', action='store_false')
parser.add_argument('--min_mu', default=0.5, type=float,
                    help='Minimum cluster mean (Default: %(default)s)')
parser.add_argument('--subset_indivs', default=None,
                    help='Colon-separated list of individuals to consider (Default: %(default)s)')
o = parser.parse_args()

# Optional colon-separated subset of individuals, de-duplicated.
# The set() round trip does not preserve the order given on the command line.
subset_indivs = o.subset_indivs
if subset_indivs is not None:
    subset_indivs = list(set(subset_indivs.split(":")))

# Windowed copy-number track of the individual being genotyped; its id is
# the DTS file name with the literal "500_bp_" removed.
indiv_DTS = wnd_cp_indiv(o.fn_indiv_DTS, o.fn_contigs, o.window_size)
indiv_id = o.fn_indiv_DTS.split("/")[-1].replace("500_bp_","")

# One dCGH comparison and one windowed copy-number track per reference DTS,
# both keyed by the reference id (derived the same way as indiv_id).
ref_DTSs = {}
dCGHs = {}
for fn_ref in o.fn_ref_DTS.split(":"):
    # Compute the key once instead of once per dictionary.
    ref_id = fn_ref.split("/")[-1].replace("500_bp_","")
    dCGHs[ref_id] = dCGH(o.fn_indiv_DTS, fn_ref, o.fn_contigs, o.window_size)
    ref_DTSs[ref_id] = wnd_cp_indiv(fn_ref, o.fn_contigs, o.window_size)

call_table = cluster.indiv_callset_table(o.fn_call_table)
# Command-line options: input DTS file, contigs file, window size and
# output locations / naming.
opts.add_option('','--fn_DTS',dest='fn_DTS', default=None)
opts.add_option('','--contigs',dest='fn_contigs', default=None)
opts.add_option('','--wnd_size',dest='wnd_size', type=int, default=None)
#opts.add_option('','--wnd_slide',dest='wnd_slide', type=int, default=None)
opts.add_option('','--out_dir',dest='out_dir')
opts.add_option('','--fn_out',dest='fn_out')
opts.add_option('','--contig_prefix',dest='contig_prefix', default="")
opts.add_option('','--DTS_prefix',dest='DTS_prefix', default="500_bp_")
opts.add_option('','--output_contigs',dest='output_contigs', default="/net/eichler/vol7/home/psudmant/genomes/contigs/hg19_contigs.txt")
(o, args) = opts.parse_args()

#usage, init, then run
# Individual name: file name of the DTS with the literal "500_bp_" removed.
# NOTE(review): the hard-coded "500_bp_" is used here rather than
# o.DTS_prefix; o.DTS_prefix is not referenced in the code visible here -
# confirm whether that is intended.
indiv = o.fn_DTS.split("/")[-1].replace("500_bp_","")
wnd_cp = wnd_cp_indiv(o.fn_DTS, o.fn_contigs, o.wnd_size)
# The bare string below is a no-op expression used as a note on the output
# record layout.
""" outstr: chr start end indiv 0 0 0 color """
# presumably a writer/formatter for the output contigs; `output` is defined
# elsewhere - verify its contract there.
c_out = output(o.contig_prefix, o.output_contigs)
for contig in wnd_cp.contigs:
    # Progress message on stderr (Python 2 print-to-file statement).
    print >>stderr, contig
    cps = wnd_cp.get_cps_by_chr(contig)
    wnd_starts, wnd_ends = wnd_cp.get_wnds_by_chr(contig)
    # Reset for every contig.
    prev_start = 0
    # Visits every window index except the last one (stops at shape[0]-1).
    # NOTE(review): reason for skipping the final window is not visible here.
    for i in xrange(0, cps.shape[0]-1):
        s, e = wnd_starts[i], wnd_ends[i]
def init_from_DTS(cls, **kwargs):
    """
    Build an instance for one contig by loading windowed copy number for
    every individual from its regular and SUNK DTS files.

    requires the below inputs (all passed as keywords; a missing one raises
    KeyError)

    gglob.init_from_DTS(DTS_dir = DTS_dir,
                        DTS_prefix = DTS_prefix,
                        sunk_DTS_dir = sunk_DTS_dir,
                        sunk_DTS_prefix = sunk_DTS_prefix,
                        wnd_size = wnd_size,
                        wnd_slide = wnd_slide,
                        indivs = indivs,
                        contig = contig,
                        fn_contigs = fn_contigs,
                        fn_sunk_contigs = fn_sunk_contigs)

    Each DTS path is "<dir>/<prefix><indiv>". ``indivs`` must be non-empty:
    the window coordinates are taken from its first entry.

    Returns:
        cls(...) constructed with indivs, wnd_size, wnd_slide, contig, the
        window start/end arrays and an (n_indivs x n_windows) copy-number
        matrix, for both the regular and the SUNK tracks.
    """
    DTS_dir = kwargs["DTS_dir"]
    DTS_prefix = kwargs["DTS_prefix"]
    sunk_DTS_dir = kwargs["sunk_DTS_dir"]
    sunk_DTS_prefix = kwargs["sunk_DTS_prefix"]
    wnd_size = kwargs['wnd_size']
    wnd_slide = kwargs['wnd_slide']
    indivs = kwargs['indivs']
    contig = kwargs['contig']
    fn_contigs = kwargs['fn_contigs']
    fn_sunk_contigs = kwargs['fn_sunk_contigs']

    DTS_pre = "%s/%s" % (DTS_dir, DTS_prefix)
    # Fix: the SUNK path previously reused DTS_prefix, leaving the required
    # sunk_DTS_prefix argument silently ignored.
    sunk_DTS_pre = "%s/%s" % (sunk_DTS_dir, sunk_DTS_prefix)

    n_indivs = len(indivs)

    # The first individual only supplies the window coordinates, which also
    # fix the number of matrix columns; every other individual must yield
    # the same number of windows for the row assignments below to succeed.
    rand_wnd_cp = wnd_cp_indiv("%s%s" % (DTS_pre, indivs[0]), fn_contigs,
                               wnd_size)
    wnd_starts, wnd_ends = rand_wnd_cp.get_wnds_by_chr(contig)
    cp_matrix = np.zeros((n_indivs, wnd_starts.shape[0]))

    rand_sunk_wnd_cp = wnd_cp_indiv("%s%s" % (sunk_DTS_pre, indivs[0]),
                                    fn_sunk_contigs, wnd_size)
    sunk_wnd_starts, sunk_wnd_ends = rand_sunk_wnd_cp.get_wnds_by_chr(contig)
    sunk_cp_matrix = np.zeros((n_indivs, sunk_wnd_starts.shape[0]))

    # The `correct` flag is switched off for the sex chromosomes.
    # NOTE(review): what the correction does is defined in
    # wnd_cp_indiv.get_cps_by_chr - confirm there.
    correct = contig not in ("chrY", "chrX")

    for i, indiv in enumerate(indivs):
        print(indiv, file=stderr)  # progress message
        wnd_cp = wnd_cp_indiv("%s%s" % (DTS_pre, indiv), fn_contigs, wnd_size)
        cp_matrix[i, :] = wnd_cp.get_cps_by_chr(contig, correct=correct)

        sunk_wnd_cp = wnd_cp_indiv("%s%s" % (sunk_DTS_pre, indiv),
                                   fn_sunk_contigs, wnd_size)
        sunk_cp_matrix[i, :] = sunk_wnd_cp.get_cps_by_chr(contig,
                                                          correct=correct)

    return cls(indivs=indivs,
               wnd_size=wnd_size,
               wnd_slide=wnd_slide,
               contig=contig,
               wnd_starts=wnd_starts,
               wnd_ends=wnd_ends,
               cp_matrix=cp_matrix,
               sunk_wnd_starts=sunk_wnd_starts,
               sunk_wnd_ends=sunk_wnd_ends,
               sunk_cp_matrix=sunk_cp_matrix)
def init_from_DTS(cls, **kwargs):
    """
    Load windowed copy number for one contig from the regular and SUNK DTS
    files of every individual and construct an instance from the result.

    requires the below inputs (keyword-only; KeyError if one is missing)

    gglob.init_from_DTS(DTS_dir = DTS_dir,
                        DTS_prefix = DTS_prefix,
                        sunk_DTS_dir = sunk_DTS_dir,
                        sunk_DTS_prefix = sunk_DTS_prefix,
                        wnd_size = wnd_size,
                        wnd_slide = wnd_slide,
                        indivs = indivs,
                        contig = contig,
                        fn_contigs = fn_contigs,
                        fn_sunk_contigs = fn_sunk_contigs)

    DTS files are looked up as "<dir>/<prefix><indiv>". ``indivs`` must
    contain at least one entry, because its first element provides the
    window coordinates.

    Returns:
        cls(...) built from indivs, wnd_size, wnd_slide, contig, plus window
        starts/ends and an (n_indivs x n_windows) copy-number matrix for the
        regular track and again for the SUNK track.
    """
    DTS_dir = kwargs["DTS_dir"]
    DTS_prefix = kwargs["DTS_prefix"]
    sunk_DTS_dir = kwargs["sunk_DTS_dir"]
    sunk_DTS_prefix = kwargs["sunk_DTS_prefix"]
    wnd_size = kwargs['wnd_size']
    wnd_slide = kwargs['wnd_slide']
    indivs = kwargs['indivs']
    contig = kwargs['contig']
    fn_contigs = kwargs['fn_contigs']
    fn_sunk_contigs = kwargs['fn_sunk_contigs']

    DTS_pre = "%s/%s" % (DTS_dir, DTS_prefix)
    # Fix: build the SUNK path from sunk_DTS_prefix. DTS_prefix was used
    # here before, so the required sunk_DTS_prefix argument had no effect.
    sunk_DTS_pre = "%s/%s" % (sunk_DTS_dir, sunk_DTS_prefix)

    n_indivs = len(indivs)

    # Window coordinates come from the first individual and determine the
    # matrix width; the row assignments below require every individual to
    # produce the same number of windows.
    rand_wnd_cp = wnd_cp_indiv("%s%s" % (DTS_pre, indivs[0]),
                               fn_contigs,
                               wnd_size)
    wnd_starts, wnd_ends = rand_wnd_cp.get_wnds_by_chr(contig)
    cp_matrix = np.zeros((n_indivs, wnd_starts.shape[0]))

    rand_sunk_wnd_cp = wnd_cp_indiv("%s%s" % (sunk_DTS_pre, indivs[0]),
                                    fn_sunk_contigs,
                                    wnd_size)
    sunk_wnd_starts, sunk_wnd_ends = rand_sunk_wnd_cp.get_wnds_by_chr(contig)
    sunk_cp_matrix = np.zeros((n_indivs, sunk_wnd_starts.shape[0]))

    # `correct` is disabled for chrX / chrY.
    # NOTE(review): the meaning of the correction lives in
    # wnd_cp_indiv.get_cps_by_chr - confirm there.
    correct = contig not in ("chrY", "chrX")

    for i, indiv in enumerate(indivs):
        # Progress message. stderr.write emits the same text as the former
        # `print >> stderr, indiv` and also runs under Python 3.
        stderr.write("%s\n" % (indiv,))

        wnd_cp = wnd_cp_indiv("%s%s" % (DTS_pre, indiv),
                              fn_contigs,
                              wnd_size)
        cp_matrix[i, :] = wnd_cp.get_cps_by_chr(contig, correct=correct)

        sunk_wnd_cp = wnd_cp_indiv("%s%s" % (sunk_DTS_pre, indiv),
                                   fn_sunk_contigs,
                                   wnd_size)
        sunk_cp_matrix[i, :] = sunk_wnd_cp.get_cps_by_chr(contig,
                                                          correct=correct)

    return cls(indivs=indivs,
               wnd_size=wnd_size,
               wnd_slide=wnd_slide,
               contig=contig,
               wnd_starts=wnd_starts,
               wnd_ends=wnd_ends,
               cp_matrix=cp_matrix,
               sunk_wnd_starts=sunk_wnd_starts,
               sunk_wnd_ends=sunk_wnd_ends,
               sunk_cp_matrix=sunk_cp_matrix)
# Output naming options and the contigs file used for output.
opts.add_option('', '--fn_out', dest='fn_out')
opts.add_option('', '--contig_prefix', dest='contig_prefix', default="")
opts.add_option('', '--DTS_prefix', dest='DTS_prefix', default="500_bp_")
opts.add_option(
    '',
    '--output_contigs',
    dest='output_contigs',
    default=
    "/net/eichler/vol27/projects/human_diversity/nobackups/hsiehph/genomicData/EEElab/read_depth_genotyper/hg19_contigs.txt"
)
(o, args) = opts.parse_args()

#usage, init, then run
# Individual name: file name of the DTS with the literal "500_bp_" removed.
# NOTE(review): the hard-coded "500_bp_" is used here rather than
# o.DTS_prefix; o.DTS_prefix is not referenced in the code visible here -
# confirm whether that is intended.
indiv = o.fn_DTS.split("/")[-1].replace("500_bp_", "")
wnd_cp = wnd_cp_indiv(o.fn_DTS, o.fn_contigs, o.wnd_size)
# The bare string below is a no-op expression used as a note on the output
# record layout.
""" outstr: chr start end indiv 0 0 0 color """
# presumably a writer/formatter for the output contigs; `output` is defined
# elsewhere - verify its contract there.
c_out = output(o.contig_prefix, o.output_contigs)
for contig in wnd_cp.contigs:
    # Progress message on stderr.
    print(contig, file=stderr)
    cps = wnd_cp.get_cps_by_chr(contig)
    wnd_starts, wnd_ends = wnd_cp.get_wnds_by_chr(contig)
    # Reset for every contig.
    prev_start = 0
    # Visits every window index except the last one (stops at shape[0] - 1).
    # NOTE(review): reason for skipping the final window is not visible here.
    for i in range(0, cps.shape[0] - 1):
        s, e = wnd_starts[i], wnd_ends[i]
# Contigs to process: those whose name contains none of "random", "X", "Y".
chrs = [c for c in tbx_gc.contigs
        if "random" not in c and "X" not in c and "Y" not in c]

fn_contigs = o.fn_contigs
wnd = int(o.window_size)

## Disabled dCGH-based setup, kept for reference:
#cutoff_scale=float(bp_cutoff_scale/wnd)
#max_merge = o.max_merge_dif
#cp_data = dCGH(o.fn_in_DTS,o.fn_ref_DTS,o.fn_contigs,wnd)
#segment_callset = callset()
#caller_by_chr = {}
#dCGH(o.fn_in_DTS,o.fn_ref_DTS,o.fn_contigs,wnd)

# Windowed copy number for the input DTS, the GC DenseTrackSet opened
# read-only, and the accumulator for the null distribution.
cp_data = wnd_cp_indiv(o.fn_in_DTS, o.fn_contigs, wnd)
GC_DTS = DenseTrackSet(o.fn_contigs, o.fn_GC_DTS, overwrite=False, openMode='r')
null_dist = null_distribution(tbx_gc)

for chr in chrs:
    # Progress message; stderr.write emits the same text as the former
    # Python 2-only `print >>stderr` and also runs under Python 3.
    stderr.write("%s...\n" % chr)

    magnitude_vect = cp_data.get_cps_by_chr(chr)
    starts_vect, ends_vect = cp_data.get_wnds_by_chr(chr)

    #plot_GC(chr,tbx_gc,magnitude_vect,starts_vect,ends_vect)
    #print magnitude_vect[0:1000]

    # Windows overlapping the gap and segmental-duplication tracks are
    # handed to null_dist.add alongside the copy-number values.
    # NOTE(review): presumably add() masks these windows out - confirm in
    # null_distribution.add.
    gapped_wnds = cp_data.get_overlapping_wnds(chr, tbx_gaps)
    segdup_wnds = cp_data.get_overlapping_wnds(chr, tbx_dups)
    null_dist.add(magnitude_vect, [gapped_wnds, segdup_wnds])
parser.add_argument(
    '--subset_indivs',
    default=None,
    help='Colon-separated list of individuals to consider (Default: %(default)s)')
o = parser.parse_args()

# Optional colon-separated subset of individuals, de-duplicated.
# The set() round trip does not preserve the order given on the command line.
subset_indivs = o.subset_indivs
if subset_indivs is not None:
    subset_indivs = list(set(subset_indivs.split(":")))

# Windowed copy-number track of the individual being genotyped; its id is
# the DTS file name with the literal "500_bp_" removed.
indiv_DTS = wnd_cp_indiv(o.fn_indiv_DTS, o.fn_contigs, o.window_size)
indiv_id = o.fn_indiv_DTS.split("/")[-1].replace("500_bp_", "")

# One dCGH comparison and one windowed copy-number track per reference DTS,
# both keyed by the reference id (derived the same way as indiv_id).
ref_DTSs = {}
dCGHs = {}
for fn_ref in o.fn_ref_DTS.split(":"):
    # Compute the key once instead of once per dictionary.
    ref_id = fn_ref.split("/")[-1].replace("500_bp_", "")
    dCGHs[ref_id] = dCGH(o.fn_indiv_DTS, fn_ref, o.fn_contigs,
                         o.window_size)
    ref_DTSs[ref_id] = wnd_cp_indiv(fn_ref, o.fn_contigs, o.window_size)

call_table = cluster.indiv_callset_table(o.fn_call_table)

# Optionally restrict the call table to a single chromosome.
if o.limit_to_chr:
    call_table.filter_by_chr(o.limit_to_chr)