def __init__(self, data, snp, sample_id):
    '''Construct a genotype set from data arrays:
    - snp: SNP metadata record array (contains chromosome, name, genetic distance in morgans,
      base-pair location)
    - data: a 3-D genotype data array: (SNP x individual x allele)
    - sample_id: genotyped individuals' ID set'''
    # People's IDs
    self.sample_id = sample_id
    self.data = data
    self._num_snps = self.data.shape[0]
    self._num_samples = self.data.shape[1]
    self._snp_range = None

    # SNP metadata: SNP label, chromosome number, genetic distance in morgans, and
    # base-pair location for each SNP
    self.snp = snp

    # Base-pair-location to SNP-index map, lazily-initialized + cached
    base_pair = self.snp['base_pair']
    self._base_pair = base_pair  # np.array([int(base_pair)]) if base_pair.size == 1 else base_pair
    self._bp_to_snp = dict_invert(dict(enumerate(self._base_pair)))
    # Construct a BST for fast bp queries
    self._snp_tree = BinarySearchTree(values=self._base_pair[optimal_insertion_order(self._num_snps)])
    self._snp_index_tree = util.list_index_tree(self._base_pair)

    # A genetic map: lists the two allele letters corresponding to 1 and 2 for each SNP,
    # according to their order in the self.snp array.
    self.map = []

    # General metadata, for easy handling of CGI data
    self.metadata = []

    # Samples for which the parent-of-origin phase is determined
    self.poo_phase = np.zeros((self._num_samples,), dtype=np.byte)
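# --- Usage sketch (assumption-laden, not part of the original code) ---
# The constructor above builds two lookup structures over SNP base-pair positions:
# an exact map (_bp_to_snp) and a search tree for left-neighbor queries. The sketch
# below reproduces that idea with plain numpy (np.searchsorted) and a dict as
# stand-ins for the project's dict_invert / BinarySearchTree / util.list_index_tree
# helpers, whose exact APIs are not shown in this snippet.
import numpy as np

base_pair = np.array([1000, 5000, 12000, 20000])        # sorted SNP base-pair positions
bp_to_snp = {bp: i for i, bp in enumerate(base_pair)}   # exact bp -> SNP index
assert bp_to_snp[12000] == 2

def left_neighbor_snp(bp, base_pair=base_pair):
    '''Index of the last SNP whose base-pair position is <= bp (-1 if none).'''
    return int(np.searchsorted(base_pair, bp, side='right')) - 1

assert left_neighbor_snp(13500) == 2   # falls between SNPs 2 and 3
assert left_neighbor_snp(999) == -1    # before the first SNP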
def pipeline_monogenic_validation(work_dir=os.environ['OBER_OUT'] + '/requests/monogenic/work',
                                  index_segments_dir=os.environ['OBER_OUT'] + '/requests/monogenic/work/index_segments',
                                  region_size=100,
                                  theta_affinity=0.95,
                                  theta_weight=0.5,
                                  regenerate_segments=True,
                                  snps=None,  # np.array([6, 8]),
                                  debug=1,
                                  debug_sample=512):
    # Load SNPs
    problem = im.io.read_plink(prefix=work_dir + '/monogenic.12', pedigree=im.itu.HUTT_PED,
                               haplotype=None, frames=None)

    # Testing: simulate aligned-samples output (hap types should be 2 in the imputed genotype output line)
    problem.haplotype.poo_phase = np.zeros((problem.num_samples,), dtype=np.byte)
    problem.haplotype.poo_phase[np.array([0, 1])] = 1
    problem.haplotype.poo_phase[np.array([2, 3])] = -1

    # Create segments only for the regions around each SNP
    if regenerate_segments:
        for row in (problem.info.snp[snps] if snps is not None else problem.info.snp):
            # Find the SNP's region (the one containing its base-pair position)
            chrom, bp = row['chrom'], row['base_pair']
            phasing_dir = '%s/phasing/chr%d' % (os.environ['OBER_OUT'], chrom)
            index_segments_chrom_dir = '%s/chr%d' % (index_segments_dir, chrom)
            info_file = '%s/hutt.phased.info.npz' % (phasing_dir,)
            info = im.io.read_info_npz(info_file)
            snp_bp = info.snp['base_pair']
            snp_index = util.nearest_neighbor_in_list_tree(bp, snp_bp, util.list_index_tree(snp_bp))
            snp_index = snp_index if snp_bp[snp_index] <= bp else snp_index - 1
            start = region_size * (snp_index // region_size)  # Integer division: round down to region boundary
            stop = start + region_size
            segment_file = '%s/segments-%d-%d.out' % (index_segments_chrom_dir, start, stop)
            if not os.path.exists(segment_file):
                util.mkdir_if_not_exists(index_segments_chrom_dir)
                util.run_command('find-segments-of-snp-range %d %d < %s/segments.out > %s' %
                                 (start, stop, phasing_dir, segment_file))
            # Index segments
            if regenerate_segments or \
                    not os.path.exists('%s/metadata.npz' % (index_segments_chrom_dir,)) or \
                    not os.path.exists('%s/region-%d.npz' % (index_segments_chrom_dir, start)):
                index_segments_beagle.main(segment_file, info_file, segment_file, index_segments_chrom_dir,
                                           snp_index=snp_index, debug=2,
                                           theta_affinity=theta_affinity, theta_weight=theta_weight)

    # Impute using the newly generated segment index
    _, t = im.v.iv.impute_problem(problem, debug=debug, remove_partial_calls=True,
                                  segment_location=index_segments_dir,  # if regenerate_segments else None,
                                  snps=snps, debug_sample=debug_sample)
    im.io.write_plink(im.Problem(genotype=t.imputed, pedigree=im.examples.hutt_pedigree(),
                                 haplotype=None, frames=None),
                      work_dir + '/imputed.12', save_frames=False, save_haplotype=False)
    im.cgi.io_cgi.write_imputed(t, sys.stdout, poo_phase=problem.haplotype.poo_phase)
    with open(work_dir + '/imputed.12.lgen', 'wb') as f:
        im.cgi.io_cgi.write_imputed_lgen(t, f)
    return t
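# --- Worked example (assumption-laden, not part of the original pipeline) ---
# The loop above buckets each target SNP into a fixed-size region of the phased index:
# it finds the left-neighboring phased SNP of the target base-pair position, then rounds
# that SNP index down to a multiple of region_size to get the [start, stop) region whose
# segment file is (re)generated. np.searchsorted is used here as a stand-in for
# util.nearest_neighbor_in_list_tree / util.list_index_tree.
import numpy as np

region_size = 100
snp_bp = np.arange(0, 50000, 37)            # fake sorted phased-SNP base-pair positions
bp = 12345                                  # target variant's base-pair position
snp_index = int(np.searchsorted(snp_bp, bp, side='right')) - 1   # left-neighboring phased SNP
start = region_size * (snp_index // region_size)                 # region start (SNP index)
stop = start + region_size                                       # region end (exclusive)
segment_file = 'segments-%d-%d.out' % (start, stop)
print(snp_index, start, stop, segment_file)   # 333 300 400 segments-300-400.out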
def _load_chrom(self, chrom):
    '''Load index of a new chromosome number, chrom.'''
    # Load index metadata
    self._chrom = chrom
    metadata = np.load('%s/chr%d/metadata.npz' % (self._index_dir, chrom))
    self._snp = metadata['snp']
    self._region_size = metadata['region_size']

    # Currently-active region is [start, stop)
    self._start = 0
    self._stop = 0
    self._region_num = -1

    # Construct a BST for fast queries of the left-neighboring SNP of a base-pair position
    base_pair = self._snp['base_pair']
    self._base_pair = np.array([int(base_pair)]) if base_pair.size == 1 else base_pair
    self._snp_index_tree = util.list_index_tree(self._base_pair)
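# --- Round-trip sketch (assumption-laden, not part of the original class) ---
# _load_chrom expects a per-chromosome metadata.npz holding at least a 'snp' record
# array (with a 'base_pair' field) and a 'region_size' scalar; the field layout below
# is a guess, except for the fields the code actually reads. The size-1 special case
# mirrors the normalization of base_pair into a 1-D int array.
import numpy as np

snp = np.array([(1, 'rs1', 0.0, 1000)],
               dtype=[('chrom', 'i4'), ('name', 'S16'), ('morgans', 'f8'), ('base_pair', 'i8')])
np.savez('metadata.npz', snp=snp, region_size=100)

metadata = np.load('metadata.npz')
base_pair = metadata['snp']['base_pair']
# A single-SNP chromosome yields a size-1 array; keep it 1-D so downstream lookups
# (util.list_index_tree in the original) always receive a sequence.
base_pair = np.array([int(base_pair)]) if base_pair.size == 1 else base_pair
print(base_pair, int(metadata['region_size']))   # [1000] 100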