def __init__(self, problem, fraction=None, test_index=None):
    '''Initialize an experiment to be run on a problem, clearing out 'fraction' of the data.

    Exactly one of 'fraction' and 'test_index' must be specified.
    If test_index is specified, these specific test indices are used; otherwise a random
    fraction is generated. If test_index = 'hap', data is read from problem.h (haplotype array).
    The entire array is considered as a test array, but nothing is zeroed out. Useful for
    phasing result stats.

    Raises ValueError if neither or both of 'fraction' and 'test_index' are specified.'''
    # Exactly one selector must be given (same truth table as the exclusive-or of "is given").
    if (fraction is None) == (test_index is None):
        raise ValueError('Must specify fraction or test_index')

    # Create a working copy of the problem. Only the genotype data is copied; the pedigree
    # object is shared with the original problem.
    self.problem = Problem(problem.pedigree, problem.genotype.copy())
    self.h = self.problem.h

    # Create test set; save original genotypes in g_orig
    if test_index is None:
        self.fraction = fraction
        self.g_orig, i = clear_random_portion(self.problem.genotype.data, fraction)
    elif isinstance(test_index, (str, type(u''))) and test_index == 'hap':
        # The type check keeps array-like index sets away from the string comparison, which is
        # element-wise (and not usable as a condition) for numpy arrays.
        # Don't clear anything; call everything a test index.
        h = problem.h
        i = tuple(util.flattened_meshgrid(range(h.shape[0]), range(h.shape[1])))
        self.g_orig = problem.g
        self.h = h
        self.fraction = 1.0
    else:
        self.g_orig, i = clear_index(self.problem.g, test_index)
        # Fraction of all (SNP, sample) entries that ended up in the test set
        self.fraction = (1.0 * i[0].size) / (self.h.shape[0] * self.h.shape[1])

    self.num_tests = i[0].size
    self.test_index = i
    self.r_orig = recode.recode_single_genotype(self.g_orig)
    self.fill = self.problem.fill_fraction()[:, SAMPLE]
    # Lazily-computed cache backing the recoded_genotype property
    self.__recode_single_genotype = None
def __init__(self, problem, fraction=None, test_index=None):
    '''Initialize an experiment to be run on a problem, clearing out 'fraction' of the data.

    Exactly one of 'fraction' and 'test_index' must be specified.
    If test_index is specified, these specific test indices are used; otherwise a random
    fraction is generated. If test_index = 'hap', data is read from problem.h (haplotype array).
    The entire array is considered as a test array, but nothing is zeroed out. Useful for
    phasing result stats.

    Raises ValueError if neither or both of 'fraction' and 'test_index' are specified.'''
    # Exactly one selector must be given (same truth table as the exclusive-or of "is given").
    if (fraction is None) == (test_index is None):
        raise ValueError('Must specify fraction or test_index')

    # Create a working copy of the problem. Only the genotype data is copied; the pedigree
    # object is shared with the original problem.
    self.problem = Problem(problem.pedigree, problem.genotype.copy())
    self.h = self.problem.h

    # Create test set; save original genotypes in g_orig
    if test_index is None:
        self.fraction = fraction
        self.g_orig, i = clear_random_portion(self.problem.genotype.data, fraction)
    elif isinstance(test_index, (str, type(u''))) and test_index == 'hap':
        # The type check keeps array-like index sets away from the string comparison, which is
        # element-wise (and not usable as a condition) for numpy arrays.
        # Don't clear anything; call everything a test index.
        h = problem.h
        i = tuple(util.flattened_meshgrid(range(h.shape[0]), range(h.shape[1])))
        self.g_orig = problem.g
        self.h = h
        self.fraction = 1.0
    else:
        self.g_orig, i = clear_index(self.problem.g, test_index)
        # Fraction of all (SNP, sample) entries that ended up in the test set
        self.fraction = (1.0 * i[0].size) / (self.h.shape[0] * self.h.shape[1])

    self.num_tests = i[0].size
    self.test_index = i
    self.r_orig = recode.recode_single_genotype(self.g_orig)
    self.fill = self.problem.fill_fraction()[:, SAMPLE]
    # Lazily-computed cache backing the recoded_genotype property
    self.__recode_single_genotype = None
def problem_hut():
    '''Return the hutterites data set as a Problem, loading it on first use.

    The loaded object is stored in Templates.PROBLEM_HUT and reused by later calls, since the
    data set is large.'''
    # Fast path: a previous call already populated the cache slot.
    if Templates.PROBLEM_HUT:
        return Templates.PROBLEM_HUT

    hut_pedigree = Templates.pedigree_hut()
    hut_genotype = io_genotype.read('plink', 'genotype', prefix=GENOTYPE_SAMPLE, load_ids=False)
    Templates.PROBLEM_HUT = Problem(hut_pedigree, hut_genotype)
    return Templates.PROBLEM_HUT
def pipeline_validation_experiment(location_file, true_type, true_location, pedigree, debug=False,
                                   remove_partial_calls=False):
    '''Load (the 'true') genotypes from an external source. Load a list of locations from
    'location_file'. Impute them and compare with the true genotypes.

    true_type selects the format of the true genotype data at true_location; only 'iplex' is
    supported. Returns the (p, t) pair produced by impute_problem().

    Raises ValueError if true_type is not a supported format.'''
    g = extract_genotypes(location_file)
    imputation_set = ImputationSet(pedigree, g)
    if true_type == 'iplex':
        true_genotype = im.imputation.reader.iplex_to_genotype(true_location, imputation_set)
    else:
        # Double-quoted literal so that the offending value is actually quoted in the message
        # (doubled single quotes are adjacent-literal concatenation and vanish from the text).
        raise ValueError("Unsupported true genotype format '%s'" % (true_type,))
    problem = Problem(pedigree, true_genotype)
    p, t = impute_problem(problem, debug=debug, remove_partial_calls=remove_partial_calls)
    return p, t
def setUp(self):
    '''Prepare the trio test problem, the phaser under test and the expected solution.'''
    unittest.TestCase.setUp(self)

    trio_prefix = itu.GENOTYPE_TRIO
    # The way to load a pedigree in conjunction with a genotype set is to recode
    # its sample IDs to consecutive for easier access by phasers.
    self.problem = io.read_plink(prefix=trio_prefix,
                                 haplotype=None,
                                 pedigree=trio_prefix + '.tfam')
    self.phaser = trivial_phaser()

    # Expected results: same pedigree, genotypes read from the solution data set
    expected_genotype = io_genotype.read('plink', 'genotype', prefix=itu.GENOTYPE_TRIO_SOLUTION)
    self.solution = Problem(self.problem.pedigree, expected_genotype)
def read_npz(in_file):
    '''Read problem from NPZ file. in_file may be a file name or an open file descriptor.

    The archive entries 'genotype_map', 'haplotype_poo_phase' and 'haplotype_hap_type' are
    optional; each is attached to the genotype/haplotype object only when present.
    Returns a Problem built from the pedigree, genotype, haplotype, error, info, frames and lam
    entries of the archive.'''
    files = np.load(in_file)
    # NOTE(review): 'info' and 'frames' look like pickled object entries; recent numpy versions
    # may need allow_pickle=True in np.load for those - confirm against the numpy in use.
    try:
        # Pedigree
        graph = nx.DiGraph()
        graph.add_nodes_from(files['pedigree_nodes'])
        graph.add_edges_from(files['pedigree_graph'][0])
        p = Pedigree(graph,
                     sample_id=files['pedigree_sample_id'],
                     sex=files['pedigree_sex'],
                     phenotype=files['pedigree_phenotype'],
                     node_type=files['pedigree_node_type'],
                     sample_index=files['pedigree_sample_index'],
                     num_genotyped=files['pedigree_num_genotyped'][0])

        # Genotype and haplotype data
        g = GenotypeFactory.new_instance('genotype', files['genotype_data'], files['genotype_snp'])
        h = GenotypeFactory.new_instance('haplotype', files['haplotype_data'], files['haplotype_snp'],
                                         qc=MISSING)
        error = files['error']
        h.qc = files['haplotype_qc']
        info = files['info'][0]

        # Every files[...] access re-reads the entry from the archive, so fetch the frame
        # dictionary once and reuse it for both the emptiness test and the iteration.
        frame_dict = files['frames'][0]
        frames = Frames((k, w) for k, v in frame_dict.items() for w in v[0]) if frame_dict else None
        lam = files['lam']

        # Optional fields
        if 'genotype_map' in files.files:
            g.map = files['genotype_map']
        if 'haplotype_poo_phase' in files.files:
            h.poo_phase = files['haplotype_poo_phase']
        if 'haplotype_hap_type' in files.files:
            h.hap_type = files['haplotype_hap_type']

        return Problem(p, g, haplotype=h, error=error, info=info, frames=frames, lam=lam)
    finally:
        # Release the archive handle opened by np.load instead of leaving it to the GC
        files.close()
def read_plink(**kwargs):
    '''Load a problem from the following PLINK files:

    Default          Override Option     Data                            Format
    ======================================================================================
    prefix.pdg.tfam  pedigree            Pedigree adjacency              PLINK TFAM
                                                                         (genotyped+nongenotyped samples)
    prefix.tfam      pedigree_genotyped  Genotyped sample pedigree       PLINK TFAM corresponding
                                         (sub-graph of the pedigree)     to prefix.tped
    prefix.tped      genotype            Genotype data                   PLINK TPED
    prefix.hap.tped  haplotype*          Haplotype data                  PLINK TPED
    prefix.err       error**             Genotype errors flagged         Integer array (snps x samples)
    prefix.info      info                Problem info                    pickle (binary)
    prefix.frm       frames              LD-independent SNP frames       text file
    prefix.lam       lam***              Haplotype est. recombination    text file
                                         rate

    * - hap data not loaded if this option is None.
    ** - error data not loaded (None is passed to Problem) if this file is not found or this
         option is set to None.
    *** - data not loaded if this file is not found.

    Other keyword options: 'prefix' (common file name prefix, default ''), 'verbose' (print
    progress messages, default False). Defaults derived from the prefix apply only when a
    non-empty prefix is given.

    Raises ValueError if the pedigree, pedigree_genotyped or genotype file is unspecified.'''
    # Read input options
    verbose = kwargs.get('verbose', False)
    prefix = kwargs.get('prefix', '')

    def overrideable_option(name, default):
        # An explicit keyword always wins; the default is used only if a prefix was given.
        return kwargs.get(name, default if prefix else None)

    pedigree = overrideable_option('pedigree', prefix + '.pdg.tfam')
    pedigree_genotyped = overrideable_option('pedigree_genotyped', prefix + '.tfam')
    genotype = overrideable_option('genotype', prefix + '.tped')
    haplotype = overrideable_option('haplotype', prefix + '.hap.tped')
    error_file = overrideable_option('error', prefix + '.err')
    info = overrideable_option('info', prefix + '.info')
    # Check each required option individually (testing the list itself against None is always
    # true). The error file is not required: a None value is handled when loading below.
    if any(option is None for option in (pedigree, pedigree_genotyped, genotype)):
        raise ValueError('Must specify a prefix or pedigree, pedigree_genotyped, genotype files')
    frames_file = overrideable_option('frames', prefix + '.frm')
    lam_file = overrideable_option('lam', prefix + '.lam')

    # Load data
    def print_location(x):
        return x if x else '-'

    # print(...) with a single pre-formatted string behaves the same as a print statement.
    if verbose:
        print('Reading pedigree from %s, %s ...'
              % (print_location(pedigree), print_location(pedigree_genotyped),))
    p = io_pedigree.read(pedigree, genotyped_id_file=pedigree_genotyped)

    if verbose:
        print('Reading genotype data from %s ...' % (print_location(genotype),))
    g = io_genotype.read('plink', 'genotype', tped=genotype, load_ids=False)

    if verbose:
        print('Reading haplotype data from %s ...' % (print_location(haplotype),))
    h = io_genotype.read('plink', 'haplotype', tped=haplotype, load_ids=False) if haplotype else None

    if verbose:
        print('Reading error data from %s ...' % (print_location(error_file),))
    error = np.loadtxt(error_file) if error_file and os.path.isfile(error_file) else None

    if verbose:
        print('Reading frame data from %s ...' % (print_location(frames_file),))
    frames = db_gene.snp.ld_graph.read_frames(frames_file) if frames_file else None

    lam = np.loadtxt(lam_file) if lam_file and os.path.isfile(lam_file) else None

    problem = Problem(p, g, haplotype=h, error=error, frames=frames, lam=lam)
    # Problem info is only read together with haplotype data
    if haplotype and info:
        if verbose:
            print('Reading problem info from %s ...' % (info,))
        # NOTE(security): pickle.load can execute arbitrary code; only read trusted info files.
        with open(info, 'rb') as fin:
            problem.info = pickle.load(fin)
    return problem
class Experiment(object):
    '''A validation experiment: start with a Problem object, clear a certain portion of the data,
    run a phaser, and cross-check the hap results against the original genotype data.'''
    #---------------------------------------------
    # Constructors
    #---------------------------------------------
    def __init__(self, problem, fraction=None, test_index=None):
        '''Initialize an experiment to be run on a problem, clearing out 'fraction' of the data.

        Exactly one of 'fraction' and 'test_index' must be specified.
        If test_index is specified, these specific test indices are used; otherwise a random
        fraction is generated. If test_index = 'hap', data is read from problem.h (haplotype
        array). The entire array is considered as a test array, but nothing is zeroed out.
        Useful for phasing result stats.

        Raises ValueError if neither or both of 'fraction' and 'test_index' are specified.'''
        # Exactly one selector must be given (same truth table as the exclusive-or of "is given").
        if (fraction is None) == (test_index is None):
            raise ValueError('Must specify fraction or test_index')

        # Create a working copy of the problem. Only the genotype data is copied; the pedigree
        # object is shared with the original problem.
        self.problem = Problem(problem.pedigree, problem.genotype.copy())
        self.h = self.problem.h

        # Create test set; save original genotypes in g_orig
        if test_index is None:
            self.fraction = fraction
            self.g_orig, i = clear_random_portion(self.problem.genotype.data, fraction)
        elif isinstance(test_index, (str, type(u''))) and test_index == 'hap':
            # The type check keeps array-like index sets away from the string comparison, which
            # is element-wise (and not usable as a condition) for numpy arrays.
            # Don't clear anything; call everything a test index.
            h = problem.h
            i = tuple(util.flattened_meshgrid(range(h.shape[0]), range(h.shape[1])))
            self.g_orig = problem.g
            self.h = h
            self.fraction = 1.0
        else:
            self.g_orig, i = clear_index(self.problem.g, test_index)
            # Fraction of all (SNP, sample) entries that ended up in the test set
            self.fraction = (1.0 * i[0].size) / (self.h.shape[0] * self.h.shape[1])

        self.num_tests = i[0].size
        self.test_index = i
        self.r_orig = recode.recode_single_genotype(self.g_orig)
        self.fill = self.problem.fill_fraction()[:, SAMPLE]
        # Lazily-computed cache backing the recoded_genotype property
        self.__recode_single_genotype = None

    #---------------------------------------------
    # Methods
    #---------------------------------------------
    def __repr__(self):
        # fraction is stored as a ratio (1.0 = the entire array, see __init__); scale it so that
        # the trailing percent sign is truthful.
        return 'Experiment[%s, fraction=%.2f%%]' % (repr(self.problem), 100.0 * self.fraction)

    def run(self, phaser, params=None):
        '''Run phaser (or more generally, a handler) on a problem.'''
        phaser.run(self.problem, params=params)
        # Drop the memoized recoding so that call/error statistics queried after this run are
        # recomputed from the current haplotypes instead of a pre-run snapshot.
        self.__recode_single_genotype = None
        # Column 1 is the sample column (SNP=0, sample=1)
        self.fill = self.problem.fill_fraction()[:, 1]

    def num_test_genotypes(self, field):
        '''Return the number of test genotypes, broken by field (SNP=0, sample=1).'''
        return self.__group_by_field(np.arange(len(self.test_index[0])), field)

    def where_called(self):
        '''Return the indices of genotypes in which both alleles were called.'''
        # Positive entries of r = called entries
        return recode.where_called(self.recoded_genotype)[0]

    def called(self, field):
        '''Return the number of genotypes in which both alleles were called, broken by field
        (SNP=0, sample=1).'''
        return self.__group_by_field(self.where_called(), field)

    #---------------------------------------------
    # Properties
    #---------------------------------------------
    @property
    def test_orig(self):
        '''Return the single-number recoding of the haplotype entries at the test indices.'''
        # NOTE(review): the name suggests the original (deleted) genotypes, but this reads
        # self.h exactly like recoded_genotype does; the recoding of g_orig is kept in
        # self.r_orig. Confirm which one callers expect.
        i = self.test_index
        return recode.recode_single_genotype(self.h[i[SNP], i[SAMPLE], :])

    @property
    def test_called(self):
        '''Return the called set of haplotypes corresponding to the test genotypes.'''
        i = self.test_index
        return self.h[i[SNP], i[SAMPLE], :]

    @property
    def recoded_genotype(self):
        '''Return the genotype test set, recoded as a single number of allele pair.
        The result is computed on first access and cached until the next run().'''
        if self.__recode_single_genotype is None:
            self.__recode_single_genotype = recode.recode_single_genotype(self.test_called)
        return self.__recode_single_genotype

    @property
    def total_called(self):
        '''Return the number of genotypes in which both alleles were called.'''
        return len(self.where_called())

    @property
    def total_partial_called(self):
        '''Return the number of genotypes in which one allele was called.'''
        return len(recode.where_partial_called(self.recoded_genotype)[0])

    @property
    def total_errors(self):
        '''Return the number of genotypes that were called incorrectly. (A genotype is an allele
        pair.)'''
        # Count entries that were called AND are different than the corresponding original value
        return len(recode.where_error(self.recoded_genotype, self.r_orig)[0])

    @property
    def total_partial_errors(self):
        '''Return the number of genotypes with a partial error, as reported by
        recode.where_partial_error. (A genotype is an allele pair.)'''
        # This happens when hap=(0,1), genotype=(2,2) or hap(0,2), genotype=(1,1)
        return len(recode.where_partial_error(self.recoded_genotype, self.r_orig)[0])

    @property
    def full_call_fraction(self):
        '''Return the fraction of test genotypes in which both alleles were called.'''
        return (1.0 * self.total_called) / self.num_tests

    @property
    def partial_call_fraction(self):
        '''Return the fraction of test genotypes in which one allele was called.'''
        return (1.0 * self.total_partial_called) / self.num_tests

    @property
    def full_error_fraction(self):
        '''Return the fraction of test genotypes that were called incorrectly.'''
        return (1.0 * self.total_errors) / self.num_tests

    @property
    def partial_error_fraction(self):
        '''Return the fraction of test genotypes with a partial error.'''
        return (1.0 * self.total_partial_errors) / self.num_tests

    @property
    def stats(self):
        '''Return a tuple containing all experiment statistics: fraction, full and partial call
        fractions, full and partial error fractions.'''
        return (self.fraction, self.full_call_fraction, self.partial_call_fraction,
                self.full_error_fraction, self.partial_error_fraction)

    #---------------------------------------------
    # Private Methods
    #---------------------------------------------
    def __group_by_field(self, i, field):
        '''Group a test index subset i by field (SNP=0, sample=1). Returns an integer array with
        one count per index along that axis of the genotype data; unseen indices stay 0.'''
        size = self.problem.genotype.data.shape[field]
        group_count = util.dict_to_array(statutil.group_by_value(self.test_index[field][i]))
        result = np.zeros((size,), dtype=int)
        result[group_count['k']] = group_count['v']
        return result
class Experiment(object):
    '''A validation experiment: start with a Problem object, clear a certain portion of the data,
    run a phaser, and cross-check the hap results against the original genotype data.'''
    #---------------------------------------------
    # Constructors
    #---------------------------------------------
    def __init__(self, problem, fraction=None, test_index=None):
        '''Initialize an experiment to be run on a problem, clearing out 'fraction' of the data.

        Exactly one of 'fraction' and 'test_index' must be specified.
        If test_index is specified, these specific test indices are used; otherwise a random
        fraction is generated. If test_index = 'hap', data is read from problem.h (haplotype
        array). The entire array is considered as a test array, but nothing is zeroed out.
        Useful for phasing result stats.

        Raises ValueError if neither or both of 'fraction' and 'test_index' are specified.'''
        # Exactly one selector must be given (same truth table as the exclusive-or of "is given").
        if (fraction is None) == (test_index is None):
            raise ValueError('Must specify fraction or test_index')

        # Create a working copy of the problem. Only the genotype data is copied; the pedigree
        # object is shared with the original problem.
        self.problem = Problem(problem.pedigree, problem.genotype.copy())
        self.h = self.problem.h

        # Create test set; save original genotypes in g_orig
        if test_index is None:
            self.fraction = fraction
            self.g_orig, i = clear_random_portion(self.problem.genotype.data, fraction)
        elif isinstance(test_index, (str, type(u''))) and test_index == 'hap':
            # The type check keeps array-like index sets away from the string comparison, which
            # is element-wise (and not usable as a condition) for numpy arrays.
            # Don't clear anything; call everything a test index.
            h = problem.h
            i = tuple(util.flattened_meshgrid(range(h.shape[0]), range(h.shape[1])))
            self.g_orig = problem.g
            self.h = h
            self.fraction = 1.0
        else:
            self.g_orig, i = clear_index(self.problem.g, test_index)
            # Fraction of all (SNP, sample) entries that ended up in the test set
            self.fraction = (1.0 * i[0].size) / (self.h.shape[0] * self.h.shape[1])

        self.num_tests = i[0].size
        self.test_index = i
        self.r_orig = recode.recode_single_genotype(self.g_orig)
        self.fill = self.problem.fill_fraction()[:, SAMPLE]
        # Lazily-computed cache backing the recoded_genotype property
        self.__recode_single_genotype = None

    #---------------------------------------------
    # Methods
    #---------------------------------------------
    def __repr__(self):
        # fraction is stored as a ratio (1.0 = the entire array, see __init__); scale it so that
        # the trailing percent sign is truthful.
        return 'Experiment[%s, fraction=%.2f%%]' % (repr(self.problem), 100.0 * self.fraction)

    def run(self, phaser, params=None):
        '''Run phaser (or more generally, a handler) on a problem.'''
        phaser.run(self.problem, params=params)
        # Drop the memoized recoding so that call/error statistics queried after this run are
        # recomputed from the current haplotypes instead of a pre-run snapshot.
        self.__recode_single_genotype = None
        # Column 1 is the sample column (SNP=0, sample=1)
        self.fill = self.problem.fill_fraction()[:, 1]

    def num_test_genotypes(self, field):
        '''Return the number of test genotypes, broken by field (SNP=0, sample=1).'''
        return self.__group_by_field(np.arange(len(self.test_index[0])), field)

    def where_called(self):
        '''Return the indices of genotypes in which both alleles were called.'''
        # Positive entries of r = called entries
        return recode.where_called(self.recoded_genotype)[0]

    def called(self, field):
        '''Return the number of genotypes in which both alleles were called, broken by field
        (SNP=0, sample=1).'''
        return self.__group_by_field(self.where_called(), field)

    #---------------------------------------------
    # Properties
    #---------------------------------------------
    @property
    def test_orig(self):
        '''Return the single-number recoding of the haplotype entries at the test indices.'''
        # NOTE(review): the name suggests the original (deleted) genotypes, but this reads
        # self.h exactly like recoded_genotype does; the recoding of g_orig is kept in
        # self.r_orig. Confirm which one callers expect.
        i = self.test_index
        return recode.recode_single_genotype(self.h[i[SNP], i[SAMPLE], :])

    @property
    def test_called(self):
        '''Return the called set of haplotypes corresponding to the test genotypes.'''
        i = self.test_index
        return self.h[i[SNP], i[SAMPLE], :]

    @property
    def recoded_genotype(self):
        '''Return the genotype test set, recoded as a single number of allele pair.
        The result is computed on first access and cached until the next run().'''
        if self.__recode_single_genotype is None:
            self.__recode_single_genotype = recode.recode_single_genotype(self.test_called)
        return self.__recode_single_genotype

    @property
    def total_called(self):
        '''Return the number of genotypes in which both alleles were called.'''
        return len(self.where_called())

    @property
    def total_partial_called(self):
        '''Return the number of genotypes in which one allele was called.'''
        return len(recode.where_partial_called(self.recoded_genotype)[0])

    @property
    def total_errors(self):
        '''Return the number of genotypes that were called incorrectly. (A genotype is an allele
        pair.)'''
        # Count entries that were called AND are different than the corresponding original value
        return len(recode.where_error(self.recoded_genotype, self.r_orig)[0])

    @property
    def total_partial_errors(self):
        '''Return the number of genotypes with a partial error, as reported by
        recode.where_partial_error. (A genotype is an allele pair.)'''
        # This happens when hap=(0,1), genotype=(2,2) or hap(0,2), genotype=(1,1)
        return len(recode.where_partial_error(self.recoded_genotype, self.r_orig)[0])

    @property
    def full_call_fraction(self):
        '''Return the fraction of test genotypes in which both alleles were called.'''
        return (1.0 * self.total_called) / self.num_tests

    @property
    def partial_call_fraction(self):
        '''Return the fraction of test genotypes in which one allele was called.'''
        return (1.0 * self.total_partial_called) / self.num_tests

    @property
    def full_error_fraction(self):
        '''Return the fraction of test genotypes that were called incorrectly.'''
        return (1.0 * self.total_errors) / self.num_tests

    @property
    def partial_error_fraction(self):
        '''Return the fraction of test genotypes with a partial error.'''
        return (1.0 * self.total_partial_errors) / self.num_tests

    @property
    def stats(self):
        '''Return a tuple containing all experiment statistics: fraction, full and partial call
        fractions, full and partial error fractions.'''
        return (self.fraction, self.full_call_fraction, self.partial_call_fraction,
                self.full_error_fraction, self.partial_error_fraction)

    #---------------------------------------------
    # Private Methods
    #---------------------------------------------
    def __group_by_field(self, i, field):
        '''Group a test index subset i by field (SNP=0, sample=1). Returns an integer array with
        one count per index along that axis of the genotype data; unseen indices stay 0.'''
        size = self.problem.genotype.data.shape[field]
        group_count = util.dict_to_array(statutil.group_by_value(self.test_index[field][i]))
        result = np.zeros((size,), dtype=int)
        result[group_count['k']] = group_count['v']
        return result
def read_plink(**kwargs):
    '''Load a problem from the following PLINK files:

    Default          Override Option     Data                            Format
    ======================================================================================
    prefix.pdg.tfam  pedigree            Pedigree adjacency              PLINK TFAM
                                                                         (genotyped+nongenotyped samples)
    prefix.tfam      pedigree_genotyped  Genotyped sample pedigree       PLINK TFAM corresponding
                                         (sub-graph of the pedigree)     to prefix.tped
    prefix.tped      genotype            Genotype data                   PLINK TPED
    prefix.hap.tped  haplotype*          Haplotype data                  PLINK TPED
    prefix.err       error**             Genotype errors flagged         Integer array (snps x samples)
    prefix.info      info                Problem info                    pickle (binary)
    prefix.frm       frames              LD-independent SNP frames       text file
    prefix.lam       lam***              Haplotype est. recombination    text file
                                         rate

    * - hap data not loaded if this option is None.
    ** - error data not loaded (None is passed to Problem) if this file is not found or this
         option is set to None.
    *** - data not loaded if this file is not found.

    Other keyword options: 'prefix' (common file name prefix, default ''), 'verbose' (print
    progress messages, default False). Defaults derived from the prefix apply only when a
    non-empty prefix is given.

    Raises ValueError if the pedigree, pedigree_genotyped or genotype file is unspecified.'''
    # Read input options
    verbose = kwargs.get('verbose', False)
    prefix = kwargs.get('prefix', '')

    def overrideable_option(name, default):
        # An explicit keyword always wins; the default is used only if a prefix was given.
        return kwargs.get(name, default if prefix else None)

    pedigree = overrideable_option('pedigree', prefix + '.pdg.tfam')
    pedigree_genotyped = overrideable_option('pedigree_genotyped', prefix + '.tfam')
    genotype = overrideable_option('genotype', prefix + '.tped')
    haplotype = overrideable_option('haplotype', prefix + '.hap.tped')
    error_file = overrideable_option('error', prefix + '.err')
    info = overrideable_option('info', prefix + '.info')
    # Check each required option individually (testing the list itself against None is always
    # true). The error file is not required: a None value is handled when loading below.
    if any(option is None for option in (pedigree, pedigree_genotyped, genotype)):
        raise ValueError('Must specify a prefix or pedigree, pedigree_genotyped, genotype files')
    frames_file = overrideable_option('frames', prefix + '.frm')
    lam_file = overrideable_option('lam', prefix + '.lam')

    # Load data
    def print_location(x):
        return x if x else '-'

    # print(...) with a single pre-formatted string behaves the same as a print statement.
    if verbose:
        print('Reading pedigree from %s, %s ...'
              % (print_location(pedigree), print_location(pedigree_genotyped),))
    p = io_pedigree.read(pedigree, genotyped_id_file=pedigree_genotyped)

    if verbose:
        print('Reading genotype data from %s ...' % (print_location(genotype),))
    g = io_genotype.read('plink', 'genotype', tped=genotype, load_ids=False)

    if verbose:
        print('Reading haplotype data from %s ...' % (print_location(haplotype),))
    h = io_genotype.read('plink', 'haplotype', tped=haplotype, load_ids=False) if haplotype else None

    if verbose:
        print('Reading error data from %s ...' % (print_location(error_file),))
    error = np.loadtxt(error_file) if error_file and os.path.isfile(error_file) else None

    if verbose:
        print('Reading frame data from %s ...' % (print_location(frames_file),))
    frames = db_gene.snp.ld_graph.read_frames(frames_file) if frames_file else None

    lam = np.loadtxt(lam_file) if lam_file and os.path.isfile(lam_file) else None

    problem = Problem(p, g, haplotype=h, error=error, frames=frames, lam=lam)
    # Problem info is only read together with haplotype data
    if haplotype and info:
        if verbose:
            print('Reading problem info from %s ...' % (info,))
        # NOTE(security): pickle.load can execute arbitrary code; only read trusted info files.
        with open(info, 'rb') as fin:
            problem.info = pickle.load(fin)
    return problem