def test_chromosometemplate():
    """Check closest_marker on a template with one marker per megabase."""
    template = ChromosomeTemplate()
    # Markers are added at 1 Mb, 2 Mb, ..., 99 Mb.
    for megabase in range(1, 100):
        template.add_genotype(map_position=megabase, bp=(megabase * 1e6))

    # (query position in bp, expected return value)
    cases = [(0, 0), (5000001, 4), (5999999, 5)]
    for position, expected in cases:
        assert template.closest_marker(position) == expected

    # A position far past the final marker should give nmark() - 1
    assert template.closest_marker(1e10) == template.nmark() - 1
def test_chromosometemplate():
    # Marker finder test: 99 markers placed at 1e6 bp intervals
    chrom = ChromosomeTemplate()
    for step in range(1, 100):
        physical_position = step * 1e6
        chrom.add_genotype(map_position=step, bp=physical_position)

    # Query below the first marker position
    assert chrom.closest_marker(0) == 0

    # Queries just above 5e6 bp and just below 6e6 bp
    assert chrom.closest_marker(5000001) == 4
    assert chrom.closest_marker(5999999) == 5

    # Query far beyond the last marker position
    assert chrom.closest_marker(1e10) == chrom.nmark() - 1
def read_vcf(filename, require_pass=False, freq_info=None):
    """
    Reads a VCF file and returns a Population object with the
    individuals represented in the file.

    Genotypes generated by this function will be sparse.

    :param filename: VCF file to read (passed to smartopen)
    :param require_pass: only allow variants with PASS under FILTER
    :param freq_info: INFO field to get allele frequency from. When None,
        every marker is given a frequency of 0
    :type require_pass: bool
    :type freq_info: string

    :returns: Individuals in the VCF
    :rtype: Population
    """
    with smartopen(filename) as f:
        genotypes = []
        pop, inds = _vcf_parseheader(f)

        last_chrom = None
        chromobj = None

        for line in f:
            record = VCFRecord(line)

            if require_pass and not record.filter_passed:
                continue

            if record.chrom != last_chrom:
                # The chromosome label changed: store the template built so
                # far and start a new one for the new label.
                if last_chrom is not None:
                    pop.add_chromosome(chromobj)
                chromobj = ChromosomeTemplate(label=record.chrom)

            if freq_info is not None:
                freq = _vcf_get_infofreq(record.info, freq_info)
            else:
                freq = 0

            genotypes.append(record.genotypes())
            chromobj.add_genotype(bp=record.pos,
                                  label=record.label,
                                  frequency=freq)
            last_chrom = record.chrom

        # A header-only file, or one where every record was filtered out,
        # never creates a template; don't register None as a chromosome.
        if chromobj is not None:
            pop.add_chromosome(chromobj)

    pop.chromosomes.finalize()

    for ind in inds:
        # Initialize new genotypes
        ind._init_genotypes(sparse=True)

    # Now actually sift through markers and assign them to individuals.
    # Rows were collected in file order, so the n-th row belongs to the n-th
    # (chromosome index, marker index) pair when walking the chromosomes in
    # order. The pairs are generated lazily to avoid building a second
    # marker-sized list.
    marker_coords = (
        (chromidx, markidx)
        for chromidx, chrom in enumerate(pop.chromosomes)
        for markidx in range(chrom.nmark())
    )
    # zip stops at the shorter of the two sequences
    for raw, (chromidx, markidx) in zip(range(len(genotypes)), marker_coords):
        assign_genorow(genotypes[raw], inds, chromidx, markidx)
        # Kill the row so we don't end up with the whole dataset in memory
        # twice
        genotypes[raw] = None

    return pop
def read_vcf(filename, require_pass=False, freq_info=None, info_filters=None):
    """
    Reads a VCF file and returns a Population object with the
    individuals represented in the file.

    :param filename: VCF file to read (passed to open())
    :param require_pass: skip records whose filter_passed attribute is false
    :param freq_info: INFO field to get allele frequency from. When the
        field holds a comma-separated list only the first value is used;
        records without the field get a frequency of 0
    :param info_filters: callables that are each given the VCFRecord; a
        record is kept only if every callable returns a true value
    :type require_pass: bool
    :type freq_info: string
    :type info_filters: iterable of callables

    :returns: Individuals in the VCF
    :rtype: Population
    :raises ValueError: if an entry of info_filters is not callable, or if
        the file contains no column header line
    """
    # Materialize so a one-shot iterable survives being validated here and
    # then being re-used for every record.
    info_filters = list(info_filters) if info_filters else []
    for info_filter in info_filters:
        if not callable(info_filter):
            raise ValueError('Filter not callable')

    with open(filename) as f:
        pop = Population()

        # Skip the '##' meta lines; the first line starting with a single
        # '#' is the column header, whose tenth and later fields are the
        # individual IDs.
        inds = None
        for line in f:
            if line.startswith('##'):
                continue
            elif line.startswith('#'):
                ind_ids = line.strip().split()[9:]
                inds = [Individual(pop, ind_id) for ind_id in ind_ids]
                for ind in inds:
                    pop.register_individual(ind)
                break

        if inds is None:
            # Previously this surfaced later as an UnboundLocalError
            raise ValueError('No column header line found in VCF')

        last_chrom = None
        chromobj = None
        genotypes = []

        # The header loop stopped right after the column header, so the
        # remaining lines of the file are the variant records.
        for line in f:
            record = VCFRecord(line)

            if not all(info_filter(record) for info_filter in info_filters):
                continue

            if require_pass and not record.filter_passed:
                continue

            if record.chrom != last_chrom:
                # The chromosome label changed: finish and store the
                # template built so far, then start a new one.
                if last_chrom is not None:
                    chromobj.finalize()
                    pop.add_chromosome(chromobj)
                chromobj = ChromosomeTemplate(label=record.chrom)

            if freq_info is not None and freq_info in record.info:
                freq = record.info[freq_info]
                if ',' in freq:
                    freq = freq.split(',')[0]
                freq = float(freq)
            else:
                freq = 0

            genotypes.append(record.genotypes())
            chromobj.add_genotype(bp=record.pos,
                                  label=record.label,
                                  frequency=freq)
            last_chrom = record.chrom

        # No template exists if the file had no records or all of them were
        # filtered out; the original code failed here on the unbound name.
        if chromobj is not None:
            chromobj.finalize()
            pop.add_chromosome(chromobj)

    for ind in inds:
        # Initialize new genotypes
        ind._init_genotypes(sparse=True)

    # Now actually sift through markers and assign them to individuals.
    # Rows were collected in file order, so the n-th row belongs to the n-th
    # (chromosome index, marker index) pair when walking the chromosomes in
    # order.
    final_indices = []
    for chromidx, chromobj in enumerate(pop.chromosomes):
        nmark = chromobj.nmark()
        final_indices.extend(zip([chromidx] * nmark, range(nmark)))

    # zip stops at the shorter of the two sequences
    for raw, (chromidx, markidx) in zip(range(len(genotypes)), final_indices):
        assign_genorow(genotypes[raw], inds, chromidx, markidx)
        # Kill the row so we don't end up with the whole dataset in memory
        # twice
        genotypes[raw] = None

    return pop