def read_beagle_markerfile(filename, label=None):
    """
    Reads marker locations from a BEAGLE formatted file

    Every line of the file is parsed as a BeagleMarkerRecord and added, in
    file order, to a newly created ChromosomeTemplate. Positions have to be
    non-negative and strictly increasing from one line to the next.

    :param filename: The file to be read
    :param label: An optional label to give the chromosome, since the BEAGLE
        format does not require it
    :type filename: string
    :rtype: ChromosomeTemplate
    :raises FileFormatError: if a record has a negative position, or a
        position that is not greater than the previous record's position
    """
    with smartopen(filename) as f:
        chrom = ChromosomeTemplate(label=label)

        # Sentinel below any valid position, so the first record always
        # passes the ordering check
        last_pos = -1
        for line in f:
            rec = BeagleMarkerRecord(line)

            if rec.pos < 0:
                raise FileFormatError(
                    'Bad position for genotype: {}'.format(rec.pos))
            elif rec.pos <= last_pos:
                # '<=' means a repeated position is rejected as well
                raise FileFormatError('Markers in file out of order')

            # No map position is available here; only the physical (bp)
            # position and alleles from the record are stored
            chrom.add_genotype(None,
                               map_position=None,
                               label=rec.label,
                               bp=rec.pos,
                               reference=rec.reference,
                               alternates=rec.alternates)
            last_pos = rec.pos

    return chrom
def read_beagle_markerfile(filename, label=None):
    """
    Reads marker locations from a BEAGLE formatted file

    The file is read line by line; each line becomes a BeagleMarkerRecord
    whose label, position and alleles are added to a single
    ChromosomeTemplate. Records must appear with non-negative, strictly
    increasing positions.

    :param filename: The file to be read
    :param label: An optional label to give the chromosome, since the BEAGLE
        format does not require it
    :type filename: string
    :rtype: ChromosomeTemplate
    :raises FileFormatError: when a position is negative, or when a position
        is less than or equal to the position on the preceding line
    """
    with smartopen(filename) as f:
        chrom = ChromosomeTemplate(label=label)

        # Start below zero so that a first marker at position 0 is accepted
        last_pos = -1
        for line in f:
            rec = BeagleMarkerRecord(line)

            if rec.pos < 0:
                raise FileFormatError(
                    'Bad position for genotype: {}'.format(rec.pos))

            if rec.pos <= last_pos:
                # Duplicated positions count as out of order too
                raise FileFormatError('Markers in file out of order')

            chrom.add_genotype(
                None,
                map_position=None,
                label=rec.label,
                bp=rec.pos,
                reference=rec.reference,
                alternates=rec.alternates,
            )
            last_pos = rec.pos

    return chrom
def write_sgs(data, filename):
    """
    Writes every segment in data.segments to filename, one tab-delimited
    line per segment, following the GERMLINE column layout.

    GERMLINE files are text files with the format:
        0) Family ID 1
        1) Individual ID 1
        2) Family ID 2
        3) Individual ID 2
        4) Chromosome
        5) Segment start (bp/cM)
        6) Segment end (bp/cM)
        7) Segment start (SNP)
        8) Segment end (SNP)
        9) Total SNPs in segment
        10) Genetic length of segment
        11) Units for genetic length (cM or MB)
        12) Mismatching SNPs in segment
        13) 1 if Individual 1 is homozygous in match; 0 otherwise
        14) 1 if Individual 2 is homozygous in match; 0 otherwise

    The length column is the segment's physical size divided by 1e6, the
    unit column is always 'MB', and columns 12-14 are written as the
    placeholder 'X'.

    :param data: object whose segments attribute yields the segments to write
    :param filename: path of the file to write
    """
    with smartopen(filename, 'w') as out:
        for seg in data.segments:
            # Both individuals' labels come first
            # (presumably (family, individual) pairs -- verify full_label)
            first_label = seg.ind1.full_label
            second_label = seg.ind2.full_label
            fields = list(first_label)
            fields.extend(second_label)

            # Location columns: chromosome, physical start/end, marker
            # labels at the ends, marker count, and size
            chrom_label = seg.chromosome.label
            location = seg.physical_location
            marker_labels = seg.marker_labels
            marker_count = seg.nmark
            megabases = seg.physical_size / 1e6  # Megabases, not basepairs
            fields.append(chrom_label)
            fields.extend(location)
            fields.extend(marker_labels)
            fields.append(marker_count)
            fields.append(megabases)

            # Unit, then placeholders for the extra info GERMLINE gives you
            # like mismatch rate
            fields.append('MB')
            fields.extend(('X', 'X', 'X'))

            text = '\t'.join(str(value) for value in fields)
            out.write(text)
            out.write('\n')
def read_kinship(filename):
    '''
    Reads a KinInbCoef formatted file of kinship and inbreeding coefficients

    Each line must hold exactly four whitespace-separated fields: family,
    first individual, second individual and the coefficient.

    :param filename: the filename to be read
    :type filename: string

    Returns: a dictionary in the format
        {frozenset({(fam, ind_a), (fam, ind_b)}): kinship/inbreeding}
    '''
    coefficients = {}

    with smartopen(filename) as handle:
        for row in handle:
            family, first, second, value = row.strip().split()

            # An unordered pair; when first == second the set collapses to a
            # single member
            pair = frozenset([(family, first), (family, second)])
            coefficients[pair] = float(value)

    return coefficients
def read_beagle_genotypefile(filename, pop, missingcode='0'):
    '''
    Reads BEAGLE formatted genotype files

    The file is scanned twice. The first pass handles the leading records:
    an 'I' record creates the individuals, and phenotype records fill in
    each individual's phenotypes. The second pass collects every line
    starting with 'M' and turns the alleles into genotypes.

    Arguments
    :param filename: Filename of BEAGLE genotype file
    :param pop: the population to add these individuals to
    :param missingcode: The value that indicates a missing genotype
    :type missingcode: string
    :rtype: void
    '''
    with smartopen(filename) as handle:
        for text in handle:
            record = BeagleGenotypeRecord(text)

            if record.identifier == 'I':
                # Only every second entry is used
                # (presumably one column per allele -- verify)
                inds = [Individual(pop, name) for name in record.data[::2]]
                continue

            if not record.is_phenotype_record:
                # We've reached the genotypes, and we're skipping out
                break

            for person, value in zip(inds, record.data[::2]):
                if record.identifier == 'A':
                    # 'A' records are stored as a boolean: True when '2'
                    value = value == '2'
                else:
                    # Numeric when possible, otherwise left as the raw string
                    try:
                        value = float(value)
                    except ValueError:
                        pass
                person.phenotypes[record.label] = value

        # Rewind and gather the marker rows, alleles grouped in twos
        handle.seek(0)
        marker_rows = []
        for text in handle:
            if text.startswith('M'):
                alleles = BeagleGenotypeRecord(text).data
                marker_rows.append(list(grouper(alleles, 2)))

        # Transpose: rows are markers, we need one sequence per individual
        per_individual = zip(*marker_rows)
        for person, allele_pairs in zip(inds, per_individual):
            person.genotypes = gt_from_seq(person.chromosomes,
                                           allele_pairs,
                                           missing_code=missingcode)
def read_beagle_genotypefile(filename, pop, missingcode='0'):
    '''
    Reads BEAGLE formatted genotype files

    Individuals are created from the 'I' record, phenotypes are read from
    the phenotype records that precede the genotypes, and genotypes are
    built from all lines that start with 'M'.

    Arguments
    :param filename: Filename of BEAGLE genotype file
    :param pop: the population to add these individuals to
    :param missingcode: The value that indicates a missing genotype
    :type missingcode: string
    :rtype: void
    '''
    def coerce_phenotype(identifier, raw):
        # 'A' records become a boolean (True when the value is '2'); any
        # other record becomes a float when it parses, else stays a string
        if identifier == 'A':
            return raw == '2'
        try:
            return float(raw)
        except ValueError:
            return raw

    with smartopen(filename) as source:
        for entry in source:
            rec = BeagleGenotypeRecord(entry)

            if rec.identifier == 'I':
                inds = [Individual(pop, tag) for tag in rec.data[::2]]
            elif rec.is_phenotype_record:
                # Every second data entry is read, matching the 'I' record
                for member, raw in zip(inds, rec.data[::2]):
                    status = coerce_phenotype(rec.identifier, raw)
                    member.phenotypes[rec.label] = status
            else:
                # We've reached the genotypes, and we're skipping out
                break

        # Start over from the top to pick up the marker lines
        source.seek(0)
        rows = [list(grouper(BeagleGenotypeRecord(entry).data, 2))
                for entry in source
                if entry.startswith('M')]

        # zip(*rows) regroups marker-major rows into one series per
        # individual
        for member, series in zip(inds, zip(*rows)):
            member.genotypes = gt_from_seq(
                member.chromosomes, series, missing_code=missingcode)
def read_germline(filename):
    '''
    Reads a GERMLINE formatted SGS filename into an SGSAnalysis object

    GERMLINE files are text files with the format:
        0) Family ID 1
        1) Individual ID 1
        2) Family ID 2
        3) Individual ID 2
        4) Chromosome
        5) Segment start (bp/cM)
        6) Segment end (bp/cM)
        7) Segment start (SNP)
        8) Segment end (SNP)
        9) Total SNPs in segment
        10) Length of segment
        11) Units for genetic length (cM or MB)
        12) Mismatching SNPs in segment
        13) 1 if Individual 1 is homozygous in match; 0 otherwise
        14) 1 if Individual 2 is homozygous in match; 0 otherwise

    This function only uses 0-6.

    Each line is parsed as a GermlineRecord. One SGS object is kept per
    pair of individuals, and each line adds one Segment to its pair.

    :param filename: the file to be read
    :rtype: SGSAnalysis
    '''
    results = SGSAnalysis()

    with smartopen(filename) as handle:
        for text in handle:
            record = GermlineRecord(text)

            # First segment seen for this pair: create its container
            if rec_pair_missing(results, record):
                results[record.pair] = SGS(record.ind1, record.ind2)

            # The physical location is only kept when the record says its
            # locations are in base pairs
            if record.bp_locations:
                location = record.location
            else:
                location = None

            results[record.pair].append(
                Segment(record.ind1, record.ind2, record.chromosome,
                        None, None, physical_location=location))

    return results


def rec_pair_missing(results, record):
    # True when the record's pair has no entry in the analysis yet
    return record.pair not in results
def from_file(filename):
    """
    Reads a trait from a file

    The first line supplies two whitespace-separated values: the trait type
    and the trait name. Each following line describes one effect and must
    have six whitespace-separated fields: chromosome, location, two fields
    that are ignored, and the two values passed to add_effect as a and k.

    :param filename: path to file
    :type filename: string
    :rtype: QuantitativeTrait
    :raises NotImplementedError: if an effect line does not have exactly six
        fields
    """
    with smartopen(filename) as f:
        trait_type, name = f.readline().strip().split()
        trait = QuantitativeTrait(trait_type, name)

        for line in f:
            fields = line.strip().split()

            # The guard must agree with the six-name unpacking below. It
            # previously demanded five fields, so every line failed either
            # here or on the unpacking.
            # NOTE(review): the six-field layout is taken from the unpacking;
            # confirm against the trait file format.
            if len(fields) != 6:
                # TODO: implement epistatic effects in file
                raise NotImplementedError(
                    'Epistatic effects not yet implemented')

            chrom, loc, _, _, a, k = fields
            locus = chrom, loc
            # a and k are passed on as the strings read from the file
            trait.add_effect(locus, a, k)

    return trait
def read_vcf(filename, require_pass=False, freq_info=None):
    """
    Reads a VCF file and returns a Population object with the
    individuals represented in the file

    Genotypes generated by this function will be sparse

    A new ChromosomeTemplate is started whenever the chromosome of a record
    differs from that of the previous accepted record. Genotype rows are
    buffered while reading and handed to the individuals once all
    chromosomes have been added to the population.

    :param filename: path of the VCF file to read
    :param require_pass: only allow variants with PASS under FILTER
    :type require_pass: bool
    :param freq_info: INFO field to get allele frequency from
    :type freq_info: string

    :returns: Individuals in the VCF
    :rtype: Population
    """
    with smartopen(filename) as handle:
        rows = []
        pop, inds = _vcf_parseheader(handle)

        previous_chrom = None
        template = None
        for text in handle:
            record = VCFRecord(text)

            if require_pass and not record.filter_passed:
                continue

            if record.chrom != previous_chrom:
                # A finished template is stored before a new one is started
                if previous_chrom is not None:
                    pop.add_chromosome(template)
                template = ChromosomeTemplate(label=record.chrom)

            # Frequency is 0 unless an INFO field was named
            freq = (_vcf_get_infofreq(record.info, freq_info)
                    if freq_info is not None else 0)

            rows.append(record.genotypes())

            template.add_genotype(bp=record.pos,
                                  label=record.label,
                                  frequency=freq)

            previous_chrom = record.chrom

    # The last template is still pending once the file is exhausted
    pop.add_chromosome(template)
    pop.chromosomes.finalize()

    # Initialize new genotypes
    for person in inds:
        person._init_genotypes(sparse=True)

    # Now actually sift through markers and assign them to individuals:
    # list the (chromosome index, marker index) of every marker, in order
    targets = []
    for chrom_index, chrom in enumerate(pop.chromosomes):
        marker_count = chrom.nmark()
        targets.extend(
            zip([chrom_index] * marker_count, range(marker_count)))

    # Buffered rows are paired with those targets by position
    for raw, (row, target) in enumerate(zip(rows, targets)):
        chrom_index, marker_index = target
        assign_genorow(row, inds, chrom_index, marker_index)

        # Kill the row so we don't end up with the whole dataset in memory
        # twice
        rows[raw] = None

    return pop
# sys and numpy are used below (sys.argv, np.array) and were not imported
# alongside the other imports of this script
import sys

import numpy as np

import pydigree as pyd
from pydigree.io import smartopen
from pydigree.sgs.sgs import intervals_to_array
from pydigree.ibs import ibs
from pydigree.common import table, runs

# Command line arguments: <replicate> <seed size>
replicate = sys.argv[1]
ms = int(sys.argv[2])

prefix = 'null'

# Read the PLINK pedigree for this replicate and keep pedigree '1'
peds = pyd.io.plink.read_plink('{}-{}.ped'.format(prefix, replicate),
                               '{}.map'.format(prefix))
ped = peds['1']

# Run the segment detection with the requested seed size
s = pyd.sgs.sgs_population(ped, seed_size=ms)

# Load the IBD states from the companion file, keyed by the unordered pair
# of individual ids; everything after the third field is the state series
with smartopen('{}-{}.ibd.gz'.format(prefix, replicate)) as f:
    trueibd = {}
    for line in f:
        fam, id1, id2, ibd_states = line.strip().split(None, 3)
        trueibd[frozenset({id1, id2})] = np.array(
            [int(x) for x in ibd_states.split()])

# Detected segments (a) and file states (b) for individuals '7' and '8' on
# the first chromosome
a = intervals_to_array(s[frozenset({ped['7'], ped['8']})][0],
                       ped.chromosomes[0].nmark())
b = trueibd[frozenset({'7', '8'})]

# Per-marker ibs() value between the two individuals on the first chromosome
genos1 = zip(*ped['7'].genotypes[0])
genos2 = zip(*ped['8'].genotypes[0])
identical = [ibs(x, y) for x, y in zip(genos1, genos2)]