def read_beagle_markerfile(filename, label=None):
    '''
    Reads marker locations from a BEAGLE format marker file

    Each line of the file is parsed with BeagleMarkerRecord and added to a
    new ChromosomeTemplate as a marker with a base-pair position, a label,
    a reference allele and alternate alleles. No genetic (cM) position is
    set for the markers.

    Arguments
    -----
    filename: The file to be read
    label: An optional label to give the chromosome, since the BEAGLE
        format does not require it

    Returns: a ChromosomeTemplate object

    Raises: FileFormatError if a marker has a negative position, or if a
        marker's position is not strictly greater than the position of the
        marker before it
    '''
    with open(filename) as f:
        chrom = ChromosomeTemplate(label=label)

        # Start below any valid position so the first marker always passes
        # the ordering check
        last_pos = -1
        for line in f:
            rec = BeagleMarkerRecord(line)

            if rec.pos < 0:
                raise FileFormatError(
                    'Bad position for genotype: {}'.format(rec.pos))
            elif rec.pos <= last_pos:
                # Repeated positions are rejected here as well as
                # decreasing ones
                raise FileFormatError('Markers in file out of order')

            chrom.add_genotype(None,
                               cm=None,
                               label=rec.label,
                               bp=rec.pos,
                               reference=rec.reference,
                               alternates=rec.alternates)
            last_pos = rec.pos

    return chrom
def direct_to_disk(filename, pop, seed_size=500, min_length=1, size_unit='mb',
                   min_density=100, maxmiss=0.25, onlywithin=False, njobs=1):
    '''
    Performs an SGS analysis and writes the segments straight to a file

    Every segment of every result produced by _perform_sgs is written as
    one line, using the text returned by the segment's to_germline method.

    Arguments
    -----
    filename: The file to write to (overwritten if it exists)
    pop: The population handed to _perform_sgs
    seed_size, min_length, size_unit, min_density, maxmiss, onlywithin,
    njobs: Passed through unchanged to _perform_sgs

    Returns: Nothing
    '''
    sgs_options = dict(seed_size=seed_size,
                       min_length=min_length,
                       size_unit=size_unit,
                       min_density=min_density,
                       maxmiss=maxmiss,
                       onlywithin=onlywithin,
                       njobs=njobs)

    # The output file is opened before the analysis is started
    with open(filename, 'w') as handle:
        for pair_result in _perform_sgs(pop, **sgs_options):
            handle.writelines(shared.to_germline() + '\n'
                              for shared in pair_result.segments)
def write_sgs(data, filename):
    '''
    Writes the segments in an SGS analysis to a GERMLINE formatted file

    GERMLINE files are tab-delimited text files with the columns:

    0) Family ID 1
    1) Individual ID 1
    2) Family ID 2
    3) Individual ID 2
    4) Chromosome
    5) Segment start (bp/cM)
    6) Segment end (bp/cM)
    7) Segment start (SNP)
    8) Segment end (SNP)
    9) Total SNPs in segment
    10) Genetic length of segment
    11) Units for genetic length (cM or MB)
    12) Mismatching SNPs in segment
    13) 1 if Individual 1 is homozygous in match; 0 otherwise
    14) 1 if Individual 2 is homozygous in match; 0 otherwise

    Columns 12-14 are always written as the placeholder 'X', and the
    length is always reported in MB.

    Arguments
    -----
    data: An object whose segments attribute is an iterable of segments
    filename: The file to write to (overwritten if it exists)

    Returns: Nothing
    '''
    # Extra info GERMLINE gives you (like mismatch rate) that is not
    # available here
    placeholders = ['X', 'X', 'X']

    with open(filename, 'w') as out:
        for seg in data.segments:
            fields = list(seg.ind1.full_label) + list(seg.ind2.full_label)
            fields.append(seg.chromosome.label)
            fields += list(seg.physical_location)
            fields += list(seg.marker_labels)
            fields.append(seg.nmark)
            # Megabases, not basepairs
            fields.append(seg.physical_size / 1e6)
            fields.append('MB')
            fields += placeholders

            out.write('\t'.join(str(field) for field in fields))
            out.write('\n')
def read_kinship(filename):
    '''
    Reads a KinInbCoef formatted file of kinship and inbreeding coefficients

    Each non-blank line must hold exactly four whitespace-separated fields:
    family ID, first individual ID, second individual ID, and the
    coefficient. Blank lines (for example a trailing empty line) are
    skipped.

    Arguments
    ------
    filename: the filename to be read

    Returns: a dictionary in the format
        {frozenset({(fam, ind_a), (fam, ind_b)}): kinship/inbreeding}
        When both individual IDs on a line are the same, the key is a
        frozenset with a single (fam, ind) member.

    Raises: ValueError if a non-blank line does not have exactly four
        fields, or if the coefficient cannot be converted to a float
    '''
    kindict = {}
    with open(filename) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line: nothing to parse
                continue

            fam, ida, idb, phi = fields
            kindict[frozenset({(fam, ida), (fam, idb)})] = float(phi)

    return kindict
def read_beagle_genotypefile(filename, pop, missingcode='0'):
    '''
    Reads BEAGLE formatted genotype files

    The file is read in two passes. The first pass reads the individual
    ('I') record and any phenotype records, and stops at the first record
    that is neither. The second pass rewinds the file and reads every line
    starting with 'M' as a row of marker genotypes.

    Arguments
    ------
    filename: Filename of BEAGLE genotype file
    pop: the population the Individual objects are created with
    missingcode: The value that indicates a missing genotype

    Returns: Nothing

    Raises: FileFormatError if no individual ('I') record is found before
        the phenotype or marker records
    '''
    with open(filename) as f:
        inds = None

        for line in f:
            rec = BeagleGenotypeRecord(line)

            if rec.identifier == 'I':
                # Only every other entry is used as a label (presumably
                # because each individual occupies two columns, one per
                # allele)
                inds = [Individual(pop, label) for label in rec.data[::2]]
            elif rec.is_phenotype_record:
                if inds is None:
                    raise FileFormatError(
                        'Phenotype record found before individual record')

                for ind, pheno_status in zip(inds, rec.data[::2]):
                    if rec.identifier == 'A':
                        # Affection status: '2' means affected
                        pheno_status = pheno_status == '2'
                    else:
                        # Store numeric values as floats, and leave
                        # anything else as the original string
                        try:
                            pheno_status = float(pheno_status)
                        except ValueError:
                            pass
                    ind.phenotypes[rec.label] = pheno_status
            else:
                # We've reached the genotypes, and we're skipping out
                break

        if inds is None:
            raise FileFormatError('No individual record found in file')

        # Go back to the start and collect the genotype rows: one row per
        # marker, with the alleles grouped into pairs
        f.seek(0)
        gtrows = [list(grouper(BeagleGenotypeRecord(x).data, 2))
                  for x in f if x.startswith('M')]

        # Transpose from one row per marker to one row per individual
        genotypes = zip(*gtrows)

        for ind, sequentialalleles in zip(inds, genotypes):
            ind.genotypes = genotypes_from_sequential_alleles(
                ind.chromosomes,
                sequentialalleles,
                missingcode=missingcode)
def read_germline(filename):
    '''
    Reads a GERMLINE formatted SGS filename into an SGSAnalysis object

    GERMLINE files are text files with the format:

    0) Family ID 1
    1) Individual ID 1
    2) Family ID 2
    3) Individual ID 2
    4) Chromosome
    5) Segment start (bp/cM)
    6) Segment end (bp/cM)
    7) Segment start (SNP)
    8) Segment end (SNP)
    9) Total SNPs in segment
    10) Length of segment
    11) Units for genetic length (cM or MB)
    12) Mismatching SNPs in segment
    13) 1 if Individual 1 is homozygous in match; 0 otherwise
    14) 1 if Individual 2 is homozygous in match; 0 otherwise

    Each line is parsed by GermlineRecord. Only the pair of individuals,
    the chromosome and the segment location are used from each record.

    Arguments
    ------
    filename: the filename to be read

    Returns: an SGSAnalysis object holding one SGS object per pair of
        individuals, each containing the segments read for that pair
    '''
    analysis = SGSAnalysis()

    with open(filename) as f:
        for line in f:
            rec = GermlineRecord(line)

            if rec.pair not in analysis:
                analysis[rec.pair] = SGS(rec.ind1, rec.ind2)

            # NOTE: only physical (bp) locations are stored. When the
            # record's location is not in bp, the segment is created with
            # physical_location=None and the location is not kept.
            phys_loc = rec.location if rec.bp_locations else None

            # The two None arguments leave the segment's start and stop
            # unset
            seg = Segment(rec.ind1, rec.ind2, rec.chromosome, None, None,
                          physical_location=phys_loc)
            analysis[rec.pair].append(seg)

    return analysis
def read_vcf(filename, require_pass=False, freq_info=None, info_filters=None):
    '''
    Reads a VCF file and returns a Population object with the
    individuals represented in the file

    Arguments
    ------
    filename: The VCF file to be read
    require_pass: If True, records whose filter_passed attribute is false
        are skipped
    freq_info: The INFO field to read the marker frequency from. If the
        field holds several comma-separated values, the first is used.
        Markers without the field (or all markers, if freq_info is None)
        get a frequency of 0
    info_filters: An optional list of callables. Each is called with the
        VCFRecord, and the record is skipped unless all of them return a
        true value

    Returns: a Population object

    Raises:
        ValueError if anything in info_filters is not callable
        FileFormatError if the file has no header line giving the
        individual IDs
    '''
    if not info_filters:
        info_filters = []

    for info_filter in info_filters:
        if not callable(info_filter):
            raise ValueError('Filter not callable')

    with open(filename) as f:
        pop = Population()

        # Skip the meta-information ('##') lines and stop at the header
        # line, which names the individuals from the tenth column on
        inds = None
        for line in f:
            if line.startswith('##'):
                continue
            elif line.startswith('#'):
                ind_ids = line.strip().split()[9:]
                inds = [Individual(pop, ind_id) for ind_id in ind_ids]
                for ind in inds:
                    pop.register_individual(ind)
                break

        if inds is None:
            raise FileFormatError('No header line found in VCF file')

        # The same file iterator is reused, so this picks up at the first
        # line after the header
        chromobj = None
        last_chrom = None
        genotypes = []
        for line in f:
            record = VCFRecord(line)

            if info_filters and not all(info_filter(record)
                                        for info_filter in info_filters):
                continue

            if require_pass and not record.filter_passed:
                continue

            if record.chrom != last_chrom:
                # A new chromosome has started: close out the previous one
                if last_chrom is not None:
                    chromobj.finalize()
                    pop.add_chromosome(chromobj)
                chromobj = ChromosomeTemplate(label=record.chrom)

            if freq_info is not None and freq_info in record.info:
                freq = record.info[freq_info]
                if ',' in freq:
                    freq = freq.split(',')[0]
                freq = float(freq)
            else:
                freq = 0

            genotypes.append(record.genotypes())
            chromobj.add_genotype(bp=record.pos,
                                  label=record.label,
                                  frequency=freq)
            last_chrom = record.chrom

        # chromobj is still None if no records were kept, in which case
        # there is no chromosome to add
        if chromobj is not None:
            chromobj.finalize()
            pop.add_chromosome(chromobj)

    for ind in inds:
        # Initialize new genotypes
        ind._init_genotypes(sparse=True)

    # Now actually sift through markers and assign them to individuals.
    # The rows in genotypes are in file order, so they are paired with
    # (chromosome index, marker index) positions generated in the same
    # order.
    final_indices = []
    for chromidx, chromobj in enumerate(pop.chromosomes):
        for markidx in range(chromobj.nmark()):
            final_indices.append((chromidx, markidx))

    for raw, (chromidx, markidx) in zip(range(len(genotypes)), final_indices):
        assign_genorow(genotypes[raw], inds, chromidx, markidx)

        # Kill the row so we don't end up with the whole dataset in memory
        # twice
        genotypes[raw] = None

    return pop