def read_map(mapfile):
    """
    Reads a PLINK map file into a list of ChromosomeTemplate objects

    Every non-blank line must contain four whitespace separated fields:
    chromosome, marker label, genetic position (cM) and physical position
    (bp). Blank lines are skipped. Consecutive lines sharing a chromosome
    label are collected into one ChromosomeTemplate.

    Arguments
    ------
    mapfile: The file to be read

    Returns: a list of ChromosomeTemplate objects, in file order. The list
        is empty when the file contains no markers.

    Raises: FileFormatError if a physical position is negative or if the
        positions within a chromosome are not in ascending order
    """
    chroms = []
    chromosome = None
    last_chr, last_pos = None, 0

    with open(mapfile) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) lines carry no marker
                continue

            chrom, label, cm, pos = fields
            cm, pos = float(cm), int(pos)

            if pos < 0:
                raise FileFormatError("Invalid position: {}".format(pos))

            if chrom != last_chr:
                # We've moved on to a new chromosome, or we've just started.
                # If there is a chromosome in progress, close it up first.
                if chromosome is not None:
                    chromosome.finalize()
                    chroms.append(chromosome)
                # Make the next chromosome
                chromosome = ChromosomeTemplate(label=chrom)
            elif pos < last_pos:
                raise FileFormatError("Map file not sorted")

            chromosome.add_genotype(None, cm, label=label, bp=pos)
            last_chr, last_pos = chrom, pos

    # Close up the final chromosome; there is none if the file had no markers
    if chromosome is not None:
        chromosome.finalize()
        chroms.append(chromosome)

    return chroms
def read_gs_chromosome_template(templatef):
    """
    Reads a genomeSIMLA format chromosome template file

    The first line of the file is used as the chromosome label and the
    second line is read and discarded. Every following line that is not a
    bare newline must hold five whitespace separated fields: marker label,
    major allele frequency, minor allele frequency, a map distance and a
    base pair position.

    Arguments
    ------
    templatef: The filename of the template file

    Returns: A ChromosomeTemplate object corresponding to the file
    """
    with open(templatef) as handle:
        chrom_label = handle.readline().strip()
        # Second header line (the number of markers) is not needed
        handle.readline()

        template = pydigree.ChromosomeTemplate(label=chrom_label)

        # The file gives a distance per marker rather than an absolute map
        # position, so the distances are accumulated into a running total
        # and that total is what gets stored for each marker.
        cumulative_cm = 0
        for record in handle:
            if record == '\n':
                continue

            marker, _major, minor, step_cm, position = record.strip().split()
            position = int(position)
            cumulative_cm += float(step_cm)
            template.add_genotype(float(minor), cumulative_cm,
                                  label=marker, bp=position)

    return template
def write_map(pedigrees, mapfile, output_chromosomes=None):
    """
    Writes the genotype location data to a PLINK MAP file

    One tab separated line is written per marker: chromosome label, marker
    label, genetic position and physical position. Markers without a label
    are named "SNP<chromosome>-<index>". Markers without a physical
    position get one estimated from the genetic position.

    Arguments
    ------
    pedigrees: the population containing the data to be written
    mapfile: the name of the file to be output to
    output_chromosomes: if not None, only chromosomes whose outputlabel is
        in this container are written

    Returns: Nothing
    """
    with open(mapfile, "w") as f:
        for chromosome in pedigrees.chromosomes:
            chromlabel = chromosome.outputlabel

            # Check if we're only supposed to be outputting certain chromosomes
            if (output_chromosomes is not None and
                    chromlabel not in output_chromosomes):
                continue

            for mi, marker in enumerate(chromosome._iinfo()):
                label, cm, mb, _frequency = marker

                if not mb:
                    # No physical position: approximate it with the usual
                    # 1 cM ~ 1 Mb (1e6 bp) rule of thumb. The previous
                    # constant, 10e6, is 1e7 and was ten times too large.
                    mb = int(cm * 1e6)
                if not label:
                    label = "SNP%s-%s" % (chromlabel, mi)

                fields = [chromlabel, label, cm, mb]
                f.write("\t".join(str(x) for x in fields) + "\n")
def read_phenotypes(pedigrees, csvfile, delimiter=',', missingcode='X'):
    """
    Reads a csv with header famid,id,phen,phen,phen,phen etc etc

    The columns named 'famid' and 'id' identify the individual; every other
    column is stored in that individual's phenotypes under the column name.
    Values are converted to float where possible. Empty values and values
    equal to missingcode are stored as None. Anything else that is not a
    number is stored as the original string. Blank lines are skipped.

    Arguments
    ------
    pedigrees: An object of class PedigreeCollection
    csvfile: the filename of the file containing phenotypes.
    delimiter: the field delimiter for the file
    missingcode: the code for missing values

    Returns: Nothing
    """
    id_columns = ('famid', 'id')

    def parse(value):
        # The missing code is tested before the numeric conversion so that
        # numeric codes such as '-9' are also recognised as missing
        if not value or value == missingcode:
            return None
        try:
            return float(value)
        except ValueError:
            return value

    with open(csvfile) as f:
        header = f.readline().strip().split(delimiter)
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Match columns to their column name
            record = dict(zip(header, line.split(delimiter)))
            phenotypes = {k: v for k, v in record.items()
                          if k not in id_columns}
            if not phenotypes:
                continue

            fam, ind = record['famid'], record['id']
            target = pedigrees[fam][ind].phenotypes
            for name, value in phenotypes.items():
                target[name] = parse(value)
def write_pedigree(pedigrees, filename, missingcode='X', delim=' '):
    """
    Writes pedigree to a LINKAGE formatted pedigree file

    One line is written per individual with six fields: family label,
    individual label, father label, mother label, sex and phenotype.
    Founders get '0' for both parents. The phenotype column is always
    written as '-9'. Individuals are ordered by population label, then
    depth, then label.

    Arguments
    ------
    pedigrees: the object whose individuals should be written
    filename: the name of the file to be output to
    missingcode: currently unused; kept so existing callers keep working
    delim: the field separator

    Returns: Nothing
    """
    # Sex is held as 0 (male) / 1 (female), matching what read_ped stores
    # and what write_ped writes. LINKAGE codes are 1 = male, 2 = female
    # and 0 = unknown.
    sex_codes = {0: '1', 1: '2'}

    def sorting_key(ind):
        return (ind.population.label, ind.depth, ind.label)

    with open(filename, 'w') as f:
        for ind in sorted(pedigrees.individuals, key=sorting_key):
            if ind.is_founder():
                father, mother = '0', '0'
            else:
                father, mother = ind.father.label, ind.mother.label

            oline = [ind.population.label,
                     ind.label,
                     father,
                     mother,
                     sex_codes.get(ind.sex, '0'),
                     '-9']
            # Labels are not guaranteed to be strings, so convert before joining
            f.write(delim.join(str(x) for x in oline) + '\n')
def write_phenotypes(pedigrees, filename, predicate=None, missingcode='X',
                     delim=','):
    """
    Writes phenotypes to a CSV (or other delimited) file

    The header is famid, id and then every phenotype name found on the
    selected individuals, in sorted order. A phenotype that an individual
    does not have, or has stored as None, is written as missingcode.

    Arguments
    ------
    pedigrees: the object whose individuals should be written
    filename: the name of the file to be output to
    predicate: an optional function applied to each individual; only
        individuals for which it evaluates as True are written. Values
        that are not callable are ignored.
    missingcode: the code written for missing values
    delim: the field separator

    Returns: Nothing
    """
    inds = list(pedigrees.individuals)
    if callable(predicate):
        inds = [x for x in inds if predicate(x)]

    # set().union(*...) also copes with an empty selection, for which a
    # reduce() without an initial value raises TypeError
    available_phenotypes = sorted(
        set().union(*(ind.phenotypes.keys() for ind in inds)))

    header = ['famid', 'id'] + available_phenotypes

    with open(filename, 'w') as ofile:
        ofile.write(delim.join(str(x) for x in header) + '\n')
        for ind in inds:
            row = [ind.population.label, ind.label]
            for phenotype in available_phenotypes:
                value = ind.phenotypes.get(phenotype)
                # A stored None means missing, so it gets the missing code
                # rather than the literal text "None"
                row.append(missingcode if value is None else value)
            ofile.write(delim.join(str(x) for x in row) + '\n')
def write_ped(pedigrees, pedfile, delim=" ", predicate=None,
              output_chromosomes=None):
    """
    write_ped writes data in a plink-format PED file.

    Each output line holds the six identifier columns (family, individual,
    father, mother, sex, affection status) followed by the individual's
    genotypes, with the alleles of the two chromatids interleaved.

    Arguments
    ------
    pedigrees: An object of class PedigreeCollection containing what you
        want to output
    pedfile: a string giving the name out the file to output to.
    delim: Field seperator
    predicate: Which inputs to include in the output file. If not specified
        all are output. If the string is 'affected', only affected
        individuals are output. If the string is 'phenotyped', all
        individuals with phenotype information are output. Any other value
        of predicate must be a function to perform on the individual that
        evaluates to True/False for whether the individual should be output.
    output_chromosomes: if not None, only genotypes on chromosomes whose
        outputlabel is in this container are written

    Returns: Nothing

    Raises: ValueError if predicate is neither a recognised string nor
        callable
    """
    if not predicate:
        predicate = lambda x: True
    elif predicate == "affected":
        predicate = lambda x: x.phenotypes.get("affected") == 1
    elif predicate == "phenotyped":
        predicate = lambda x: x.phenotypes.get("affected") in (0, 1)
    elif not callable(predicate):
        # The builtin callable() is used because collections.Callable no
        # longer exists on Python 3.10+
        raise ValueError("Not a valid predicate!")

    # Internal codes -> PED codes. Anything unrecognised is written as
    # unknown: '-9' for affection status and 0 for sex.
    afflab = {1: "2", 0: "1", None: "-9"}
    sexlab = {0: 1, 1: 2}

    with open(pedfile, "w") as f:
        for pedigree in pedigrees.pedigrees:
            for ind in pedigree.individuals:
                if not predicate(ind):
                    continue

                # Get the phenotype code
                aff = afflab.get(ind.phenotypes.get("affected"), "-9")

                # Prepare the 6-column identifier
                outline = [
                    pedigree.label,
                    ind.label,
                    ind.father.label if ind.father is not None else "0",
                    ind.mother.label if ind.mother is not None else "0",
                    sexlab.get(ind.sex, 0),
                    aff,
                ]
                # Make strings
                outline = [str(x) for x in outline]

                # Get the genotypes in the format we need them
                for template, chromatids in zip(ind.chromosomes,
                                                ind.genotypes):
                    # Check if we're only supposed to be outputting
                    # certain chromosomes
                    if (output_chromosomes is not None and
                            template.outputlabel not in output_chromosomes):
                        continue
                    chroma, chromb = chromatids
                    ga = chroma.astype(str).tolist()
                    gb = chromb.astype(str).tolist()
                    outline.extend(interleave(ga, gb))

                # Write it out
                f.write(delim.join(outline))
                f.write("\n")
def read_ped(filename, population=None, delimiter=None, affected_labels=None,
             population_handler=None, data_handler=None, connect_inds=True,
             onlyinds=None):
    """
    Reads a plink format pedigree file, ie:
        familyid indid father mother sex whatever whatever whatever
    into a pydigree pedigree object, with optional population to assign to
    pedigree members. If you don't provide a population you can't simulate
    genotypes!

    Blank lines are skipped. The sixth column (affection status) is
    optional; anything after it is handed to data_handler.

    Arguments
    -----
    filename: The file to be read
    population: The population to assign individuals to. A new Population
        is created when this is None.
    delimiter: a string defining the field separator, default: any
        whitespace
    affected_labels: The labels that determine affection status.
    population_handler: a function to set up the population
    data_handler: a function to turn the data into useful individual
        information
    connect_inds: build references between individuals. Requires all
        individuals be present in the file
    onlyinds: a list of individuals to be processed, allows skipping parts
        of a file. Entries are compared against (familyid, indid) tuples.

    Returns: An object of class PedigreeCollection

    Raises: FileFormatError if a non-blank line has fewer than 5 fields
    """
    sex_codes = {'1': 0, '2': 1, 'M': 0, 'F': 1, '0': None, '-9': None}
    if not affected_labels:
        affected_labels = {'1': 0, '2': 1,
                           'A': 1, 'U': 0,
                           'X': None,
                           '-9': None}

    # Tries to get a phenotype and returns unknown on failure
    def getph(ph):
        try:
            return affected_labels[ph]
        except KeyError:
            return None

    # Only make a fresh population when the caller did not supply one
    if population is None:
        population = Population()

    # Everyone is first collected in one temporary pedigree, keyed by
    # (family, id), and split into per-family pedigrees afterwards
    p = Pedigree()
    if callable(population_handler):
        population_handler(p)

    pc = PedigreeCollection()

    with open(filename) as f:
        # Parse the lines in the file
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            split = line.split(delimiter)
            if len(split) < 5:
                # Without this check a short line would silently reuse the
                # values parsed from the previous line
                raise FileFormatError(
                    "Line {}: expected at least 5 fields, found {}".format(
                        lineno, len(split)))

            fam, ind_id, fa, mo, sex = split[0:5]
            aff = split[5] if len(split) > 5 else None

            # Give a special id for now, to prevent overwriting duplicated
            # ids between families
            key = (fam, ind_id)

            if onlyinds and (key not in onlyinds):
                continue

            p[key] = Individual(population, key, fa, mo, sex)
            ind = p[key]
            ind.phenotypes['affected'] = getph(aff)
            ind.pedigree = p
            ind.sex = sex_codes[ind.sex]

            if callable(data_handler) and len(split) > 6:
                data_handler(ind, split[6:])

    # Fix the individual-level data
    if connect_inds:
        for ind in p.individuals:
            fam, _ = ind.label

            # Actually make the references instead of just pointing at strings
            ind.father = p[(fam, ind.father)] if ind.father != '0' else None
            ind.mother = p[(fam, ind.mother)] if ind.mother != '0' else None

            ind.register_with_parents()

    # Place individuals into pedigrees
    pedigrees = {}
    for ind in p.individuals:
        pedigrees.setdefault(ind.label[0], []).append(ind)

    for pedigree_label, ped_inds in pedigrees.items():
        ped = Pedigree(label=pedigree_label)

        if callable(population_handler):
            population_handler(ped)

        for ind in ped_inds:
            # Drop the temporary (family, id) label back to the plain id
            ind.label = ind.label[1]
            ped[ind.label] = ind
            ind.population = ped
            ind.pedigree = ped
        pc[pedigree_label] = ped

    return pc