def main(options):
    """
    Convert a haplotype file to eigenstrat format (ind, snp and geno files).

    options is a dict with the keys "hap", "donor" and "recipient" (inputs,
    opened with gdc.open2) and "out" (prefix of the .snp, .ind and .geno
    files that are written).

    Each hap line is split on whitespace into chromosome, position and one
    allele per haplotype. Consecutive pairs of haplotypes make up one
    individual, so a line must carry exactly 2 * total_individuals alleles,
    where total_individuals is the value returned by write_ind_file.

    Sites with more than two distinct alleles are skipped and counted in
    removed["multiallelic"].

    Raises Exception if a site has the wrong number of genotypes.
    """
    hap = gdc.open2(options["hap"])
    don = gdc.open2(options["donor"])
    rec = gdc.open2(options["recipient"])
    snp, ind, geno = [
        open(options["out"] + x, "w") for x in [".snp", ".ind", ".geno"]
    ]

    try:
        total_individuals = write_ind_file(rec, don, ind)
        removed = {"multiallelic": 0}

        for line in hap:
            # split() already discards the trailing newline; slicing off the
            # last character first would eat a real character when the final
            # line of the file is not newline-terminated.
            bits = line.split()
            chrom, pos = bits[0:2]
            gts = bits[2:]

            # Sorted so that the choice of "first" allele is reproducible
            # instead of depending on set iteration order.
            alleles = sorted(set(gts))
            if len(alleles) > 2:
                removed["multiallelic"] += 1
                continue

            # Setting first allele to 0 - should check this is ok.
            if len(gts) != 2 * total_individuals:
                raise Exception(
                    "N.genotypes!=N.individuals T %s %s" % (chrom, pos))

            # NOTE(review): a monomorphic site has no alleles[1], so the next
            # statement raises IndexError for it - confirm whether such sites
            # can occur in the input.
            snp.write("\t".join(
                [chrom + ":" + pos, chrom, "0.0", pos,
                 alleles[0], alleles[1]]) + "\n")

            # One character per individual: the number of copies of the
            # first allele among that individual's two haplotypes.
            geno.write("".join(
                str(sum(g == alleles[0] for g in gts[(i * 2):(i * 2 + 2)]))
                for i in range(total_individuals)) + "\n")
    finally:
        for handle in [hap, don, rec, snp, ind, geno]:
            handle.close()
def main(options):
    """
    Convert a haplotype file to eigenstrat format (ind, snp and geno files).

    options is a dict with the keys "hap", "donor" and "recipient" (inputs,
    opened with gdc.open2) and "out" (prefix of the .snp, .ind and .geno
    files that are written).

    Each hap line is whitespace separated: chromosome, position, then one
    allele per haplotype. Two consecutive haplotypes belong to one
    individual, so 2 * total_individuals alleles are required per line;
    total_individuals is whatever write_ind_file returns.

    Sites with more than two distinct alleles are skipped and tallied in
    removed["multiallelic"].

    Raises Exception if a site has the wrong number of genotypes.
    """
    hap = gdc.open2(options["hap"])
    don = gdc.open2(options["donor"])
    rec = gdc.open2(options["recipient"])
    snp, ind, geno = [
        open(options["out"] + x, "w") for x in [".snp", ".ind", ".geno"]
    ]

    try:
        total_individuals = write_ind_file(rec, don, ind)
        removed = {"multiallelic": 0}

        for line in hap:
            # split() drops the newline itself; line[:-1] would truncate the
            # last genotype of a final line that lacks a newline.
            bits = line.split()
            chrom, pos = bits[0:2]
            gts = bits[2:]

            # Sort so the allele coded as "first" does not depend on set
            # iteration order.
            alleles = sorted(set(gts))
            if len(alleles) > 2:
                # Skip the site (a bare `next` here did nothing).
                removed["multiallelic"] += 1
                continue

            # Setting first allele to 0 - should check this is ok.
            if len(gts) != 2 * total_individuals:
                raise Exception(
                    "N.genotypes!=N.individuals T %s %s" % (chrom, pos))

            # NOTE(review): with a single distinct allele alleles[1] does not
            # exist and this raises IndexError - confirm monomorphic sites
            # cannot reach this point.
            snp_fields = [chrom + ":" + pos, chrom, "0.0", pos,
                          alleles[0], alleles[1]]
            snp.write("\t".join(snp_fields) + "\n")

            # Per individual, count copies of the first allele (0, 1 or 2).
            codes = []
            for i in range(total_individuals):
                pair = gts[(i * 2):(i * 2 + 2)]
                codes.append(str(sum(g == alleles[0] for g in pair)))
            geno.write("".join(codes) + "\n")
    finally:
        for handle in [hap, don, rec, snp, ind, geno]:
            handle.close()
def read_ms(ms_file, options):
    """
    Read a ms file and return positions and haplotypes

    ms_file is opened with gdc.open2. options is a dict:
      "macs"   - if true, the number of haplotypes and the sequence length
                 are taken from the 2nd and 3rd fields of the first line.
      "length" - sequence length to use when "macs" is false.

    Returns (length, pos, haps). pos is an integer array holding the
    fractional positions of the "positions:" line scaled by length; haps is
    an integer array with one row per position and one column per haplotype.

    Raises Exception if the "positions:" line is missing or if the number of
    positions or haplotypes is inconsistent.
    """
    ms = gdc.open2(ms_file)
    try:
        nhap = None
        # The first line is always consumed, whether or not it is parsed.
        line = next(ms)
        if options["macs"]:
            nhap, length = [int(x) for x in line.split()[1:3]]
        else:
            # For example, scrm with -SC abs
            length = options["length"]

        npos = None
        pos = None
        for line in ms:
            if line.startswith("segsites:"):
                npos = int(line.split()[1])
            elif line.startswith("positions:"):
                pos = np.array(
                    [int(length * float(p)) for p in line.split()[1:]])
                if len(pos) != npos:
                    raise Exception(
                        "Number of positions does not match segsites")
                break

        if pos is None:
            raise Exception("No positions line found")

        # Assume the rest of the file is the haplotypes; delimiter=1 reads
        # fixed-width fields of one character each.
        haps = np.genfromtxt(ms, dtype=int, delimiter=1)
    finally:
        ms.close()

    haps = np.transpose(haps)
    if haps.shape[0] != npos:
        raise Exception("Number of positions doesn't match")
    if nhap and haps.shape[1] != nhap:
        raise Exception("Number of haplotypes doesn't match")

    return length, pos, haps
def load_from_arp(arp):
    """
    Parse an arp file into an internal format, returning site data, which is
    a list of lists, with the sites on each chromosome, and gt data which is
    a dict with information on the samples and genotypes.

    I'm really just guessing what the structure of the file is.

    arp is opened with gdc.open2.

    Returns (site_data, gt_data). site_data[chrom - 1] is the list of integer
    positions for that chromosome. gt_data maps each sample name to
    {"size": number of genotype lines read, "gt": integer array with one row
    per site and one column per genotype line, entries 0 or 1}.

    Raises Exception if a genotype string has the wrong length, or if the
    chromosome or site totals in the header do not match the data read.
    """
    arp_file = gdc.open2(arp)
    N_chr = -1
    N_sites = -1
    site_data = []
    gt_data = {}
    ascertained = False

    try:
        for line in arp_file:
            # These are independent `if`s on purpose: some branches replace
            # `line` with the following line of the file, and the later
            # tests then apply to that new line.
            if line.startswith("#Number of independent chromosomes"):
                N_chr = int(line.split()[-1])

            if (line.startswith("#Total number of polymorphic sites")
                    and not ascertained):
                N_sites = int(line.split()[-1])

            if line.startswith("#ASCERTAINED DATA"):
                # From here on only the ascertained counts/positions apply.
                ascertained = True
                N_sites = -1

            if line.startswith(
                    "#Number of polym. sites meeting ascertainment criterion:"):
                N_sites = int(line.split()[-1])

            if ("polymorphic positions on chromosome" in line
                    and not ascertained):
                chrom = int(line.split()[-1])
                # The positions themselves are on the next line; its first
                # character is skipped and commas are stripped.
                line = next(arp_file)
                site_data.append(None)
                site_data[chrom - 1] = [
                    int(x.replace(",", "")) for x in line[1:].split()]

            if (line.startswith(
                    "#Ascertained polymorphic positions on chromosome")
                    and ascertained):
                chrom = int(line.split()[-1])
                line = next(arp_file)
                site_data.append(None)
                site_data[chrom - 1] = [
                    int(x.replace(",", "")) for x in line[1:].split()]

            if "SampleName=" in line:
                sname = line.split("\"")[-2]
                line = next(arp_file)
                ssize = int(line.split("=")[-1])
                gt = np.zeros((N_sites, ssize), dtype='int')
                next(arp_file)  # "SampleData= {" line
                for i in range(ssize):
                    line = next(arp_file)
                    gt_string = line.split()[2]
                    if len(gt_string) != N_sites:
                        raise Exception("Wrong number of sites")
                    # converting {1,2,3}->1
                    gt[:, i] = [int(int(x) > 0) for x in gt_string]

                gt_data[sname] = {"size": ssize, "gt": gt}
    finally:
        arp_file.close()

    if not N_chr == len(site_data):
        raise Exception("Number of chromosomes does not match site data")
    if not N_sites == sum([len(x) for x in site_data]):
        raise Exception("Total number of sites does not match")

    return site_data, gt_data
def main(options):
    """
    Convert vcf to eigenstrat format (ind, snp and geno files)

    options is a dict with the keys:
      "vcf"      - input VCF, opened with gdc.open2
      "out"      - prefix of the .snp, .ind and .geno files written
      "ref"      - if set, an extra individual with this name (population
                   REF) is written first and given genotype "2" at every site
      "indAsPop" - if true, each individual's name is also used as its
                   population label, otherwise the label is POP

    Records with more than one ALT allele (a comma in the ALT field) are
    excluded as multiallelic, and records whose REF or ALT is not a single
    character are excluded as indels. A summary is printed at the end.
    """
    vcf = gdc.open2(options["vcf"])
    snp, ind, geno = [
        open(options["out"] + x, "w") for x in [".snp", ".ind", ".geno"]
    ]
    removed = {"multiallelic": 0, "indel": 0}
    count = 0

    try:
        for line in vcf:
            if line[:2] == "##":  # Comment line
                continue

            if line[:6] == "#CHROM":  # Header line
                inds = line.split()[9:]
                if options["ref"]:
                    ind.write(options["ref"] + "\tU\tREF\n")
                for indi in inds:
                    pop = indi if options["indAsPop"] else "POP"
                    ind.write(indi + "\tU\t" + pop + "\n")
                continue

            # data
            bits = line.split()
            if "," in bits[4]:
                # Several ALT alleles listed for this record.
                removed["multiallelic"] += 1
                continue
            if len(bits[3]) != 1 or len(bits[4]) != 1:
                removed["indel"] += 1
                continue

            if bits[2] == ".":
                # No ID in the VCF: build one from chromosome and position.
                bits[2] = bits[0] + ":" + bits[1]
            snp.write(" ".join(
                [bits[2], bits[0], "0.0", bits[1], bits[3], bits[4]]) + "\n")

            prefix = "2" if options["ref"] else ""
            geno.write(
                prefix
                + "".join(decode_gt_string(gt) for gt in bits[9:])
                + "\n")
            count += 1
    finally:
        # Actually call close() so the output is flushed to disk.
        for handle in [vcf, ind, snp, geno]:
            handle.close()

    # Single-argument print calls behave the same on Python 2 and 3.
    print("Done. Wrote " + str(count) + " sites")
    print("Excluded " + str(sum(removed.values())) + " sites")
    for key in removed:
        print("Excluded " + str(removed[key]) + " " + key)
    return
def read_ms(ms_file, options):
    """
    Read a ms file and return positions and haplotypes

    ms_file is opened with gdc.open2. options is a dict:
      "macs"   - if true, the number of haplotypes and the sequence length
                 are read from the 2nd and 3rd fields of the first line.
      "length" - sequence length to use when "macs" is false.

    Returns (length, pos, haps). pos holds the fractional positions from the
    "positions:" line multiplied by length and truncated to int; haps is an
    integer array with one row per position and one column per haplotype.

    Raises Exception if there is no "positions:" line, or if the number of
    positions or haplotypes is inconsistent.
    """
    ms = gdc.open2(ms_file)
    try:
        nhap = None
        # The first line is consumed in both modes.
        first_line = next(ms)
        if options["macs"]:
            nhap, length = [int(x) for x in first_line.split()[1:3]]
        else:
            # For example, scrm with -SC abs
            length = options["length"]

        npos = None
        pos = None
        for line in ms:
            if line.startswith("segsites:"):
                npos = int(line.split()[1])
            elif line.startswith("positions:"):
                pos = np.array(
                    [int(length * float(p)) for p in line.split()[1:]])
                if len(pos) != npos:
                    raise Exception(
                        "Number of positions does not match segsites")
                break

        if pos is None:
            raise Exception("No positions line found")

        # Assume the rest of the file is the haplotypes; delimiter=1 means
        # every character is its own fixed-width column.
        haps = np.genfromtxt(ms, dtype=int, delimiter=1)
    finally:
        ms.close()

    haps = np.transpose(haps)
    if haps.shape[0] != npos:
        raise Exception("Number of positions doesn't match")
    if nhap and haps.shape[1] != nhap:
        raise Exception("Number of haplotypes doesn't match")

    return length, pos, haps
def read_macs(macs_file):
    """
    Read a macs file and return positions and haplotypes

    macs_file is opened with gdc.open2. The number of haplotypes and the
    sequence length are read from the 2nd and 3rd fields of the first line.

    Returns (length, pos, haps). pos holds the fractional positions from the
    "positions:" line multiplied by length and truncated to int; haps is an
    integer array of shape (number of positions, number of haplotypes).

    Raises Exception if there is no "positions:" line, if its length differs
    from the "segsites:" count, or if the genotype matrix has the wrong shape.
    """
    macs = gdc.open2(macs_file)
    try:
        line = next(macs)
        nhap, length = [int(x) for x in line.split()[1:3]]

        npos = None
        pos = None
        for line in macs:
            if line.startswith("segsites:"):
                npos = int(line.split()[1])
            elif line.startswith("positions:"):
                pos = np.array(
                    [int(length * float(p)) for p in line.split()[1:]])
                if len(pos) != npos:
                    raise Exception(
                        "Number of positions does not match segsites")
                break

        if pos is None:
            raise Exception("No positions line found")

        # Assume the rest of the file is the haplotypes; delimiter=1 reads
        # one character per column.
        haps = np.genfromtxt(macs, dtype=int, delimiter=1)
    finally:
        macs.close()

    haps = np.transpose(haps)
    if haps.shape != (npos, nhap):
        raise Exception("Genotype matrix shape doesn't match")

    return length, pos, haps
def main(argv):
    """
    Tabulate per-site alternate allele counts and frequencies from a VCF.

    argv is the argument list handed to getopt.getopt:
      -o / --output   path of the csv file to write
      -v / --vcf      path of the VCF, opened with gdc.open2
      -d / --diploid  samples diploid? '1', 'T', 't', 'True' or 'true' mean
                      True, any other value means False

    Exits with status 2 on an unrecognised option or when one of the three
    options is missing.

    Leading lines whose second character is '#' are skipped, as is the first
    line after them (the column header). Every later non-blank line is
    treated as a data record. Sample columns start at index 9; samples whose
    field starts with '.' are ignored and the rest are accumulated with
    tally(). Sites with no counted alleles are not written.

    Output columns: ID (chr_pos), chr, pos, Alt, N, Fq.
    """
    try:
        opts, args = getopt.getopt(
            argv, "o:v:d:", ["output=", "vcf=", "diploid="])
    except getopt.GetoptError:
        sys.exit(2)

    output_filename = None
    vcf_filename = None
    d_flag = None
    for opt, arg in opts:
        if opt in ("-o", "--output"):
            output_filename = arg
        elif opt in ("-v", "--vcf"):
            vcf_filename = arg
        elif opt in ("-d", "--diploid"):
            # samples diploid? (True/False)
            d_flag = arg in ('1', 'T', 't', 'True', 'true')

    if output_filename is None or vcf_filename is None or d_flag is None:
        sys.stderr.write("the -o, -v and -d options are all required\n")
        sys.exit(2)

    output_head = ['ID', 'chr', 'pos', 'Alt', 'N', 'Fq']

    # One pass over the VCF instead of re-opening it to count header and
    # data lines first.
    vcf_file = gdc.open2(vcf_filename)
    try:
        with open(output_filename, 'w') as mycsv:
            datawriter = csv.writer(mycsv)
            datawriter.writerow(output_head)

            in_header = True
            for vcfline in vcf_file:
                line = vcfline.decode()
                if in_header:
                    # The first line whose second character is not '#' is
                    # the column header: it ends the header and is skipped.
                    if line[1] != '#':
                        in_header = False
                    continue

                fields = line.split()
                if not fields:
                    # Tolerate blank lines, e.g. at the end of the file.
                    continue

                chrom, pos = fields[0], fields[1]
                chrpos = chrom + '_' + pos
                alt_count = 0
                total = 0
                for sample in fields[9:]:
                    if sample[0] == '.':
                        continue
                    alt_count, total = tally(alt_count, sample, d_flag, total)

                if total == 0:
                    continue
                # float() keeps this a true division on Python 2 as well.
                alt_fq = float(alt_count) / total
                datawriter.writerow(
                    [chrpos, chrom, pos, alt_count, total, alt_fq])
    finally:
        vcf_file.close()
def load_from_arp(arp):
    """
    Parse an arp file into an internal format, returning site data, which is
    a list of lists, with the sites on each chromosome, and gt data which is
    a dict with information on the samples and genotypes.

    I'm really just guessing what the structure of the file is.

    arp is opened with gdc.open2.

    Returns (site_data, gt_data). site_data[chrom - 1] is the list of integer
    positions for that chromosome. gt_data maps each sample name to
    {"size": number of genotype lines read, "gt": integer array with one row
    per site and one column per genotype line, entries 0 or 1}.

    Raises Exception if a genotype string has the wrong length, or if the
    chromosome or site totals in the header do not match the data read.
    """
    arp_file = gdc.open2(arp)
    N_chr = -1
    N_sites = -1
    site_data = []
    gt_data = {}
    ascertained = False

    try:
        for line in arp_file:
            # Deliberately a sequence of separate `if`s: a branch may advance
            # to the next line of the file, and the remaining tests are then
            # made against that new line.
            if line.startswith("#Number of independent chromosomes"):
                N_chr = int(line.split()[-1])

            if (line.startswith("#Total number of polymorphic sites")
                    and not ascertained):
                N_sites = int(line.split()[-1])

            if line.startswith("#ASCERTAINED DATA"):
                # Discard the unascertained total; the ascertained count
                # is read further down.
                ascertained = True
                N_sites = -1

            if line.startswith(
                    "#Number of polym. sites meeting ascertainment criterion:"):
                N_sites = int(line.split()[-1])

            if ("polymorphic positions on chromosome" in line
                    and not ascertained):
                chrom = int(line.split()[-1])
                # Positions are listed on the following line; skip its first
                # character and strip the commas.
                line = next(arp_file)
                site_data.append(None)
                site_data[chrom - 1] = [
                    int(x.replace(",", "")) for x in line[1:].split()]

            if (line.startswith(
                    "#Ascertained polymorphic positions on chromosome")
                    and ascertained):
                chrom = int(line.split()[-1])
                line = next(arp_file)
                site_data.append(None)
                site_data[chrom - 1] = [
                    int(x.replace(",", "")) for x in line[1:].split()]

            if "SampleName=" in line:
                sname = line.split("\"")[-2]
                line = next(arp_file)
                ssize = int(line.split("=")[-1])
                gt = np.zeros((N_sites, ssize), dtype='int')
                next(arp_file)  # "SampleData= {" line
                for i in range(ssize):
                    line = next(arp_file)
                    gt_string = line.split()[2]
                    if len(gt_string) != N_sites:
                        raise Exception("Wrong number of sites")
                    # converting {1,2,3}->1
                    gt[:, i] = [int(int(x) > 0) for x in gt_string]

                gt_data[sname] = {"size": ssize, "gt": gt}
    finally:
        arp_file.close()

    if not N_chr == len(site_data):
        raise Exception("Number of chromosomes does not match site data")
    if not N_sites == sum([len(x) for x in site_data]):
        raise Exception("Total number of sites does not match")

    return site_data, gt_data
def output_hetfa(options):
    """
    output a single hetfa

    options is a dict with the keys:
      "ref"      - reference fasta, opened with Fasta
      "mask"     - optional mask fasta, opened with Fasta when set
      "out"      - output prefix; <out>.hetfa.fa.gz is written, or stdout
                   is used when this is empty
      "chrom"    - chromosome name, used for the header and reference lookup
      "vcf"      - input VCF, opened with gdc.open2
      "sample"   - sample column to read, looked up in the "#" header line
      "refcheck" - if true, compare each REF allele with the reference fasta

    For every VCF position (repeats of the previous position are skipped)
    the sequence returned by get_ref_seq is written, followed by one
    character for the site: REF or ALT for a homozygous call, the HETFA_MAP
    code for a heterozygous call, or "N" when the site is not a usable
    biallelic call or check_mask reports it as masked. The reference
    sequence after the last position is appended at the end.

    Raises Exception on a reference mismatch when "refcheck" is set.
    """
    ref_fa = Fasta(options["ref"])
    mask = Fasta(options["mask"]) if options["mask"] else None

    if options["out"]:
        out = gzip.open(options["out"] + ".hetfa.fa.gz", "w")
    else:
        out = sys.stdout

    vcf = None
    try:
        out.write(">" + options["chrom"] + "\n")

        vcf = gdc.open2(options["vcf"])
        sample_idx = None
        last_pos = 0
        for line in vcf:
            if line.startswith("##"):
                continue
            if line.startswith("#"):
                bits = line.split()
                sample_idx = bits.index(options["sample"])
                continue

            # data line
            bits = line.split()
            gt = bits[sample_idx]
            pos = int(bits[1])
            if pos == last_pos:
                continue
            ref = bits[3]
            alt = bits[4]

            masked = check_mask(mask, options, pos)
            # Presumably the sequence from the last position up to the base
            # before the current one (pos is 1-based) - confirm against
            # get_ref_seq.
            ref_seq = get_ref_seq(options, ref_fa, mask, last_pos, pos)

            if (len(ref) == 1 and len(alt) == 1
                    and gt[0] in ["0", "1"] and gt[2] in ["0", "1"]
                    and not masked):
                # This is a biallelic site
                if (options["refcheck"]
                        and ref_fa[options["chrom"]][pos - 1].seq != ref):
                    raise Exception("Reference mismatch at pos " + str(pos))

                # Number of ALT alleles carried: 0, 1 or 2.
                genotype = int(gt[0]) + int(gt[2])
                if genotype == 0:
                    out.write(ref_seq + ref)
                elif genotype == 1:
                    hetfa_code = HETFA_MAP[tuple(sorted([ref, alt]))]
                    out.write(ref_seq + hetfa_code)
                elif genotype == 2:
                    out.write(ref_seq + alt)
                else:
                    raise Exception(
                        "Untrapped bad genotype in haplotype 0 at pos"
                        + str(pos))
            else:
                # This is either unphased or missing or multiallelic
                out.write(ref_seq + "N")

            last_pos = pos

        # Fill in the reference at the end and terminate with newline.
        tail_seq = ref_fa[options["chrom"]][last_pos:].seq
        out.write(tail_seq + "\n")
    finally:
        if vcf is not None:
            vcf.close()
        # Close (and so flush) the gzip stream, but leave stdout open.
        if out is not sys.stdout:
            out.close()
def output_fastas(options):
    """
    Write the two haplotypes of one sample as a pair of gzipped fasta files,
    <out>.0.fa.gz and <out>.1.fa.gz.

    options is a dict providing "ref", "out", "chrom", "mask", "vcf",
    "sample" and "refcheck". For each VCF position (repeats of the previous
    position are skipped) both files receive the sequence returned by
    get_ref_seq followed by one base: the phased allele at a biallelic,
    phased, unmasked site, otherwise "N". The reference after the last
    position is appended to both files.

    Raises Exception on a reference mismatch when "refcheck" is set.
    """
    ref_fa = Fasta(options["ref"])
    prefix = options["out"]
    out0 = gzip.open(prefix + ".0.fa.gz", "w")
    out1 = gzip.open(prefix + ".1.fa.gz", "w")
    header = ">" + options["chrom"] + "\n"
    out0.write(header)
    out1.write(header)

    mask = Fasta(options["mask"]) if options["mask"] else None
    vcf = gdc.open2(options["vcf"])

    phased_calls = ("0|0", "1|0", "0|1", "1|1")
    sample_idx = None
    last_pos = 0
    for record in vcf:
        if record.startswith("##"):
            continue

        fields = record.split()
        if record.startswith("#"):
            sample_idx = fields.index(options["sample"])
            continue

        # data line
        call = fields[sample_idx]
        site = int(fields[1])
        if site == last_pos:
            continue
        ref_allele = fields[3]
        alt_allele = fields[4]

        masked = check_mask(mask, options, site)
        gap = get_ref_seq(options, ref_fa, mask, last_pos, site)

        usable = (len(ref_allele) == 1 and len(alt_allele) == 1
                  and call in phased_calls and not masked)
        if not usable:
            # Either unphased or missing or multiallelic
            out0.write(gap + "N")
            out1.write(gap + "N")
        else:
            # A phased biallelic site
            if options["refcheck"]:
                if ref_fa[options["chrom"]][site - 1].seq != ref_allele:
                    raise Exception("Reference mismatch at pos " + str(site))

            targets = (
                (out0, call[0],
                 "Untrapped bad genotype in haplotype 0 at pos"),
                (out1, call[2],
                 "Untrapped bad genotype in haplotype 1 at pos"),
            )
            for sink, code, complaint in targets:
                if code == "0":
                    sink.write(gap + ref_allele)
                elif code == "1":
                    sink.write(gap + alt_allele)
                else:
                    raise Exception(complaint + str(site))

        last_pos = site

    # Finish both files with the remaining reference and a newline.
    remainder = ref_fa[options["chrom"]][last_pos:].seq
    for sink in (out0, out1):
        sink.write(remainder + "\n")

    out0.close()
    out1.close()
def main(options):
    """
    Iterate over the vcf and output one fasta file for each chromosome.

    options is a dict with the keys:
      "ref"      - reference fasta, opened with Fasta
      "out"      - output prefix; <out>.0.fa.gz and <out>.1.fa.gz are written
      "chrom"    - chromosome name, used for the header and reference lookup
      "vcf"      - input VCF, opened with gdc.open2
      "sample"   - sample column to read, looked up in the "#" header line
      "refcheck" - if true, compare each REF allele with the reference fasta

    At a phased biallelic site each file receives the reference sequence
    since the previous site followed by that haplotype's allele. At any
    other site both files receive "N" for the site and for the whole stretch
    since the previous site. The reference after the last site is appended
    to both files.

    Raises Exception on a reference mismatch when "refcheck" is set.
    """
    ref_fa = Fasta(options["ref"])
    out0 = gzip.open(options["out"] + ".0.fa.gz", "w")
    out1 = gzip.open(options["out"] + ".1.fa.gz", "w")

    vcf = None
    try:
        out0.write(">" + options["chrom"] + "\n")
        out1.write(">" + options["chrom"] + "\n")

        vcf = gdc.open2(options["vcf"])
        sample_idx = None
        last_pos = 0
        for line in vcf:
            if line.startswith("##"):
                continue
            if line.startswith("#"):
                bits = line.split()
                sample_idx = bits.index(options["sample"])
                continue

            # data line
            bits = line.split()
            gt = bits[sample_idx]
            pos = int(bits[1])
            if pos == last_pos:
                continue
            ref = bits[3]
            alt = bits[4]

            if (len(ref) == 1 and len(alt) == 1
                    and gt in ["0|0", "1|0", "0|1", "1|1"]):
                # This is a phased biallelic site
                # This is the sequence from the last position to the base
                # before the current position (note that pos is 1-based)
                ref_seq = ref_fa[options["chrom"]][last_pos:(pos - 1)].seq
                if (options["refcheck"]
                        and ref_fa[options["chrom"]][pos - 1].seq != ref):
                    raise Exception("Reference mismatcah at pos " + str(pos))

                if gt[0] == "0":
                    out0.write(ref_seq + ref)
                elif gt[0] == "1":
                    out0.write(ref_seq + alt)
                else:
                    raise Exception(
                        "Untrapped bad genotype in haplotype 0 at pos"
                        + str(pos))

                if gt[2] == "0":
                    out1.write(ref_seq + ref)
                elif gt[2] == "1":
                    out1.write(ref_seq + alt)
                else:
                    raise Exception(
                        "Untrapped bad genotype in haplotype 1 at pos"
                        + str(pos))
            else:
                # This is either unphased or missing or multiallelic
                # NOTE(review): this masks the whole stretch since the last
                # site with N, not just the site itself - confirm that is
                # intended rather than reference sequence followed by one N.
                out0.write("N" * (pos - last_pos))
                out1.write("N" * (pos - last_pos))

            last_pos = pos

        # Fill in the reference at the end and terminate with newline.
        tail_seq = ref_fa[options["chrom"]][last_pos:].seq
        out0.write(tail_seq + "\n")
        out1.write(tail_seq + "\n")
    finally:
        # Close explicitly so the gzip streams are flushed and finalised.
        if vcf is not None:
            vcf.close()
        out0.close()
        out1.close()