words = line.split() chr = str(words[0].split('_')[1]) pos = words[1] # split chromosomes into separate files if chr != chrPrev: fileoutput.close() fileoutput = open(chr+'_'+args.output, 'w') chrPrev = chr siteNumber = 1 # select samples genotypes = calls.selectSamples(sampCol, words) # count Ns valueN = calls.countPerPosition(genotypes, 'N') if valueN <= args.missing: genotypesMerged = ''.join(str(e) for e in genotypes) genotypesMergedP = genotypesMerged.replace('N', '?') else: continue # count the number of called sites if not calls.is_polymorphic(genotypes): siteNumber += 1 else: fileoutput.write("%s\t%s\t%s\t%s\n" % (chr, pos, siteNumber, genotypesMergedP)) siteNumber = 1 # track progress
HeterWindow = round(meanWindow(Hwindow, Twindow), 4) except Exception: HeterWindow = "NA" calls.processWindow(Chr, posS, posE, HeterWindow, outputFile) windPosEnd = windPosEnd + windSize Hwindow = [] Twindow = [] posS = pos while pos > windPosEnd: # if the gap in positions is larger than window size windPosEnd = windPosEnd + windSize ChrPrevious = Chr posE = pos # count hetero Nmising = calls.countPerPosition(sample_charaters, 'N') if Nmising < allowedN: # skip if too many Ns nHerer = calls.countHeteroPerPosition(sample_charaters) nTotal = float(nSample - Nmising) Hwindow.append(float(nHerer)) Twindow.append(float(nTotal)) # track progress counter += 1 if counter % 1000000 == 0: print str(counter), "lines processed" # process the last window try: HeterWindow = round(meanWindow(Hwindow, Twindow), 4) except Exception:
words = line.split() chr = str(words[0].split('_')[1]) pos = words[1] # split chromosomes into separate files if chr != chrPrev: fileoutput.close() fileoutput = open(chr + '_' + args.output, 'w') chrPrev = chr siteNumber = 1 # select samples genotypes = calls.selectSamples(sampCol, words) # count Ns valueN = calls.countPerPosition(genotypes, 'N') if valueN <= args.missing: genotypesMerged = ''.join(str(e) for e in genotypes) genotypesMergedP = genotypesMerged.replace('N', '?') else: continue # count the number of called sites if not calls.is_polymorphic(genotypes): siteNumber += 1 else: fileoutput.write("%s\t%s\t%s\t%s\n" % (chr, pos, siteNumber, genotypesMergedP)) siteNumber = 1
# Walk the input line by line, drop sites with too many missing calls, and
# advance the ancestor/reference reader until it reaches the current site.
for line in datafile:
    # track progress
    counter += 1
    if counter % 1000000 == 0:
        # single-argument form prints the same text on Python 2 and Python 3
        print("%s lines processed" % counter)

    words = line.split()
    chr_pos = words[0:2]
    # chromosome number is the second '_'-separated field of the first column
    ch = int(words[0].split('_')[1])
    pos = int(words[1])

    # select samples
    alleles = calls.selectSamples(sampCol, words)

    # count Ns
    valueN = calls.countPerPosition(alleles, 'N')
    if valueN <= args.missing:
        Allalleles = [i for i in alleles if i != 'N']
    else:
        continue  # too many missing calls at this site

    # find overlap with the ancestor
    while ch > ref_ch or (ch == ref_ch and pos > ref_pos):
        words2 = ref.readline().split()
        if words2 == []:
            # an empty split means a blank line or end of the reference file
            ancest = 'N'
            break
        else:
            ref_chr_pos = words2[0:2]
            ref_ch = int(ref_chr_pos[0].split('_')[1])
            # NOTE(review): the visible fragment ends here; the rest of this
            # loop body (e.g. updating ref_pos) is outside the view.
# Count missing calls ('N') per site and per sample over the whole input,
# then open the two CSV files the counts are written to.
sampColnames = calls.selectSamples(sampCol, header_words)
sampNs = [0 for i in sampColnames]  # one running N count per selected sample

print('Counting Ns ...')

Ns = []  # N count of every site, in input order
for line in datafile:
    words = line.split()
    chr_pos = words[0:2]

    # select samples
    sample_charaters = calls.selectSamples(sampCol, words)

    # count Ns per position
    contNsOnly = calls.countPerPosition(sample_charaters, 'N')
    Ns.append(contNsOnly)

    # count Ns per sample
    # presumably updates sampNs in place (return value is not used) -- TODO confirm
    calls.countPerSample(sample_charaters, sampNs, 'N')

    # track progress
    counter += 1
    if counter % 1000000 == 0:
        # single-argument form prints the same text on Python 2 and Python 3
        print("%s lines processed" % counter)

datafile.close()

# write the counts to a file
outputTXTsite = open(args.output + "_Ns_per_site.csv", 'w')
outputTXTsample = open(args.output + "_Ns_per_sample.csv", 'w')
# Walk the input line by line, drop sites with too many missing calls, count
# the remaining alleles, and advance the reference reader to the current site.
for line in datafile:
    words = line.split()
    # chromosome number is the second '_'-separated field of the first column
    ch = int(words[0].split('_')[1])
    pos = int(words[1])

    # track progress
    counter += 1
    if counter % 1000000 == 0:
        # single-argument form prints the same text on Python 2 and Python 3
        print("%s lines processed" % counter)

    # select samples
    genotypesN = calls.selectSamples(sampCol, words)

    # skip sites with missing data
    numN = calls.countPerPosition(genotypesN, 'N')
    if numN <= AlowedN:
        genotypes = [i for i in genotypesN if i != 'N']
        nGenotypes = len(genotypes)
    else:
        continue  # skip lines with too many Ns

    # count alleles
    numAl = collections.Counter(genotypes)
    numAlM = numAl.most_common()  # (allele, count) pairs, most frequent first

    while ch > ref_ch or (ch == ref_ch and pos > ref_pos):
        words2 = ref.readline().split()
        if words2 == []:
            # an empty split means a blank line or end of the reference file
            ancest = 'N'
            break
        # NOTE(review): the visible fragment ends here; the rest of this loop
        # body is outside the view.