# Write polymorphic sites to one output file per chromosome.
# The first file is named after the chromosome already held in chrPrev.
fileoutput = open(chrPrev + '_' + args.output, 'w')

for line in datafile:
    words = line.split()
    # chromosome label: second '_'-separated field of the first column
    chr = str(words[0].split('_')[1])
    pos = words[1]

    # split chromosomes into separate files; the site counter restarts
    # together with the file
    if chr != chrPrev:
        fileoutput.close()
        fileoutput = open(chr + '_' + args.output, 'w')
        chrPrev = chr
        siteNumber = 1

    # select samples and count Ns; sites with more Ns than allowed by
    # args.missing are skipped before anything else is computed
    genotypes = calls.selectSamples(sampCol, words)
    valueN = calls.countPerPosition(genotypes, 'N')
    if not valueN <= args.missing:
        continue

    # concatenate the genotypes and mark missing data with '?'
    genotypesMerged = ''.join(map(str, genotypes))
    genotypesMergedP = genotypesMerged.replace('N', '?')

    # polymorphic sites are written out; every other retained site only
    # advances the running site counter
    if calls.is_polymorphic(genotypes):
        fileoutput.write("%s\t%s\t%s\t%s\n" % (chr, pos, siteNumber, genotypesMergedP))
    else:
        siteNumber += 1
# Column indexes of the requested samples, resolved against the header.
sampCol = calls.indexSamples(sampleNames, header_words)

# count number of sample
nSample = len(sampleNames)

############################## perform counting ####################

print('Calculating ...')

# numeric depth values of the current line; emptied after every line
nonMissDP = []

for line in datafile:
    words = line.split()
    Chr = words[0]
    pos = int(words[1])

    # select samples
    sample_num = calls.selectSamples(sampCol, words)

    # sum up: keep integer-looking values, silently skip "NA", warn on
    # anything else.
    # NOTE(review): str.isdigit() is False for decimals ("12.5") and signed
    # values, so such entries only trigger the warning -- confirm the input
    # is integer depth.
    for i in sample_num:
        if i.isdigit():
            nonMissDP.append(float(i))
        elif i == "NA":
            continue
        else:
            warnings.warn("%s is not numeric at the line %s" % (i, counter + 1))

    # make output: mean depth over the non-missing samples.  When no sample
    # has a numeric value the mean is undefined, so "NA" is written instead
    # of dividing by zero.
    if nonMissDP:
        depth = round(sum(nonMissDP) / float(len(nonMissDP)), 2)
    else:
        depth = "NA"
    outputFile.write("%s\t%s\t%s\n" % (Chr, pos, depth))
    nonMissDP = []
counter = 0

# read the header of the ancestral file (the line is consumed here and
# ances_words is overwritten further down)
ances = open(args.ancestral, 'r')
ances_words = ances.readline()

output = open(args.output, 'w')

print('Opening the file...')

with open(args.input) as datafile:
    header_words = datafile.readline().split()
    sampCol = calls.indexSamples(sampleNames, header_words)

    print('Creating the output file...')

    # output header: four fixed columns followed by the selected sample names
    samples_head = calls.selectSamples(sampCol, header_words)
    samples_headP = '\t'.join(map(str, samples_head))
    output.write('%s\t%s\t%s\t%s\t%s\n' % ('CHROM', 'POS', 'ANC', 'DER', samples_headP))

    # read the second line of the ancestral file; the chromosome number
    # follows either a '_' or the literal 'chr' in the first column
    ances_words = ances.readline().split()
    ances_ch = int(ances_words[0].split('_' if '_' in ances_words[0] else 'chr')[1])
    ances_pos = int(ances_words[1])
    ances_gt = splitAncestral(ances_words[2])

    for line in datafile:
        words = line.split()
# Fragment of a window-based scan over the input table.  The window state
# (ChrPrevious, posS, Hwindow, Twindow, windSize, ...) is initialised
# outside this view.
posE = ''

for line in datafile:
    words = line.split()
    Chr = words[0]
    pos = int(words[1])

    # to store the values of a previous line
    # (each variable is filled only while it is still falsy, i.e. on the
    # first line or after it has been reset)
    if not ChrPrevious:
        ChrPrevious = Chr
    if not posS:
        posS = pos
    if not posE:
        posE = pos

    # select samples
    sample_charaters = calls.selectSamples(sampCol, words)

    # check if one- or two-character code
    # (any genotype containing "/" sends the whole list through twoToOne)
    if any(["/" in gt for gt in sample_charaters]):
        sample_charaters = calls.twoToOne(sample_charaters)

    # if window size is reached output the results
    # NOTE(review): Chr and ChrPrevious are strings here, so this is a
    # lexicographic comparison -- confirm the chromosome names sort that way.
    if Chr > ChrPrevious:  # if end of a chromosome
        try:
            HeterWindow = round(meanWindow(Hwindow, Twindow), 4)
        except Exception:
            # any failure in meanWindow/round is reported as a missing value
            HeterWindow = "NA"
        calls.processWindow(ChrPrevious, posS, posE, HeterWindow, outputFile)
        # start over for the new chromosome
        windPosEnd = windSize
        Hwindow = []
# NOTE(review): this first statement uses popName, so it presumably belongs
# to a `for popName in pops:` loop whose header is outside this view.
# It stores the column indexes of each population's samples in a
# dynamically named module-level variable "<popName>Index".
vars()[popName + "Index"] = calls.indexSamples(
    vars()[popName + "samples"], header_words)

for line in datafile:
    words = line.split()
    GT = words[2:]
    # get the set of alleles, skip missing alleles
    GTpair = [i for i in list(set(GT)) if i != 'N']
    popNum = 0
    #if ("N" not in GT) and (len(GTpair) == 2) : # skip missing data or non-biallelic
    if (len(GTpair) == 2):  # skip non-biallelic
        for popName in pops:
            popNum += 1  # to make correct output. See below
            # select genotypes
            sGT = calls.selectSamples(vars()[popName + "Index"], words)
            counts = Counter(sGT)  # count alleles
            # counts of the two alleles, in the order they appear in GTpair
            bicounts = [counts[GTpair[0]], counts[GTpair[1]]]  # extract counts
            bicountsP = ','.join(str(w) for w in bicounts)
            # make output: populations are tab-separated, the last one
            # ends the line
            if popNum == len(pops):
                fileoutput.write("%s\n" % bicountsP)
            else:
                fileoutput.write("%s\t" % bicountsP)

    # track progress (the body of this `if` lies outside the view)
    counter += 1
    if counter % 1000000 == 0:
############################# program #############################

counter = 0

print('Opening the file...')

with open(args.input) as datafile:
    header_words = datafile.readline().split()

    # index samples
    sampCol = calls.indexSamples(sampleNames, header_words)

    # make output header: the first two input columns plus the selected
    # sample columns
    print('Creating the output file...')
    fileoutput = open(args.output, 'w')
    sampHeader = calls.selectSamples([0, 1] + sampCol, header_words)
    sampHeaderP = '\t'.join(map(str, sampHeader))
    fileoutput.write(sampHeaderP + '\n')

    # copy every data line, keeping only the first two columns and the
    # selected samples
    for line in datafile:
        words = line.split()
        chr_pos = words[:2]
        genotypes = calls.selectSamples(sampCol, words)

        chr_posP = '\t'.join(map(str, chr_pos))
        genotypesP = '\t'.join(map(str, genotypes))
        fileoutput.write('%s\t%s\n' % (chr_posP, genotypesP))
# Write sites to one output file per chromosome; the first file is named
# after the chromosome already held in chrPrev.
fileoutput = open(chrPrev + '_' + args.output, 'w')

for line in datafile:
    words = line.split()
    # chromosome label: second '_'-separated field of the first column
    chr = str(words[0].split('_')[1])
    pos = words[1]

    # split chromosomes into separate files
    # (the site counter restarts together with the file)
    if chr != chrPrev:
        fileoutput.close()
        fileoutput = open(chr + '_' + args.output, 'w')
        chrPrev = chr
        siteNumber = 1

    # select samples
    genotypes = calls.selectSamples(sampCol, words)

    # count Ns
    valueN = calls.countPerPosition(genotypes, 'N')

    # keep the site only if it has at most args.missing Ns; missing data
    # is written as '?'
    if valueN <= args.missing:
        genotypesMerged = ''.join(str(e) for e in genotypes)
        genotypesMergedP = genotypesMerged.replace('N', '?')
    else:
        continue

    # count the number of called sites
    # (non-polymorphic sites only advance the counter; the write statement
    # for the other branch is cut off at the end of this view)
    if not calls.is_polymorphic(genotypes):
        siteNumber += 1
    else:
        fileoutput.write("%s\t%s\t%s\t%s\n" %
# NOTE(review): this `return` is the last line of a helper whose `def` line
# is outside this view; it is True when every element of `items` equals the
# first one.
return all(x == items[0] for x in items[1:])

counter = 0

# output table with a fixed four-column header
output = open(args.output, 'w')
output.write("#CHR\tPOS\tCommon_alleles\tRare_alleles\n")

print('Opening the file...')

with open(args.input) as datafile:
    header_words = datafile.readline().split()

    # index samples
    sIndex = calls.indexSamples(sNames, header_words)

    # create lists for output
    # (sNames is rebound to the names as they are selected from the header)
    sNames = calls.selectSamples(sIndex, header_words)

    for line in datafile:
        words = line.split()
        chr_pos = words[0:2]
        chr_posP = '\t'.join(str(e) for e in chr_pos)

        # select samples
        sGT = calls.selectSamples(sIndex, words)

        # define two or one character code
        # (all one-character genotypes go through OneToTwo; genotypes that
        # all contain "/" are used as they are; any further branch of this
        # chain is outside the view)
        if all(len(i) == 1 for i in sGT):
            alleles = calls.OneToTwo(sGT)
        elif all("/" in i for i in sGT):
            alleles = sGT
# index samples
sampCol = calls.indexSamples(sampleNames, header_words)

for line in datafile:
    # track progress
    # (a single %-formatted argument prints the same text under Python 2
    # and Python 3, unlike the former print statement)
    counter += 1
    if counter % 1000000 == 0:
        print("%s lines processed" % counter)

    words = line.split()
    chr_pos = words[0:2]
    # chromosome number: second '_'-separated field of the first column
    ch = int(words[0].split('_')[1])
    pos = int(words[1])

    # select samples
    alleles = calls.selectSamples(sampCol, words)

    # count Ns
    valueN = calls.countPerPosition(alleles, 'N')

    # keep the site only if it has at most args.missing Ns, and drop the
    # Ns from the allele list
    if valueN <= args.missing:
        Allalleles = [i for i in alleles if i != 'N']
    else:
        continue

    # find overlap with the ancestor: read the reference file forward while
    # the current site lies beyond the reference position; an exhausted
    # reference file yields ancestral state 'N'.
    # (the rest of this loop body lies outside the view)
    while ch > ref_ch or (ch == ref_ch and pos > ref_pos):
        words2 = ref.readline().split()
        if words2 == []:
            ancest = 'N'
            break
counter = 0

print('Opening the file...\n')

with open(args.input) as datafile:
    header_words = datafile.readline().split()

    # names of the first two header columns, tab-joined
    ChrPos = header_words[:2]
    ChrPosP = '\t'.join(map(str, ChrPos))

    # index samples
    sampCol = calls.indexSamples(sampleNames, header_words)

    # create lists for output
    sampColnames = calls.selectSamples(sampCol, header_words)

    # create merged column names: columns are taken two at a time and both
    # members of a pair must share the part of the name before the first '_'
    idsheader = []
    for i in range(len(sampColnames)):
        if i % 2:
            # odd positions are the second member of a pair
            continue
        name1 = sampColnames[i].split("_")[0]
        name2 = sampColnames[i + 1].split("_")[0]
        if name1 != name2:
            raise KeyError(
                "Sample is not paired. Sample name %s doesn't equal to sample name %s"
                % (name1, name2))
        idsheader.append(name1)

    header = '\t'.join(map(str, idsheader))
############################# program ############################# counter = 0 print('Opening the file...') with open(args.input) as datafile: header_line = datafile.readline() header_words = header_line.split() # index samples sampCol = calls.indexSamples(sampleNames, header_words) # create lists for output sampColnames = calls.selectSamples(sampCol, header_words) sampNs = [0 for i in sampColnames] print('Counting Ns ...') Ns = [] for line in datafile: words = line.split() chr_pos = words[0:2] # select samples sample_charaters = calls.selectSamples(sampCol, words) # count Ns per position contNsOnly = calls.countPerPosition(sample_charaters, 'N')
sampleNames = calls.checkSampleNames(args.stats, args.input) ############################# program ############################# output = open(args.output, 'w') counter = 0 print('Opening the file...') with open(args.input) as datafile: header_line = datafile.readline() header_words = header_line.split() sampCol = calls.indexSamples(sampleNames, header_words) sampColnames = calls.selectSamples(sampCol, header_words) sampColnamesP = '\t'.join(str(e) for e in sampColnames) output.write("%s\t%s\t%s\n" % (header_words[0], header_words[1], sampColnamesP)) print('Adding noise ...') for line in datafile: words = line.split() chr_pos = words[0:2] sample_scores = calls.selectSamples(sampCol, words) sample_scoresNoise = [] for s in sample_scores:
# positions of the requested annotation fields within annotOptions
fieldsIndex = calls.indexSamples(fieldsNames, annotOptions)

# read one line of the SIFT file: chromosome number (second '_'-separated
# field of the first column) and position
sift_words = siftFile.readline().split()
sift_chr = int(sift_words[0].split('_')[1])
sift_pos = int(sift_words[1])

with open(args.tab) as datafile:
    header_words = datafile.readline().split()

    # index samples
    sampCol = calls.indexSamples(sampleNames, header_words)

    # make output header
    # (first two input column names followed by the selected samples)
    print('Creating the output file...')
    output = open(args.output, 'w')
    ouput_header = header_words[0:2] + calls.selectSamples(sampCol, header_words)
    ouput_headerP = '\t'.join(str(el) for el in ouput_header)
    output.write('%s\n' % ouput_headerP)

    ############################### perform counting ####################

    for line in datafile:
        words = line.split()
        ch = int(words[0].split('_')[1])
        pos = int(words[1])

        # select samples
        tab_charaters = calls.selectSamples(sampCol, words)

        # find overlap in genomic position
        # (loops while the current site lies beyond the SIFT position;
        # the loop body is outside this view)
        while (ch > sift_chr) or (ch == sift_chr and pos > sift_pos):
with open(args.callsFile) as callsFile: header_words = callsFile.readline().split() # index samples according to the header for groupName in groups: vars()[groupName + "Index"] = calls.indexSamples( vars()[groupName + "samples"], header_words) for line in callsFile: words = line.split() CHR = words[0] POS = int(words[1]) # prepare the genotypes for popName in groups: sGT = calls.selectSamples(vars()[popName + "Index"], words) # select genotypes per group sGTnoN = [i for i in sGT if i != 'N'] # remove missing data if sGTnoN != []: # if not all GT are missing sGTnoNset = list(set(sGTnoN)) # find set of alleles random.shuffle( sGTnoNset) # shuffle list to deal with a tie case. mostFreqGT = max( sGTnoNset, key=sGTnoN.count ) # find the most frequent allele. A tie is solved randomly. else: mostFreqGT = 'NA' if CHR != CHRprev: try: fastaSeqP = ''.join( str(w) for w in vars()[popName + "fastaSeq"])
# column indexes of the 'ANC' and 'DER' header fields
ANCindex = calls.indexSamples(['ANC'], header_words)
DERindex = calls.indexSamples(['DER'], header_words)

# per-family column indexes and an empty list of lines for each family
FamilyIndex = {}
phasedLines = {}
for family in familySamples:
    FamilyIndex[family] = calls.indexSamples(familySamples[family], header_words)
    phasedLines[family] = []

for line in datafile:
    words = line.split()
    CHR = words[0]
    POS = words[1]
    chr_pos = CHR + '_' + POS
    ANC = calls.selectSamples(ANCindex, words)[0]
    DER = calls.selectSamples(DERindex, words)[0]
    allSeq = calls.selectSamples(sampleIndex, words)

    # only sites accepted by calls.is_biallelic are processed
    if calls.is_biallelic(allSeq):
        for family in FamilyIndex:
            famSeq = calls.selectSamples(FamilyIndex[family], words)
            GTsplit = []
            splitfamSeq = []
            # separate the two alleles of each genotype with a space
            # ('/' and '|' both become ' ') and recode '.' as '9'
            for GT in famSeq:
                GT = GT.replace('/', ' ')
                GT = GT.replace('|', ' ')
                GT = GT.replace('.', '9')
                GTsplit.append(GT)
            # NOTE(review): the collapsed source does not show whether the
            # next two statements sit inside or after the loop above; they
            # are placed after it here -- confirm against the original file.
            splitSeq = ' '.join(str(e) for e in GTsplit)
            splitfamSeq.append(splitSeq)
# optional comma/column list of samples; parsing is left to checkSampleNames
parser.add_argument('-s', '--samples', help = 'column names of the samples to process (optional)', type=str, required=False)
args = parser.parse_args()

# check if samples names are given and if all sample names are present in a header
sampleNames = calls.checkSampleNames(args.samples, args.input)

############################# program #############################

# sentinel that differs from the first line's chromosome, so the first
# data line takes the `else` branch below
# (assumes no chromosome is literally named 'NA' -- TODO confirm)
CHRprev = 'NA'

with open(args.input) as datafile:
    header_words = datafile.readline().split()

    # index a sample
    sampCol = calls.indexSamples(sampleNames, header_words)
    header_samples = calls.selectSamples(sampCol, header_words)
    header_samplesP = '\t'.join(str(e) for e in header_samples)

    for line in datafile:
        words = line.split()
        CHR = words[0]
        Pos = words[1]
        samples = calls.selectSamples(sampCol, words)
        samplesP = '\t'.join(str(e) for e in samples)

        # find chromosome border
        # (same chromosome: append the line to the current output;
        # NOTE(review): `output` is not assigned in this view -- presumably
        # it is opened in the truncated remainder of the else branch)
        if CHRprev == CHR:
            output.write('%s\t%s\t%s\n' % (CHR, Pos, samplesP))
        else:
            CHRprev = CHR