# modules used below: standard library plus Dindel's own helper modules
import gzip
import math
import os
import sys

import AnalyzeSequence
import Fasta
import FileUtils
import Variant


def loadGLFFiles(inputGLFFiles=''):
    # read the list of GLF file names (one per line)
    glffiles = []
    fg = open(inputGLFFiles, mode='r')
    for line in fg.readlines():
        glffiles.append(line.rstrip("\n").split()[0])
    fg.close()

    # map the first realigned position in each GLF file to its file name
    fp_to_fname = {}
    for glffile in glffiles:
        if not os.path.exists(glffile):
            sys.stderr.write("File %s does not exist\n" % glffile)
            continue
        fg = FileUtils.FileWithHeader(fname=glffile, mode='r')
        while True:
            dat = fg.readline()
            if dat == {}:
                # end of file without a usable realigned position
                fg.close()
                break
            if dat['realigned_position'] != 'NA':
                firstpos = int(dat['realigned_position'])
                if fp_to_fname.has_key(firstpos):
                    raise NameError('Multiple GLF files start at position %d' % firstpos)
                fp_to_fname[firstpos] = glffile
                fg.close()
                break

    # return the GLF files sorted by their first realigned position
    newglffiles = []
    for pos in sorted(fp_to_fname.keys()):
        print "pos:", pos, "glffile:", fp_to_fname[pos]
        newglffiles.append(fp_to_fname[pos])
    return newglffiles
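
# Usage sketch for loadGLFFiles (the file name below is hypothetical):
# 'glf_files.txt' lists one Dindel window-output file per line; the returned
# list is ordered by each file's first realigned position, so downstream code
# can stream the windows in genome order.
#
#   orderedGLFs = loadGLFFiles(inputGLFFiles='glf_files.txt')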

def getCalls(callFile=''):
    vcf = FileUtils.FileWithHeader(fname=callFile, mode='r', joinChar="\t")
    calls = {}
    numcalls = 0
    while True:
        dat = vcf.readline()
        if dat == {}:
            break
        if dat['FILTER'] == "PASS" or (dat['FILTER'] == "q20" and float(dat['QUAL']) >= 10):
            chrom = dat['CHROM']
            pos = int(dat['POS'])
            ref = dat['REF']
            alt = dat['ALT']
            if alt.find(',') != -1:
                raise NameError("Cannot deal with multi-allelic entries")
            var = Variant.Variant4(ref=ref, alt=alt)
            # note: calls must be keyed on the zero-based position
            newpos = pos + var.offset - 1
            newstr = var.str
            if not calls.has_key(chrom):
                calls[chrom] = {}
            if not calls[chrom].has_key(newpos):
                calls[chrom][newpos] = {}
            if calls[chrom][newpos].has_key(newstr):
                raise NameError('Multiple identical variants at the same position?')
            calls[chrom][newpos][newstr] = dat.copy()
            numcalls += 1
    vcf.close()
    print "Number of calls imported:", numcalls
    return calls
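
# The structure returned by getCalls is keyed calls[chrom][zero_based_pos][variant_str].
# A minimal sketch (this helper is not part of the original script) showing how
# one might summarize the imported calls per chromosome:
def _countCallsPerChrom(calls):
    counts = {}
    for chrom in calls.keys():
        n = 0
        for pos in calls[chrom].keys():
            # one entry per distinct variant string at this position
            n += len(calls[chrom][pos])
        counts[chrom] = n
    return counts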

def processPooledGLFFiles(bamFilesFile='', glfFilesFile='', refFile='',
                          outputVCFFile='', maxHPLen=10, minForwardReverse=1,
                          minDist=10, dbSNPWindow=50, newVarCov=False,
                          doNotFilterOnFR=False, filterQual=20, numSamples=1,
                          numBamFiles=1):
    # default coverage range; replaced below by percentiles of the observed
    # read-depth histogram
    coverageRange = [20, 10000]

    # read file with glf files; check existence and header consistency
    allFiles = []
    headerLabels = []
    f = open(glfFilesFile, 'r')
    for line in f.readlines():
        dat = line.rstrip("\n").split()
        for gf in dat:
            if not os.path.exists(gf):
                sys.stderr.write("WARNING: GLF file %s does not exist.\n" % gf)
            else:
                if os.path.splitext(gf)[-1] == '.gz':
                    fgf = gzip.open(gf, 'r')
                else:
                    fgf = open(gf, 'r')
                line = fgf.readline()
                if line == '':
                    sys.stderr.write("WARNING: GLF file %s is empty.\n" % gf)
                else:
                    d = line.rstrip("\n").split()
                    if headerLabels == []:
                        headerLabels = d[:]
                        allFiles.append(gf)
                    else:
                        if d != headerLabels:
                            sys.stderr.write("Inconsistent header in GLF file %s\n" % gf)
                        else:
                            allFiles.append(gf)
                fgf.close()
    f.close()

    fa = Fasta.Fasta(fname=refFile)

    # read precall files; make hash table [pos][variant][fname]
    numInds = numSamples
    minFreq = 1.0 / (float(2 * numInds) * 5)

    try:
        realpos_col = headerLabels.index('realigned_position')
        var_col = headerLabels.index('nref_all')
        # apply filters across individuals
        tcFilter = "tc%d" % minDist
        col_num_reads = headerLabels.index('num_reads')
        col_num_forward_old = headerLabels.index('num_cover_forward')
        col_num_reverse_old = headerLabels.index('num_cover_reverse')
        col_num_forward = headerLabels.index('var_coverage_forward')
        col_num_reverse = headerLabels.index('var_coverage_reverse')
        col_post_prob = headerLabels.index('post_prob_variant')
        col_est_freq = headerLabels.index('est_freq')
        chr_col = headerLabels.index('tid')
        idx_col = headerLabels.index('indidx')
        ana_col = headerLabels.index('analysis_type')
    except ValueError:
        raise NameError("GLF files are corrupt. Could not find all required columns.")

    pass_filters = {}
    varStat = {}
    nr = 0
    num_pass = 0

    # read-depth histogram
    rdhist = {}

    for glffile in allFiles:
        fglf = FileUtils.FileWithHeader(fname=glffile, mode='r', joinChar=' ')
        print "Reading", glffile
        done = False
        while True:
            pos = -1
            var = ''
            nr += 1
            if nr % 10000 == 9999:
                print "Number of lines read:", nr + 1
            num_ind_with_data = 0
            tot_coverage = 0
            tot_num_forward = 0
            tot_num_reverse = 0
            tot_num_forward_old = 0
            tot_num_reverse_old = 0
            skip = False
            # each candidate site has one line per BAM file
            for fidx in range(0, numBamFiles):
                try:
                    dat = fglf.readlineList()
                except IOError:
                    sys.stderr.write("WARNING: IOError in %s\n" % glffile)
                    done = True
                    break
                if dat == []:
                    done = True
                    break
                if dat[realpos_col] == 'NA':
                    skip = True
                    break
                if dat[ana_col] != "singlevariant":
                    skip = True
                    break
                if dat[idx_col] != 'NA' and int(dat[idx_col]) >= numBamFiles:
                    raise NameError('Error. Is the number of BAM files correctly specified?')
                if pos == -1:
                    pos = int(dat[realpos_col])
                    var = dat[var_col]
                    chr = dat[chr_col]
                else:
                    if int(dat[realpos_col]) != pos:
                        raise NameError('Inconsistent glf files! Is the number of BAM files correctly specified?')
                    if int(dat[idx_col]) != fidx:
                        sys.stderr.write("Error reading this variant: %s %d %s in %s\n" % (chr, pos, var, glffile))
                tot_num_forward_old += int(dat[col_num_forward_old])
                tot_num_reverse_old += int(dat[col_num_reverse_old])
                if fidx == 0:
                    # only record for first individual
                    tot_num_forward = int(dat[col_num_forward])
                    tot_num_reverse = int(dat[col_num_reverse])
                numreads = int(dat[col_num_reads])
                if numreads > 0:
                    num_ind_with_data += 1
                tot_coverage += numreads
            if skip:
                continue
            if done:
                break
            prob = float(dat[col_post_prob])
            freq = float(dat[col_est_freq])

            if rdhist.has_key(tot_coverage):
                rdhist[tot_coverage] += 1
            else:
                rdhist[tot_coverage] = 1

            if prob > 0.20:
                if not varStat.has_key(chr):
                    varStat[chr] = {}
                if not varStat[chr].has_key(pos):
                    varStat[chr][pos] = {}
                # homopolymer length of the reference around the site
                seq = fa.get(chr, pos + 1 - 25, 50)
                hplen = AnalyzeSequence.HomopolymerLength(seq=seq, pos=25)
                varStat[chr][pos][var] = {'QUAL': prob,
                                          'NF': tot_num_forward,
                                          'NR': tot_num_reverse,
                                          'NFS': tot_num_forward_old,
                                          'NRS': tot_num_reverse_old,
                                          'DP': tot_coverage,
                                          'NS': num_ind_with_data,
                                          'AF': freq,
                                          'HP': hplen}
            del dat
        # finished reading this one
        fglf.close()

    #print "Number of variants passing filters:", num_pass

    # apply haplotype coverage and other filters
    coverageRange = getPercentiles(rdhist, [1, 99])
    fqp = 1.0 - math.pow(10.0, -float(filterQual) / 10.0)
    fqp_str = "q%d" % filterQual
    for chr in varStat.keys():
        for pos in varStat[chr].keys():
            for varseq, var in varStat[chr][pos].iteritems():
                filters = []
                prob = var['QUAL']
                num_ind_with_data = var['NS']
                hplen = var['HP']
                freq = var['AF']
                tot_coverage = var['DP']
                tot_num_forward = var['NF']
                tot_num_reverse = var['NR']
                if prob < fqp:
                    filters.append(fqp_str)
                if (tot_num_forward < minForwardReverse or
                        tot_num_reverse < minForwardReverse) and not doNotFilterOnFR:
                    filters.append('fr0')
                if tot_coverage < coverageRange[0] or tot_coverage > coverageRange[1]:
                    filters.append('ocr')
                if num_ind_with_data < numInds / 2:
                    filters.append('s50')
                if hplen > maxHPLen:
                    filters.append("hp%d" % maxHPLen)
                if freq < minFreq:
                    filters.append("mf")
                if filters == []:
                    if not pass_filters.has_key(chr):
                        pass_filters[chr] = {}
                    if not pass_filters[chr].has_key(pos):
                        pass_filters[chr][pos] = []
                    pass_filters[chr][pos].append(varseq)
                    num_pass += 1
                    varStat[chr][pos][varseq]['filter'] = ''
                else:
                    varStat[chr][pos][varseq]['filter'] = ';'.join(filters)

    # now visit each chromosome and apply closeness filter
    chromosomes = [str(c) for c in range(1, 23)]
    chromosomes.extend(['X', 'Y'])
    other_chr = list(set(varStat.keys()) - set(chromosomes))
    chromosomes.extend(other_chr)

    # create VCF file
    print "Writing VCF"
    fv = open(outputVCFFile, 'w')
    fv.write("##fileformat=VCFv4.0\n")
    fv.write("##source=Dindel\n")
    fv.write("##reference=%s\n" % refFile)
    fv.write("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">\n")
    fv.write("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total number of reads in haplotype window\">\n")
    fv.write("##INFO=<ID=HP,Number=1,Type=Integer,Description=\"Reference homopolymer tract length\">\n")
    fv.write("##INFO=<ID=NFS,Number=1,Type=Integer,Description=\"Number of reads covering non-ref variant site on forward strand\">\n")
    fv.write("##INFO=<ID=NRS,Number=1,Type=Integer,Description=\"Number of reads covering non-ref variant site on reverse strand\">\n")
    fv.write("##INFO=<ID=NF,Number=1,Type=Integer,Description=\"Number of reads covering non-ref variant on forward strand\">\n")
    fv.write("##INFO=<ID=NR,Number=1,Type=Integer,Description=\"Number of reads covering non-ref variant on reverse strand\">\n")
    fv.write("##INFO=<ID=AF,Number=-1,Type=Float,Description=\"Allele frequency\">\n")
    fv.write("##INFO=<ID=DB,Number=0,Type=Flag,Description=\"dbSNP membership build 129 - type match and indel sequence length match within %d bp\">\n" % dbSNPWindow)
    fv.write("##FILTER=<ID=q%d,Description=\"Quality below %d\">\n" % (filterQual, filterQual))
    fv.write("##FILTER=<ID=s50,Description=\"Less than 50% of samples have data\">\n")
    fv.write("##FILTER=<ID=tc%d,Description=\"Indel site was closer than %d base pairs from another site with higher posterior probability\">\n" % (minDist, minDist))
    fv.write("##FILTER=<ID=hp%d,Description=\"Reference homopolymer length was longer than %d\">\n" % (maxHPLen, maxHPLen))
    if not doNotFilterOnFR:
        fv.write("##FILTER=<ID=fr0,Description=\"Non-ref allele is not covered by at least one read on both strands\">\n")
    fv.write("##FILTER=<ID=ocr,Description=\"Number of reads in haplotype window outside coverage range %d %d\">\n" % (coverageRange[0], coverageRange[1]))
    fv.write("##FILTER=<ID=mf,Description=\"Too low non-ref allele frequency\">\n")
    fv.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")

    for chr in chromosomes:
        if not pass_filters.has_key(chr):
            continue
        # merge variants that are too close into clusters: sites within
        # minDist of a preceding site collapse onto the same newPosition
        positions = sorted(pass_filters[chr].keys())
        newPosition = positions[:]
        done = False
        while not done:
            done = True
            for p in range(1, len(positions)):
                if newPosition[p] != newPosition[p - 1] and newPosition[p] - positions[p - 1] <= minDist:
                    newPosition[p] = newPosition[p - 1]
                    done = False
        newSites = {}
        for p in range(0, len(newPosition)):
            newPos = newPosition[p]
            pos = positions[p]
            if not newSites.has_key(newPos):
                newSites[newPos] = {}
            if not newSites[newPos].has_key(pos):
                newSites[newPos][pos] = []
            for var in varStat[chr][pos].keys():
                newSites[newPos][pos].append(var)
        print "New number of sites:", len(newSites.keys())
        print "Number of sites filtered:", len(pass_filters[chr].keys()) - len(newSites.keys())

        # select the best call for each cluster of sites; the others get the
        # tc filter
        filtered = []
        for newPos in newSites.keys():
            old = newSites[newPos].keys()
            pos_probs = []
            pos_vars = []
            pos_pos = []
            for oldPos in old:
                max_prob = -1.0
                max_var = ''
                for var in newSites[newPos][oldPos]:
                    prob = varStat[chr][oldPos][var]['QUAL']
                    if prob > max_prob:
                        max_prob = prob
                        max_var = var
                pos_probs.append(max_prob)
                pos_vars.append(max_var)
                pos_pos.append(oldPos)
            idx = pos_probs.index(max(pos_probs))
            okpos = pos_pos[idx]
            filtered.append(pos_pos[idx])
            for duppos in set(old) - set([okpos]):
                for var in varStat[chr][duppos].keys():
                    if varStat[chr][duppos][var]['filter'] == '':
                        varStat[chr][duppos][var]['filter'] = tcFilter
                    else:
                        varStat[chr][duppos][var]['filter'] += ';' + tcFilter
        print "Number of indel sites:", len(filtered)

        for pos in sorted(varStat[chr].keys()):
            for var in varStat[chr][pos].keys():
                indel_report_pos = pos
                #refall = fa.get(chr, pos+1, 1)
                # Phred-scale the posterior probability, capped at Q100
                qual = -int(10.0 * math.log10(max(1.0 - float(varStat[chr][pos][var]['QUAL']), 1e-10)))
                infofield = []
                for tag in ['AF', 'NS', 'DP', 'HP', 'NF', 'NR', 'NFS', 'NRS']:
                    val = varStat[chr][pos][var][tag]
                    infofield.append("%s=%s" % (tag, val))
                vnref = Variant.Variant(varString=var)
                max_del_len = 0
                if vnref.type == "del":
                    if vnref.length > max_del_len:
                        max_del_len = vnref.length
                seqlen = 1 + max_del_len
                refseq = ''.join(fa.get(chr, indel_report_pos, seqlen))
                if vnref.type == "del":
                    altseq = refseq[0] + refseq[(1 + vnref.length):]
                elif vnref.type == "ins":
                    altseq = refseq[0] + vnref.seq + refseq[1:]
                elif vnref.type == "snp":
                    indel_report_pos += 1
                    # fetch the reference base at the SNP position itself;
                    # refseq as fetched above has length 1 here
                    refseq = ''.join(fa.get(chr, indel_report_pos, 1))
                    altseq = vnref.seq[0]
                infostr = ';'.join(infofield)
                filterstr = varStat[chr][pos][var]['filter']
                if filterstr == '':
                    filterstr = 'PASS'
                id = '.'
                outstr = "%s\t%d\t%s\t%s\t%s\t%d\t%s\t%s\n" % (chr, indel_report_pos, id, refseq, altseq, qual, filterstr, infostr)
                fv.write(outstr)
    fv.close()
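
# getPercentiles is defined elsewhere in this script. Below is a minimal
# sketch of the behaviour processPooledGLFFiles relies on, assuming hist maps
# read depth -> number of sites, percs lists ascending cumulative percentiles
# such as [1, 99], and hist is non-empty (the helper name is hypothetical,
# not the original implementation):
def _getPercentilesSketch(hist, percs):
    total = sum(hist.values())
    cutoffs = [float(p) / 100.0 * total for p in percs]
    depths = sorted(hist.keys())
    results = []
    cum = 0
    ci = 0
    for depth in depths:
        cum += hist[depth]
        # emit a depth as soon as the cumulative count crosses a cutoff
        while ci < len(cutoffs) and cum >= cutoffs[ci]:
            results.append(depth)
            ci += 1
    # any remaining percentiles fall on the highest observed depth
    while ci < len(cutoffs):
        results.append(depths[-1])
        ci += 1
    return results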

def processDiploidGLFFile(glfFile='', variants={}, refFile='', maxHPLen=10,
                          isHomozygous=False, doNotFilterOnFR=False,
                          newVarCov=False, filterQual=20):
    # set up reference sequence
    fa = Fasta.Fasta(fname=refFile)

    # new calls will be added to variants
    fglf = FileUtils.FileWithHeader(fname=glfFile)
    numSkipped = 0  # number of windows that were skipped by Dindel

    # read line by line, aggregate results for identical windows
    prevPos = -1
    prevChr = -1
    prevDat = {}
    while True:
        dat = fglf.readline()
        if dat == {}:
            break
        errcode = dat['msg']
        index = dat['index']  # index of window in original variant file
        if errcode != "ok":
            numSkipped += 1
            continue
        if dat['analysis_type'] != 'dip.map':
            continue
        if dat['was_candidate_in_window'] != '1':
            continue
        glf = {}
        chrom = dat['tid']
        if chrom != prevChr:
            prevPos = -1
            prevChr = chrom
        glf['chr'] = dat['tid']
        glf['pos'] = dat['realigned_position']
        pos = int(glf['pos'])
        prevDat = dat
        glf['qual'] = int(float(dat['qual']))
        if float(glf['qual']) < 1.0:
            continue
        glf['nref_all'] = dat['nref_all'].split(',')
        if glf['nref_all'] == ['R=>D']:
            continue
        nfa = dat['var_coverage_forward'].split(',')
        nra = dat['var_coverage_reverse'].split(',')
        ai = 0
        glf['num_cover_forward'] = int(nfa[ai])
        glf['num_cover_reverse'] = int(nra[ai])
        glf['num_cover_forward_old'] = int(dat['num_cover_forward'])
        glf['num_cover_reverse_old'] = int(dat['num_cover_reverse'])
        glf['num_hap_reads'] = dat['num_reads']
        glf['genotype'] = dat['glf']
        (vcf_str, report_pos) = getVCFString(glf=glf, fa=fa, filterQual=filterQual)
        if not variants.has_key(chrom):
            variants[chrom] = {}
        if not variants[chrom].has_key(report_pos):
            variants[chrom][report_pos] = []
        variants[chrom][report_pos].append(vcf_str)
        prevPos = pos
    fglf.close()
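
# Usage sketch (file names hypothetical): variants is filled in place, so one
# dict can aggregate calls from several diploid GLF files before the VCF is
# written elsewhere:
#
#   variants = {}
#   for gf in ['sample.windows.1.glf.txt', 'sample.windows.2.glf.txt']:
#       processDiploidGLFFile(glfFile=gf, variants=variants,
#                             refFile='ref.fa', filterQual=20)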

def makeGLF(inputGLFFiles='', outputFile='', callFile='', bamfilesFile=''):
    # get VCF calls
    sys.stdout.write("Reading VCF file\n")
    calls = getCalls(callFile=callFile)
    sys.stdout.write("done\n")
    sys.stdout.flush()

    # read through glf files
    glffiles = loadGLFFiles(inputGLFFiles=inputGLFFiles)

    # get BAM files file
    bamfiles = []
    fb = open(bamfilesFile, 'r')
    for line in fb.readlines():
        bamfiles.append(line.rstrip("\n").split()[0])
    fb.close()

    # check each GLF file
    numwritten = 0

    # open output file
    fout = open(outputFile, 'w')
    for glffile in glffiles:
        sys.stdout.write("Checking %s\n" % glffile)
        fg = FileUtils.FileWithHeader(fname=glffile, mode='r')
        buffer = {}
        curr_index = '-1'
        while True:
            dat = fg.readline()
            if dat == {}:
                break
            # group lines belonging to the same window/position/variant
            newindex = "%s.%s.%s" % (dat['index'], dat['realigned_position'], dat['nref_all'])
            if not buffer.has_key(newindex):
                buffer[newindex] = []
            buffer[newindex].append(dat)
            if newindex != curr_index:
                if curr_index != '-1':
                    result = emptyBuffer(index=curr_index, buffer=buffer,
                                         calls=calls, outputFileHandle=fout,
                                         bamfiles=bamfiles)
                    if result == "a-ok":
                        numwritten += 1
                curr_index = newindex
        # flush the last buffered window of this file
        if curr_index != '-1':
            result = emptyBuffer(index=curr_index, buffer=buffer, calls=calls,
                                 outputFileHandle=fout, bamfiles=bamfiles)
            if result == "a-ok":
                numwritten += 1
        fg.close()
    print "Number written:", numwritten
    sys.stdout.flush()

    # finish up
    fout.close()
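
# Typical top-level invocation (all file names hypothetical):
#
#   makeGLF(inputGLFFiles='glf_files.txt', outputFile='selected.glf.txt',
#           callFile='calls.vcf', bamfilesFile='bamfiles.txt')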