def writeEvalOutput(self):
    """Write the concordance report and the flattened genotype matrix.

    Written to ``self.outputfh``, in order: the genotype concordance table
    (rows/columns labelled AA, AB, BB, ./. for indices 0-3), the sum of its
    cells, the 2x2 called/no-call table (recomputed here into
    ``self.calledtable``), the sum of its cells, and the NRD and NRS
    percentages.  Rows are evaluation genotypes, columns are comparison
    genotypes.  Finally the concordance table is written as a single
    comma-separated line to ``self.genotypematrixfh``.

    NRD or NRS is reported as ``NA`` when its denominator is zero (e.g. no
    genotypes were counted) rather than raising ZeroDivisionError, so the
    genotype matrix is always written.
    """
    counts = self.concordancetable
    out = self.outputfh

    def write_table(table, labels):
        # Header line, one labelled tab-separated line per row, then the
        # sum of every cell in the table.
        out.write("rows are eval genotypes columns comparison genotypes\n")
        out.write("\t".join([''] + labels) + "\n")
        for (index, label) in enumerate(labels):
            # tolist() on the row slice is expected to be nested (one inner
            # list per row), which is what the join below relies on.
            for cells in table[index, :].tolist():
                out.write(label + "\t" + "\t".join(map(str, cells)) + "\n")
        out.write("matrix sum: \n")
        out.write(str(np.sum(table)) + "\n")

    def cell_total(cells):
        # Sum the counts stored at the given (eval, comparison) indices.
        return sum(counts[row, col] for (row, col) in cells)

    def percentage(numerator, denominator):
        # Percentage rounded to two places; "NA" when it is undefined.
        if denominator == 0:
            return "NA"
        return round(float(numerator) / float(denominator) * 100, 2)

    write_table(counts, ['AA', 'AB', 'BB', './.'])

    # Collapse indices 0-2 (AA, AB, BB) into a single "called" class and
    # keep index 3 (./.) as the "not called" class.
    self.calledtable[0, 0] = counts[0:3, 0:3].sum()
    self.calledtable[0, 1] = counts[0:3, 3].sum()
    self.calledtable[1, 0] = counts[3, 0:3].sum()
    self.calledtable[1, 1] = counts[3, 3]
    out.write("\n")
    write_table(self.calledtable, ['called', './.'])
    out.write("\n")

    # NRD: off-diagonal called cells over all called cells except AA/AA.
    discordant_cells = [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
    discordance = cell_total(discordant_cells)
    total = discordance + cell_total([(1, 1), (2, 2)])
    nrd = percentage(discordance, total)

    # NRS: cells where both genotypes are AB/BB, over every cell whose
    # comparison genotype is AB/BB (any evaluation genotype, ./. included).
    variant_count_evaluation = cell_total([(1, 1), (1, 2), (2, 1), (2, 2)])
    variant_count_comparison = variant_count_evaluation + cell_total(
        [(0, 1), (0, 2), (3, 1), (3, 2)])
    nrs = percentage(variant_count_evaluation, variant_count_comparison)

    # The labels (including the missing colon after "NRS") are kept exactly
    # as before in case downstream tooling parses this report.
    out.write("NRD: " + str(nrd) + " \n")
    out.write("NRS " + str(nrs) + " \n")

    # melt_lol is defined elsewhere; it is expected to flatten the nested
    # list into a single sequence of counts.
    outstring = ",".join(map(str, melt_lol(counts.tolist())))
    self.genotypematrixfh.write(outstring + "\n")
def main():
    """Split the records of an NRD log VCF by how many sample pairs disagree.

    Samples are taken two at a time as (comparison, evaluation) pairs.  For
    every record, each pair whose genotype types differ is appended to the
    output line, which starts with the record position and the value of its
    ``set`` INFO key.  The line is then written to

    * ``<basename>.allgenos.nrd.txt`` when 3 pairs are discordant,
    * ``<basename>.twogenos.nrd.txt`` when 2 pairs are discordant,
    * ``<basename>.onegenos.nrd.txt`` when 1 pair is discordant.

    The three buckets reflect the trio this script was written for; records
    with any other number of discordant pairs are not written anywhere.

    All files are closed on exit, including when an error interrupts the
    run, and a missing file argument produces a usage error instead of an
    IndexError.
    """
    usage = "usage: %prog [options] nrd.log.vcf\n"
    parser = OptionParser(usage)
    (_, args) = parser.parse_args()
    if not args:
        parser.error("missing input VCF file")
    vcfilename = args[0]
    basename = os.path.splitext(vcfilename)[0]

    vcfobj = VcfFile(vcfilename)
    with open(vcfilename, "r") as vcfh, \
            open(basename + ".allgenos.nrd.txt", "w") as nrdallfh, \
            open(basename + ".twogenos.nrd.txt", "w") as nrdtwofh, \
            open(basename + ".onegenos.nrd.txt", "w") as nrdonefh:
        # Number of discordant pairs -> file that receives the record.
        destinations = {3: nrdallfh, 2: nrdtwofh, 1: nrdonefh}

        vcfobj.parseMetaAndHeaderLines(vcfh)
        samples = vcfobj.getSampleList()

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # Position first, then the callset ("set") the site belongs to.
            outputline = [[vrec.getPos()], [vrec.returnInfoDict()["set"]]]

            # Genotypes aggregated per sample.
            vrec_ziptuple = vrec.zipGenotypes(samples)

            nrd_count = 0
            # "evaluation" rather than "eval" so the builtin is not shadowed.
            for (compare, evaluation) in grouper(2, vrec_ziptuple):
                (comp_allele1, comp_allele2) = compare[1].getAlleles()
                (eval_allele1, eval_allele2) = evaluation[1].getAlleles()
                eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
                comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)
                if eval_alleletype == comp_alleletype:
                    continue
                outputline.append([evaluation[0], str(eval_alleletype),
                                   compare[0], str(comp_alleletype)])
                nrd_count += 1

            destination = destinations.get(nrd_count)
            if destination is not None:
                destination.write("\t".join(melt_lol(outputline)) + "\n")
def main():
    """Compare an evaluation VCF against a gold-standard VCF, record by record.

    Both VCFs are read in lockstep and must list the same sites in the same
    order; the first CHROM or POS mismatch is reported on stderr and the
    program exits with status 1.  For every sample present in both records
    the (eval, gold) genotype-type pair is counted in the per-sample
    ``VcfSampleEval`` object and in an overall 4x4 table.  A ``gt_type`` of
    None is recoded as 3 (presumably the no-call class).

    Outputs, all in the current directory:

    * ``NRS.log``: discordant calls where eval is 0 or 3 and gold is 1 or 2.
    * ``NRD.log``: discordant calls where eval is not 3.
    * stdout: a ``Sample / NRS / NRD`` table, one line per sample.
    * ``genotype.matrix.csv``: the overall table as one comma-separated line.
    * ``concordance.txt``: created empty, as before.

    Fixes relative to the previous version: the POS-mismatch message no
    longer fails with a TypeError when POS is an int, every file is closed,
    the overall matrix is actually written to ``genotype.matrix.csv`` (it
    used to be computed and then discarded), and both VCF arguments are
    required up front.
    """
    parser = argparse.ArgumentParser(description='Calculate non-reference sensitivity (NRS) and non-reference discrepancy (NRD) of VCF files with the same records')
    parser.add_argument("-goldvcf", dest='gold', required=True, help="VCF with gold standard genotypes you want to compare to")
    parser.add_argument("-evalvcf", dest='eval', required=True, help="VCF you want to evaluate against the gold standard")
    args = parser.parse_args()

    # Rows are eval genotype types, columns are gold genotype types.
    concordancetable = np.matrix([[0, 0, 0, 0] for _ in range(4)])

    # itertools.izip only exists on Python 2; on Python 3 the builtin zip is
    # already lazy.  Either way the two VCFs are streamed, not loaded whole.
    lazy_zip = getattr(itertools, "izip", zip)

    vcf_sample_eval_objects = []
    common_samples = []
    first_record = True

    with open('NRS.log', 'w') as nrsfh, open('NRD.log', 'w') as nrdfh, \
            open(args.eval, 'r') as evalfh, open(args.gold, 'r') as goldfh:
        vcf_readerOne = vcf.Reader(evalfh, compressed=True)
        vcf_readerTwo = vcf.Reader(goldfh, compressed=True)
        vcf_gen1 = py_recordgen(vcf_readerOne)
        vcf_gen2 = py_recordgen(vcf_readerTwo)

        sys.stderr.write("computing per-sample concordance ....\n")
        for vrec1, vrec2 in lazy_zip(vcf_gen1, vcf_gen2):
            if vrec1.CHROM != vrec2.CHROM:
                sys.stderr.write("chromosome number does not match!\n")
                sys.stderr.write(str(vrec1.CHROM) + " " + str(vrec2.CHROM) + "\n")
                sys.exit(1)
            if vrec1.POS != vrec2.POS:
                sys.stderr.write("chromosome POS does not match!\n")
                # str(): concatenating a numeric POS directly raised TypeError.
                sys.stderr.write(str(vrec1.POS) + " " + str(vrec2.POS) + "\n")
                sys.exit(1)

            if vrec1.ID is None:
                vrec1.ID = '.'
            if vrec2.ID is None:
                vrec2.ID = '.'

            # Samples present in both records, in the eval record's order.
            gold_names = set(call.sample for call in vrec2.samples)
            common_samples = [call.sample for call in vrec1.samples
                              if call.sample in gold_names]

            # The per-sample evaluators are created once, from the first record.
            if first_record:
                vcf_sample_eval_objects = [VcfSampleEval('gold', 'eval', name)
                                           for name in common_samples]
                first_record = False

            for eval_obj, sample in zip(vcf_sample_eval_objects, common_samples):
                eval_gt = vrec1.genotype(sample).gt_type
                gold_gt = vrec2.genotype(sample).gt_type
                if eval_gt is None:
                    eval_gt = 3
                if gold_gt is None:
                    gold_gt = 3

                eval_obj.incrementcellcount(eval_gt, gold_gt)
                concordancetable[eval_gt, gold_gt] += 1

                if eval_gt == gold_gt:
                    continue

                # The sample name appears twice; the column layout is kept
                # exactly as it was so existing log consumers still work.
                logline = "\t".join([str(vrec1.CHROM), str(vrec1.POS), sample,
                                     vrec1.ID, sample, str(eval_gt),
                                     str(gold_gt)]) + "\n"
                if eval_gt in (0, 3) and gold_gt in (1, 2):
                    nrsfh.write(logline)
                if eval_gt != 3:
                    nrdfh.write(logline)

    # concordance.txt has always been created (and left empty) by this
    # script; keep creating it so the set of output files does not change.
    open("concordance.txt", 'w').close()

    with open("genotype.matrix.csv", 'w') as matrixfh:
        sys.stdout.write("Sample\tNRS\tNRD\n")
        for (eval_obj, sample) in zip(vcf_sample_eval_objects, common_samples):
            (NRS, NRD) = eval_obj.returnNRS_NRD()
            sys.stdout.write("\t".join([sample, str(NRS), str(NRD)]) + "\n")
            eval_obj.write_genotype_matrix()

        # Previously this string was built but never written anywhere.
        outstring = ",".join(map(str, melt_lol(concordancetable.tolist())))
        matrixfh.write(outstring + "\n")
def _percent_or_na(numerator, denominator):
    """Return numerator/denominator as a percentage rounded to two places.

    Returns the string "NA" when the denominator is zero.
    """
    if denominator == 0:
        return "NA"
    return round(float(numerator) / float(denominator) * 100, 2)


def _write_count_table(fh, table, labels):
    """Write *table* to *fh* as labelled tab-separated rows plus its cell sum."""
    fh.write("rows are eval genotypes columns comparison genotypes\n")
    fh.write("\t".join([''] + labels) + "\n")
    for (index, label) in enumerate(labels):
        # A row slice of an np.matrix converts to a nested list.
        for cells in table[index, :].tolist():
            fh.write(label + "\t" + "\t".join(map(str, cells)) + "\n")
    fh.write("matrix sum: \n")
    fh.write(str(np.sum(table)) + "\n")


def main():
    """Compute genotype concordance, NRD and NRS from a merged, gzipped VCF.

    Samples are taken two at a time; the first of each pair is the
    comparison genotype and the second the evaluation genotype.  Every pair
    of every analysed record is counted in a 4x4 table (rows: evaluation,
    columns: comparison; indices 0-3 are labelled AA, AB, BB, ./.).

    Output files, all named ``<basename>.<suffix>`` where basename is the
    input name with two extensions removed:

    * ``variantEval.txt``: record count, concordance table, called/no-call
      table and, unless ``--matrixonly``, NRD and NRS.
    * ``nrs.log``: records where eval is 0 or 3 and comparison is 1 or 2.
    * ``nrd.log``: records where both types are in 0-2 and differ.
    * ``concordance.log``: records where both types are in 0-2 and match.
    * ``filtered.log``: records skipped as filtered.
    * ``multiallelic.log``: records with a comma in ALT (still analysed).
    * ``genotype.matrix.csv``: concordance table as one CSV line.
    * ``fields.log``: the ``set`` value of every analysed record.

    Raises:
        ValueError: if an analysed record's INFO has no ``;set=`` entry.

    NRD or NRS is written as ``NA`` when its denominator is zero.
    """
    usage = "usage: %prog [options] file.vcf.gz \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
    parser = OptionParser(usage)
    parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False)
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)
    (options, args) = parser.parse_args()
    if not args:
        parser.error("missing input file.vcf.gz")
    vcfilename = args[0]
    # Strip two extensions, e.g. "calls.vcf.gz" -> "calls".
    basename = os.path.splitext(os.path.splitext(vcfilename)[0])[0]

    # Row is eval, column is comparison.
    concordancetable = np.matrix([[0, 0, 0, 0] for _ in range(4)])
    calledtable = np.matrix([[0, 0], [0, 0]])

    set_pattern = re.compile(r';set=(\S+)')
    totalrecords = 0

    with open(basename + ".variantEval.txt", 'w') as outputfh, \
            open(basename + ".nrs.log", 'w') as nrsfh, \
            open(basename + ".nrd.log", 'w') as nrdfh, \
            open(basename + ".filtered.log", 'w') as filteredfh, \
            open(basename + ".multiallelic.log", 'w') as multifh, \
            open(basename + ".concordance.log", 'w') as concordancefh, \
            open(basename + ".genotype.matrix.csv", 'w') as genotypematrixfh, \
            open(basename + ".fields.log", 'w') as fieldsfh:
        fieldsfh.write('set' + "\n")

        vcfobj = VcfFile(vcfilename)
        with gzip.open(vcfilename, 'r') as vcfh:
            vcfobj.parseMetaAndHeaderLines(vcfh)
            header = vcfobj.returnHeader() + "\n"
            for logfh in (nrsfh, nrdfh, filteredfh, concordancefh, multifh):
                logfh.write(header)

            samples = vcfobj.getSampleList()
            # Not used below; still constructed in case VcfSampleEval has
            # side effects on construction that later steps rely on.
            vcf_sample_eval_objects = [VcfSampleEval(compare, evaluation, basename)
                                       for (compare, evaluation) in grouper(2, samples)]

            for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
                # Was "',' in vrec.getAlt() > 1", a chained comparison that
                # only behaved as intended through Python 2's str/int ordering.
                if ',' in vrec.getAlt():
                    # Logged only; multi-allelic records are still analysed.
                    multifh.write(vrec.toStringwithGenotypes() + "\n")

                info = vrec.getInfo()

                # Skip hom-ref-in-all sites unless explicitly requested.
                if 'ReferenceInAll' in info and not options.includeRef:
                    continue

                # Skip (and log) filtered sites.
                if 'filterIn' in info and not options.includeFilter:
                    filteredfh.write(vrec.toStringwithGenotypes() + "\n")
                    continue
                if 'FilteredInAll' in info:
                    filteredfh.write(vrec.toStringwithGenotypes() + "\n")
                    continue

                # A list [(samplename, vcfgenotype), ...].
                vrec_ziptuple = vrec.zipGenotypes(samples)

                # Which callset does the record belong to?
                set_match = set_pattern.search(info)
                if set_match is None:
                    raise ValueError("no ';set=' annotation in INFO field: " + info)
                fieldsfh.write(set_match.group(1) + "\n")
                totalrecords += 1

                # Records are taken two at a time: the first is the
                # comparison genotype, the second the evaluation genotype.
                for (compare, evaluation) in grouper(2, vrec_ziptuple):
                    (comp_allele1, comp_allele2) = compare[1].getAlleles()
                    (eval_allele1, eval_allele2) = evaluation[1].getAlleles()
                    eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
                    comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)

                    concordancetable[eval_alleletype, comp_alleletype] += 1

                    # NRS penalty: eval is AA or ./. where comparison is AB/BB.
                    if eval_alleletype in (0, 3) and comp_alleletype in (1, 2):
                        nrsfh.write(vrec.toStringwithGenotypes() + "\n")

                    # Called eval genotype: concordant if equal, otherwise an
                    # NRD penalty when the comparison genotype is also called.
                    if eval_alleletype in (0, 1, 2):
                        if comp_alleletype == eval_alleletype:
                            concordancefh.write(vrec.toStringwithGenotypes() + "\n")
                        elif comp_alleletype in (0, 1, 2):
                            nrdfh.write(vrec.toStringwithGenotypes() + "\n")

        outputfh.write("total records analyzed: " + str(totalrecords) + "\n")
        _write_count_table(outputfh, concordancetable, ['AA', 'AB', 'BB', './.'])

        # How many genotype pairs were called or not called on each side.
        calledtable[0, 0] = concordancetable[0:3, 0:3].sum()
        calledtable[0, 1] = concordancetable[0:3, 3].sum()
        calledtable[1, 0] = concordancetable[3, 0:3].sum()
        calledtable[1, 1] = concordancetable[3, 3]
        outputfh.write("\n")
        _write_count_table(outputfh, calledtable, ['called', './.'])
        outputfh.write("\n")

        outstring = ",".join(map(str, melt_lol(concordancetable.tolist())))
        genotypematrixfh.write(outstring + "\n")

        if not options.matrixonly:
            c = concordancetable
            # Off-diagonal cells among called genotypes.
            discordance = c[0, 1] + c[0, 2] + c[1, 0] + c[1, 2] + c[2, 0] + c[2, 1]
            # All called cells except AA/AA.
            total = discordance + c[1, 1] + c[2, 2]
            nrd = _percent_or_na(discordance, total)

            # Both genotypes AB/BB, over every cell whose comparison is AB/BB.
            variant_count_evaluation = c[1, 1] + c[1, 2] + c[2, 1] + c[2, 2]
            variant_count_comparison = (variant_count_evaluation
                                        + c[0, 1] + c[0, 2] + c[3, 1] + c[3, 2])
            nrs = _percent_or_na(variant_count_evaluation, variant_count_comparison)

            outputfh.write("NRD: " + str(nrd) + " \n")
            outputfh.write("NRS " + str(nrs) + " \n")
def write_genotype_matrix(self):
    """Write the concordance table as one comma-separated line of numbers.

    The table is converted to a nested list, passed through ``melt_lol``
    (defined elsewhere; presumably flattens it), and the resulting values
    are written to ``self.genotypematrixfh`` followed by a newline.
    """
    melted = melt_lol(self.concordancetable.tolist())
    csv_line = ",".join(str(value) for value in melted)
    self.genotypematrixfh.write(csv_line + "\n")