def run(self):
    """
    2012.7.13
    Write a per-sample metadata table for the genotype VCF to self.outputFname.

    The genotype file is looked up through self.db_vervet using
    self.genotypeMethodID, self.chromosome and self.format. Every sample ID
    in the VCF is resolved to its individual via
    self.db_vervet.parseAlignmentReadGroup(), and six tab-delimited rows are
    written, each starting with a label followed by one value per sample:
    sampleID, ucla_id, tax_id, country_id, longitude, latitude.

    Side effects:
        - sets self.dataDir from self.db_vervet.data_dir if it was empty;
        - begins a transaction on self.db_vervet.session (it is not closed
          here);
        - exits the process with status 2 if the database has no matching
          genotype file;
        - if the file recorded in the database is not on disk, writes a
          warning to stderr and outputs nothing.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()
    if not self.dataDir:
        self.dataDir = self.db_vervet.data_dir

    genotypeFile = self.db_vervet.getGenotypeFile(
        genotype_method_id=self.genotypeMethodID,
        chromosome=self.chromosome,
        format=self.format)
    if not genotypeFile:
        sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"
                         % (self.genotypeMethodID, self.chromosome))
        sys.exit(2)

    filename = os.path.join(self.dataDir, genotypeFile.path)
    if not os.path.isfile(filename):
        sys.stderr.write("Warning: genotype file %s does not exist. Nothing written.\n" % filename)
        return

    from pymodule.VCFFile import VCFFile
    # allow 0 depth-> no missing data
    vcfFile = VCFFile(inputFname=filename, minDepth=0)

    # each row starts with its label, then one entry per sample
    sampleIDRow = ['sampleID']
    uclaIDRow = ['ucla_id']
    speciesIDRow = ['tax_id']
    countryIDRow = ['country_id']
    longitudeRow = ['longitude']
    latitudeRow = ['latitude']
    for sampleID in vcfFile.getSampleIDList():
        individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
        individual = individualAlignment.ind_sequence.individual
        sampleIDRow.append(sampleID)
        uclaIDRow.append(individual.ucla_id)
        speciesIDRow.append(individual.tax_id)
        countryIDRow.append(individual.site.country_id)
        longitudeRow.append(individual.longitude)
        latitudeRow.append(individual.latitude)

    # the with-block guarantees the output is flushed and closed, also on error
    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        for row in (sampleIDRow, uclaIDRow, speciesIDRow, countryIDRow,
                    longitudeRow, latitudeRow):
            writer.writerow(row)
def selectSubPopNoDB(self, columnindexlist, ind_id_ls, vcffilename):
    """
    2012.9.19
    Build the genotype matrix of a sub-population straight from a VCF file,
    without any database access.

    Arguments:
        columnindexlist: indices (0-based, into the VCF sample list) of the
            samples to extract.
        ind_id_ls: IDs of those samples; stored unchanged in the result.
        vcffilename: path of the VCF file to read.

    Returns:
        an hsContigDataStruct built from ind_id_ls, chrom_ls, ref_ls,
        snp_pos_ls, alt_ls (one entry per locus) and data, a float matrix
        with one row per locus and one column per selected sample:
            0  both alleles equal the reference base
            1  exactly one allele equals the reference base
            2  neither allele equals the reference base
            -9 missing call
        Returns None, after a warning on stderr, if vcffilename is not an
        existing file.
    """
    if not os.path.isfile(vcffilename):
        sys.stderr.write("Warning: VCF file %s does not exist.\n" % vcffilename)
        return None

    from pymodule.VCFFile import VCFFile
    vcfFile = VCFFile(inputFname=vcffilename, minDepth=0)
    # NOTE(review): the returned read-group names are not used here; the call is
    # kept because it is not visible from this code whether VCFFile relies on it
    # before iteration.
    vcfFile.getSampleIDList()

    chrom_ls = []
    snp_pos_ls = []
    ref_ls = []
    alt_ls = []
    datalist = []
    for vcfRecord in vcfFile:
        refBase = vcfRecord.refBase
        chrom_ls.append(vcfRecord.chr)
        snp_pos_ls.append(vcfRecord.pos)
        ref_ls.append(refBase)
        alt_ls.append(vcfRecord.altBase)

        data_row = []
        for columnIndex in columnindexlist:
            # vcfRecord.data_row is a list of dictionaries {'GT': base-call, 'DP': depth},
            # or None if missing. Index 0 is the reference individual (from the ref
            # column of the VCF), hence the +1.
            vcfCall = vcfRecord.data_row[columnIndex + 1]
            if vcfCall:
                alleles = vcfCall['GT']
                # number of non-reference alleles: 0, 1 or 2
                data_row.append((alleles[0] != refBase) + (alleles[1] != refBase))
            else:
                data_row.append(-9)
        datalist.append(data_row)

    sys.stderr.write("%s loci in %i individuals outputted.\n" % (len(datalist), len(columnindexlist)))
    # builtin float is what the removed np.float alias pointed to
    data = np.array(datalist, dtype=float)
    return hsContigDataStruct(ind_id_ls=np.array(ind_id_ls),
                              chrom_ls=np.array(chrom_ls), ref_ls=np.array(ref_ls),
                              snp_pos_ls=np.array(snp_pos_ls), alt_ls=np.array(alt_ls),
                              data=data)
def getVCFInd(self, uclaidlist):
    """
    2012.9.19
    Find the VCF sample columns that belong to a sub-population.

    The genotype file is looked up through self.db_vervet using
    self.genotypeMethodID, self.chromosome and self.format. Each read-group
    name in the VCF is resolved to its individual via
    self.db_vervet.parseAlignmentReadGroup(); samples whose ucla_id is in
    uclaidlist are selected.

    Arguments:
        uclaidlist: container of ucla_id values that make up the
            sub-population.

    Returns:
        (columnIndexList, ind_id_ls): the 0-based positions of the selected
        samples in the VCF sample list, and their ucla_ids in the same order.
        Returns None, after a warning on stderr, if the genotype file is not
        on disk.

    Exits the process with status 2 if the database has no matching genotype
    file. The database session is closed on every way out.
    """
    session = self.db_vervet.session
    session.begin()
    try:
        if not self.dataDir:
            self.dataDir = self.db_vervet.data_dir

        genotypeFile = self.db_vervet.getGenotypeFile(
            genotype_method_id=self.genotypeMethodID,
            chromosome=self.chromosome,
            format=self.format)
        if not genotypeFile:
            sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"
                             % (self.genotypeMethodID, self.chromosome))
            sys.exit(2)

        filename = os.path.join(self.dataDir, genotypeFile.path)
        if not os.path.isfile(filename):
            sys.stderr.write("Warning: genotype file %s does not exist.\n" % filename)
            return None

        from pymodule.VCFFile import VCFFile
        vcfFile = VCFFile(inputFname=filename, minDepth=0)

        columnIndexList = []
        ind_id_ls = []
        # getSampleIDList() is a list with the read-group names
        for i, readgroupID in enumerate(vcfFile.getSampleIDList()):
            individualAlignment = self.db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
            uclaid = individualAlignment.ind_sequence.individual.ucla_id
            if uclaid in uclaidlist:
                columnIndexList.append(i)
                ind_id_ls.append(uclaid)
        return (columnIndexList, ind_id_ls)
    finally:
        # also reached on sys.exit() and on exceptions
        session.close()
def createMetadataMat(self):
    """
    Collect per-sample metadata of the genotype VCF into self.metadata.

    The genotype file is looked up through self.db_vervet using
    self.genotypeMethodID, self.chromosome and self.format. Every sample ID
    in the VCF is resolved to its individual via
    self.db_vervet.parseAlignmentReadGroup().

    On success self.metadata is a list of five lists, each starting with a
    label followed by one value per sample, in VCF sample order:
        ['ucla_id', ...], ['country_id', ...], ['tax_id', ...],
        ['longitude', ...], ['latitude', ...]
    The sample IDs themselves are not part of self.metadata.

    Exits the process with status 2 if the database has no matching genotype
    file. If the genotype file is not on disk, a warning is written to stderr
    and self.metadata is left untouched. The database session is closed on
    every way out.
    """
    session = self.db_vervet.session
    session.begin()
    try:
        if not self.dataDir:
            self.dataDir = self.db_vervet.data_dir

        genotypeFile = self.db_vervet.getGenotypeFile(
            genotype_method_id=self.genotypeMethodID,
            chromosome=self.chromosome,
            format=self.format)
        if not genotypeFile:
            sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"
                             % (self.genotypeMethodID, self.chromosome))
            sys.exit(2)

        filename = os.path.join(self.dataDir, genotypeFile.path)
        if not os.path.isfile(filename):
            sys.stderr.write("Warning: genotype file %s does not exist. No metadata collected.\n" % filename)
            return

        from pymodule.VCFFile import VCFFile
        # allow 0 depth-> no missing data
        vcfFile = VCFFile(inputFname=filename, minDepth=0)

        uclaIDList = ['ucla_id']
        countryid_row = ['country_id']
        speciesid_row = ['tax_id']
        longitudeList = ['longitude']
        latitudeList = ['latitude']
        for sampleID in vcfFile.getSampleIDList():
            individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
            individual = individualAlignment.ind_sequence.individual
            uclaIDList.append(individual.ucla_id)
            countryid_row.append(individual.site.country_id)
            speciesid_row.append(individual.tax_id)
            longitudeList.append(individual.longitude)
            latitudeList.append(individual.latitude)

        self.metadata = [uclaIDList, countryid_row, speciesid_row, longitudeList, latitudeList]
    finally:
        # also reached on sys.exit() and on exceptions
        session.close()
def selectSubPop(self, uclaidlist):
    """
    2012.9.19
    Get the entries of the genotype VCF that correspond to a sub-population
    and return them as a genotype matrix.

    The genotype file is looked up through self.db_vervet using
    self.genotypeMethodID, self.chromosome and self.format. Each read-group
    name in the VCF is resolved to its individual via
    self.db_vervet.parseAlignmentReadGroup(); samples whose ucla_id is in
    uclaidlist are selected.

    Arguments:
        uclaidlist: container of ucla_id values that make up the
            sub-population.

    Returns:
        an hsContigDataStruct built from ind_id_ls (ucla_ids of the selected
        samples), chrom_ls, ref_ls, snp_pos_ls, alt_ls (one entry per locus)
        and data, a float matrix with one row per locus and one column per
        selected sample:
            0  both alleles equal the reference base
            1  exactly one allele equals the reference base
            2  neither allele equals the reference base
            -9 missing call
        Returns None, after a warning on stderr, if the genotype file is not
        on disk.

    Exits the process with status 2 if the database has no matching genotype
    file. The database session is closed on every way out.
    """
    session = self.db_vervet.session
    session.begin()
    try:
        if not self.dataDir:
            self.dataDir = self.db_vervet.data_dir

        genotypeFile = self.db_vervet.getGenotypeFile(
            genotype_method_id=self.genotypeMethodID,
            chromosome=self.chromosome,
            format=self.format)
        if not genotypeFile:
            sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"
                             % (self.genotypeMethodID, self.chromosome))
            sys.exit(2)

        filename = os.path.join(self.dataDir, genotypeFile.path)
        if not os.path.isfile(filename):
            sys.stderr.write("Warning: genotype file %s does not exist.\n" % filename)
            return None

        from pymodule.VCFFile import VCFFile
        vcfFile = VCFFile(inputFname=filename, minDepth=0)

        # pick the VCF columns of the requested individuals
        columnIndexList = []
        ind_id_ls = []
        # getSampleIDList() is a list with the read-group names
        for i, readgroupID in enumerate(vcfFile.getSampleIDList()):
            individualAlignment = self.db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
            uclaid = individualAlignment.ind_sequence.individual.ucla_id
            if uclaid in uclaidlist:
                columnIndexList.append(i)
                ind_id_ls.append(uclaid)

        chrom_ls = []
        snp_pos_ls = []
        ref_ls = []
        alt_ls = []
        datalist = []
        for vcfRecord in vcfFile:
            refBase = vcfRecord.refBase
            chrom_ls.append(vcfRecord.chr)
            snp_pos_ls.append(vcfRecord.pos)
            ref_ls.append(refBase)
            alt_ls.append(vcfRecord.altBase)

            data_row = []
            for columnIndex in columnIndexList:
                # vcfRecord.data_row is a list of dictionaries {'GT': base-call, 'DP': depth},
                # or None if missing. Index 0 is the reference individual (from the ref
                # column of the VCF), hence the +1.
                vcfCall = vcfRecord.data_row[columnIndex + 1]
                if vcfCall:
                    alleles = vcfCall['GT']
                    # number of non-reference alleles: 0, 1 or 2
                    data_row.append((alleles[0] != refBase) + (alleles[1] != refBase))
                else:
                    # numeric missing-data code; the former 'N' string could not be
                    # converted into the float matrix below
                    data_row.append(-9)
            datalist.append(data_row)

        sys.stderr.write("%s loci in %i individuals outputted.\n" % (len(datalist), len(columnIndexList)))
        # builtin float is what the removed np.float alias pointed to
        data = np.array(datalist, dtype=float)
        return hsContigDataStruct(ind_id_ls=np.array(ind_id_ls),
                                  chrom_ls=np.array(chrom_ls), ref_ls=np.array(ref_ls),
                                  snp_pos_ls=np.array(snp_pos_ls), alt_ls=np.array(alt_ls),
                                  data=data)
    finally:
        # also reached on sys.exit() and on exceptions
        session.close()
def run(self):
    """
    2012.7.13
    Dump the raw genotype calls of the genotype VCF as a tab-delimited table
    in self.outputFname.

    The genotype file is looked up through self.db_vervet using
    self.genotypeMethodID, self.chromosome and self.format. Every sample ID
    in the VCF is resolved to its individual via
    self.db_vervet.parseAlignmentReadGroup().

    Output layout:
        row 1: 'Chromosome', 'position', 'ref', then the sample IDs
        row 2: '-', '-', '-', then the tax_id of each sample
        row 3: '-', '-', '-', then the country_id of each sample's site
        then one row per locus: chromosome, position, the 'GT' of the
        reference individual, and the 'GT' of every sample ('NN' if the call
        is missing).

    Side effects:
        - sets self.dataDir from self.db_vervet.data_dir if it was empty;
        - begins a transaction on self.db_vervet.session (it is not closed
          here);
        - reports the number of loci on stderr;
        - exits the process with status 2 if the database has no matching
          genotype file;
        - if the file recorded in the database is not on disk, writes a
          warning to stderr and outputs nothing.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()
    if not self.dataDir:
        self.dataDir = self.db_vervet.data_dir

    genotypeFile = self.db_vervet.getGenotypeFile(
        genotype_method_id=self.genotypeMethodID,
        chromosome=self.chromosome,
        format=self.format)
    if not genotypeFile:
        sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"
                         % (self.genotypeMethodID, self.chromosome))
        sys.exit(2)

    filename = os.path.join(self.dataDir, genotypeFile.path)
    if not os.path.isfile(filename):
        sys.stderr.write("Warning: genotype file %s does not exist. Nothing written.\n" % filename)
        return

    from pymodule.VCFFile import VCFFile
    # allow 0 depth-> no missing data
    vcfFile = VCFFile(inputFname=filename, minDepth=0)

    header = ['Chromosome', 'position', 'ref']
    speciesid_row = ['-', '-', '-']
    countryid_row = ['-', '-', '-']
    columnIndexList = []
    for i, sampleID in enumerate(vcfFile.getSampleIDList()):
        individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
        individual = individualAlignment.ind_sequence.individual
        header.append(sampleID)
        columnIndexList.append(i)
        speciesid_row.append(individual.tax_id)
        countryid_row.append(individual.site.country_id)

    counter = 0
    # the with-block guarantees the output is flushed and closed, also on error
    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        writer.writerow(header)
        writer.writerow(speciesid_row)
        writer.writerow(countryid_row)
        for vcfRecord in vcfFile:
            # vcfRecord.data_row is a list of dictionaries {'GT': base-call, 'DP': depth},
            # or None if missing. Index 0 is the reference individual (from the ref
            # column of the VCF), so sample i sits at index i+1.
            refCall = vcfRecord.data_row[0]
            data_row = [vcfRecord.chr, vcfRecord.pos, refCall['GT']]
            for columnIndex in columnIndexList:
                vcfCall = vcfRecord.data_row[columnIndex + 1]
                data_row.append(vcfCall['GT'] if vcfCall else 'NN')
            writer.writerow(data_row)
            counter += 1
    sys.stderr.write("%s loci outputted.\n" % (counter))
def run(self):
    """
    2012.7.13
    Write the genotype VCF as a tab-delimited 0/1/2 table to self.outputFname.

    The genotype file is looked up through self.db_vervet using
    self.genotypeMethodID, self.chromosome and self.format.

    Output layout:
        row 1: 'Chromosome', 'position', 'ref', 'alt', then the sample IDs
        then one row per locus: chromosome, position, reference base,
        alternative base, and one genotype code per sample:
            0  both alleles equal the reference base
            1  exactly one allele equals the reference base
            2  neither allele equals the reference base
            N  missing call

    Side effects:
        - sets self.dataDir from self.db_vervet.data_dir if it was empty;
        - begins a transaction on self.db_vervet.session (it is not closed
          here);
        - calls self.db_vervet.parseAlignmentReadGroup() for every sample ID,
          so an ID that cannot be resolved fails before anything is written;
        - reports the number of loci on stderr;
        - exits the process with status 2 if the database has no matching
          genotype file;
        - if the file recorded in the database is not on disk, writes a
          warning to stderr and outputs nothing.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()
    if not self.dataDir:
        self.dataDir = self.db_vervet.data_dir

    genotypeFile = self.db_vervet.getGenotypeFile(
        genotype_method_id=self.genotypeMethodID,
        chromosome=self.chromosome,
        format=self.format)
    if not genotypeFile:
        sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"
                         % (self.genotypeMethodID, self.chromosome))
        sys.exit(2)

    filename = os.path.join(self.dataDir, genotypeFile.path)
    if not os.path.isfile(filename):
        sys.stderr.write("Warning: genotype file %s does not exist. Nothing written.\n" % filename)
        return

    from pymodule.VCFFile import VCFFile
    vcfFile = VCFFile(inputFname=filename, minDepth=0)

    header = ['Chromosome', 'position', 'ref', 'alt']
    columnIndexList = []
    for i, sampleID in enumerate(vcfFile.getSampleIDList()):
        # the lookup result is only used as a check that the read group is known
        # to the database; every sample is kept
        self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment.ind_sequence.individual.site
        header.append(sampleID)
        columnIndexList.append(i)

    counter = 0
    # the with-block guarantees the output is flushed and closed, also on error
    with open(self.outputFname, 'w') as outputFile:
        writer = csv.writer(outputFile, delimiter='\t')
        writer.writerow(header)
        for vcfRecord in vcfFile:
            refBase = vcfRecord.refBase
            data_row = [vcfRecord.chr, vcfRecord.pos, refBase, vcfRecord.altBase]
            for columnIndex in columnIndexList:
                # vcfRecord.data_row is a list of dictionaries {'GT': base-call, 'DP': depth},
                # or None if missing. Index 0 is the reference individual (from the ref
                # column of the VCF), hence the +1.
                vcfCall = vcfRecord.data_row[columnIndex + 1]
                if vcfCall:
                    alleles = vcfCall['GT']
                    # number of non-reference alleles: 0, 1 or 2
                    data_row.append((alleles[0] != refBase) + (alleles[1] != refBase))
                else:
                    data_row.append('N')
            writer.writerow(data_row)
            counter += 1
    sys.stderr.write("%s loci outputted.\n" % (counter))
def run(self):
    """
    2012.7.13
    Compute a pairwise genotype distance matrix between samples and save it
    to self.outputFname with np.savetxt.

    The genotype file is looked up through self.db_vervet using
    self.genotypeMethodID, self.chromosome and self.format. Only samples
    whose individual has target_coverage == 10 are used.

    Each call is coded by comparing its two alleles with the 'GT' of the
    reference individual (vcfRecord.data_row[0]):
        0  both alleles equal it
        1  exactly one allele equals it
        2  neither allele equals it
    The distance between two samples is the mean, over loci, of the absolute
    difference of their codes, so it lies between 0 and 2. Loci where either
    of the two samples has a missing call are left out of that pair's mean
    (with no missing calls this is the plain mean over all loci); a pair
    without any shared locus gets nan.

    Side effects:
        - sets self.dataDir from self.db_vervet.data_dir if it was empty;
        - begins a transaction on self.db_vervet.session (it is not closed
          here);
        - reports the number of loci on stderr;
        - prints the country ids of the samples used to stdout;
        - exits the process with status 2 if the database has no matching
          genotype file;
        - if the file recorded in the database is not on disk, writes a
          warning to stderr and outputs nothing.
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()
    if not self.dataDir:
        self.dataDir = self.db_vervet.data_dir

    genotypeFile = self.db_vervet.getGenotypeFile(
        genotype_method_id=self.genotypeMethodID,
        chromosome=self.chromosome,
        format=self.format,
    )
    if not genotypeFile:
        sys.stderr.write(
            "Error: genotype_method_id %s, chromosome %s does not exist.\n"
            % (self.genotypeMethodID, self.chromosome)
        )
        sys.exit(2)

    filename = os.path.join(self.dataDir, genotypeFile.path)
    if not os.path.isfile(filename):
        sys.stderr.write("Warning: genotype file %s does not exist. Nothing written.\n" % filename)
        return

    import numpy as np
    from pymodule.VCFFile import VCFFile

    # allow 0 depth-> no missing data
    vcfFile = VCFFile(inputFname=filename, minDepth=0)

    columnIndexList = []
    countryidList = []
    for i, sampleID in enumerate(vcfFile.getSampleIDList()):
        individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
        individual = individualAlignment.ind_sequence.individual
        if individual.target_coverage == 10:
            columnIndexList.append(i)
            countryidList.append(individual.site.country_id)

    genotypeMat = []
    for vcfRecord in vcfFile:
        # vcfRecord.data_row is a list of dictionaries {'GT': base-call, 'DP': depth},
        # or None if missing. Index 0 is the reference individual (from the ref
        # column of the VCF), so sample i sits at index i+1.
        refGT = vcfRecord.data_row[0]["GT"]
        data_row = []
        for columnIndex in columnIndexList:
            vcfCall = vcfRecord.data_row[columnIndex + 1]
            if vcfCall:
                alleles = vcfCall["GT"]
                # number of alleles differing from the reference call: 0, 1 or 2
                data_row.append((alleles[0] != refGT) + (alleles[1] != refGT))
            else:
                # nan instead of the former "NN" string, which could not be
                # converted into a numeric matrix
                data_row.append(np.nan)
        genotypeMat.append(data_row)
    sys.stderr.write("%s loci outputted.\n" % (len(genotypeMat)))

    # calculate distance matrix; the explicit reshape keeps the array
    # two-dimensional even without loci or without samples
    sampleCount = len(columnIndexList)
    matArr = np.array(genotypeMat, dtype=float).reshape(len(genotypeMat), sampleCount)
    distArr = np.empty((sampleCount, sampleCount))
    # 0/0 for pairs without a shared locus is meant to yield nan silently
    with np.errstate(invalid="ignore", divide="ignore"):
        for i in range(sampleCount):
            # |sample_i - sample_j| for all j at once; nan where either is missing
            absDiff = np.abs(matArr - matArr[:, i:i + 1])
            sharedLoci = np.sum(~np.isnan(absDiff), axis=0)
            # normalise so that distance is between 0 and 2
            distArr[i] = np.nansum(absDiff, axis=0) / sharedLoci
    np.savetxt(self.outputFname, distArr)
    print(countryidList)