Python VCFFile.getSampleIDList示例，pymodule.VCFFile.VCFFile.getSampleIDList Python示例

示例#1

0

显示文件

文件： hsCreateMetadataMatrix.py 项目： mjmontague/vervet-web

	def run(self):
		"""
		2012.7.13
			Write a tab-delimited metadata table for the samples of one genotype VCF file.

			The genotype file is looked up through self.db_vervet by self.genotypeMethodID,
			self.chromosome and self.format, and resolved relative to self.dataDir (which
			falls back to self.db_vervet.data_dir when unset).

			self.outputFname receives six rows, each led by its label: sampleID, ucla_id,
			tax_id, country_id, longitude, latitude. Every following column is one sample,
			in the order returned by VCFFile.getSampleIDList().

			Exits with status 2 if the database has no matching genotype file. Writes
			nothing if the file is registered but not present on disk.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		# NOTE(review): the session begun here is never closed in this method -- confirm the caller takes care of it.
		session = self.db_vervet.session
		session.begin()
		if not self.dataDir:
			self.dataDir = self.db_vervet.data_dir

		genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
		if not genotypeFile:
			sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
			sys.exit(2)

		filename = os.path.join(self.dataDir, genotypeFile.path)
		if not os.path.isfile(filename):
			return

		from pymodule.VCFFile import VCFFile
		#allow 0 depth-> no missing data
		vcfFile = VCFFile(inputFname=filename, minDepth=0)

		#every row starts with its own label; one value per sample is appended below
		sampleid_row = ['sampleID']
		uclaid_row = ['ucla_id']
		speciesid_row = ['tax_id']
		countryid_row = ['country_id']
		longitude_row = ['longitude']
		latitude_row = ['latitude']
		for sampleID in vcfFile.getSampleIDList():
			individual = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment.ind_sequence.individual
			sampleid_row.append(sampleID)
			uclaid_row.append(individual.ucla_id)
			speciesid_row.append(individual.tax_id)
			countryid_row.append(individual.site.country_id)
			longitude_row.append(individual.longitude)
			latitude_row.append(individual.latitude)

		#the with-block guarantees the output is flushed and closed, even if a row fails to write
		with open(self.outputFname, 'w') as outputFile:
			writer = csv.writer(outputFile, delimiter='\t')
			writer.writerows([sampleid_row, uclaid_row, speciesid_row, countryid_row, longitude_row, latitude_row])

示例#2

0

显示文件

文件： hsExtractDataTool.py 项目： mjmontague/vervet-web

	def selectSubPopNoDB(self,columnindexlist,ind_id_ls,vcffilename):
		"""
		2012.9.19
			Read the genotypes of selected samples from a VCF file, without any database access.

			columnindexlist: 0-based positions of the wanted samples in the VCF sample list
			ind_id_ls: individual IDs, stored unchanged in the returned structure
			vcffilename: path of the VCF file

			Returns a hsContigDataStruct. Its data is a float matrix with one row per VCF
			record and one column per entry of columnindexlist, holding the number of the
			call's first two GT entries that differ from the record's refBase (0, 1 or 2),
			or -9 where the call is missing.
			Returns None when vcffilename is not an existing file.
		"""
		if not os.path.isfile(vcffilename):
			return None

		from pymodule.VCFFile import VCFFile
		vcfFile = VCFFile(inputFname=vcffilename, minDepth=0)
		# NOTE(review): the returned read-group names are not used here; the call is kept
		# only in case it has side effects on vcfFile -- confirm and drop it.
		vcfFile.getSampleIDList()

		chrom_ls=[]; ref_ls=[]; snp_pos_ls=[]; alt_ls=[]
		datalist = []
		for vcfRecord in vcfFile:
			refBase = vcfRecord.refBase
			chrom_ls.append(vcfRecord.chr)
			snp_pos_ls.append(vcfRecord.pos)
			ref_ls.append(refBase)
			alt_ls.append(vcfRecord.altBase)

			data_row = []
			for columnIndex in columnindexlist:
				#vcfRecord.data_row holds one extra entry at index 0 (the reference individual), hence the +1.
				#each entry is a dictionary {'GT': base-call, 'DP': depth}, or None if missing.
				vcfCall = vcfRecord.data_row[columnIndex+1]
				if not vcfCall:
					data_row.append(-9)
					continue
				firstAllele = vcfCall['GT'][0]
				secondAllele = vcfCall['GT'][1]
				#count of non-reference alleles: 0, 1 or 2
				data_row.append(2 - int(firstAllele==refBase) - int(secondAllele==refBase))
			datalist.append(data_row)
		sys.stderr.write("%s loci in %i individuals outputted.\n"%(len(datalist),len(columnindexlist)))

		#builtin float is the dtype the removed np.float alias used to stand for
		data = np.array(datalist, dtype=float)
		return hsContigDataStruct(ind_id_ls=np.array(ind_id_ls), chrom_ls=np.array(chrom_ls), ref_ls=np.array(ref_ls), \
			snp_pos_ls=np.array(snp_pos_ls), alt_ls=np.array(alt_ls), data=data)

示例#3

0

显示文件

文件： hsExtractDataTool.py 项目： mjmontague/vervet-web

	def getVCFInd(self,uclaidlist):
		"""
		2012.9.19
			Find the VCF sample columns that belong to the individuals named in uclaidlist.

			Each sample ID of the genotype VCF file (selected by self.genotypeMethodID,
			self.chromosome, self.format) is resolved through self.db_vervet to an
			individual; samples whose ucla_id is in uclaidlist are kept.

			Returns (columnIndexList, ind_id_ls): the 0-based positions of the kept samples
			in the VCF sample list, and their ucla_ids in the same order.
			Returns None when the genotype file is registered but not present on disk.
			Exits with status 2 if the database has no matching genotype file.
		"""
		session = self.db_vervet.session
		session.begin()
		try:
			if not self.dataDir:
				self.dataDir = self.db_vervet.data_dir

			genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
			if not genotypeFile:
				sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
				sys.exit(2)

			filename = os.path.join(self.dataDir, genotypeFile.path)
			if not os.path.isfile(filename):
				return None

			from pymodule.VCFFile import VCFFile
			vcfFile = VCFFile(inputFname=filename, minDepth=0)

			columnIndexList = []
			ind_id_ls = []
			#the VCF sample IDs are read-group names
			for i, readgroupID in enumerate(vcfFile.getSampleIDList()):
				individualAlignment = self.db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
				uclaid = individualAlignment.ind_sequence.individual.ucla_id
				if uclaid in uclaidlist:
					columnIndexList.append(i)
					ind_id_ls.append(uclaid)
			return (columnIndexList, ind_id_ls)
		finally:
			#close on every exit path, so a missing file or an error does not leave the session open
			session.close()

示例#4

0

显示文件

文件： hsExtractDataTool.py 项目： mjmontague/vervet-web

	def createMetadataMat(self):
		"""
		Collect per-sample metadata of the genotype VCF file into self.metadata.

		Each sample ID of the genotype VCF file (selected by self.genotypeMethodID,
		self.chromosome, self.format) is resolved through self.db_vervet to an individual.

		self.metadata is set to a list of five lists, in this order: ucla_id, country_id,
		tax_id, longitude, latitude. Every inner list starts with that label, followed by
		one value per VCF sample.

		self.metadata is left untouched when the genotype file is registered but not
		present on disk. Exits with status 2 if the database has no matching genotype file.
		"""
		session = self.db_vervet.session
		session.begin()
		try:
			if not self.dataDir:
				self.dataDir = self.db_vervet.data_dir

			genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
			if not genotypeFile:
				sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
				sys.exit(2)

			filename = os.path.join(self.dataDir, genotypeFile.path)
			if not os.path.isfile(filename):
				return

			from pymodule.VCFFile import VCFFile
			#allow 0 depth-> no missing data
			vcfFile = VCFFile(inputFname=filename, minDepth=0)

			uclaid_row = ['ucla_id']
			countryid_row = ['country_id']
			speciesid_row = ['tax_id']
			longitude_row = ['longitude']
			latitude_row = ['latitude']
			for sampleID in vcfFile.getSampleIDList():
				individual = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment.ind_sequence.individual
				uclaid_row.append(individual.ucla_id)
				countryid_row.append(individual.site.country_id)
				speciesid_row.append(individual.tax_id)
				longitude_row.append(individual.longitude)
				latitude_row.append(individual.latitude)
			self.metadata = [uclaid_row, countryid_row, speciesid_row, longitude_row, latitude_row]
		finally:
			#close on every exit path, so a missing file or an error does not leave the session open
			session.close()

示例#5

0

显示文件

文件： hsCalculateStatsForSubPop_0_1.py 项目： mjmontague/vervet-web

	def selectSubPop(self,uclaidlist):
		"""
		2012.9.19
			Extract the genotype matrix of the sub-population whose ucla_id is in uclaidlist.

			Each sample ID of the genotype VCF file (selected by self.genotypeMethodID,
			self.chromosome, self.format) is resolved through self.db_vervet to an
			individual; samples whose ucla_id is in uclaidlist are kept.

			Returns a hsContigDataStruct. Its data is a float matrix with one row per VCF
			record and one column per kept sample, holding the number of the call's first
			two GT entries that differ from the record's refBase (0, 1 or 2), or -9 where
			the call is missing (the same marker selectSubPopNoDB uses).
			Returns None when the genotype file is registered but not present on disk.
			Exits with status 2 if the database has no matching genotype file.
		"""
		session = self.db_vervet.session
		session.begin()
		try:
			if not self.dataDir:
				self.dataDir = self.db_vervet.data_dir

			genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
			if not genotypeFile:
				sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
				sys.exit(2)

			filename = os.path.join(self.dataDir, genotypeFile.path)
			if not os.path.isfile(filename):
				return None

			from pymodule.VCFFile import VCFFile
			vcfFile = VCFFile(inputFname=filename, minDepth=0)

			columnIndexList = []
			ind_id_ls = []
			#the VCF sample IDs are read-group names
			for i, readgroupID in enumerate(vcfFile.getSampleIDList()):
				individualAlignment = self.db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
				uclaid = individualAlignment.ind_sequence.individual.ucla_id
				if uclaid in uclaidlist:
					columnIndexList.append(i)
					ind_id_ls.append(uclaid)

			chrom_ls=[]; ref_ls=[]; snp_pos_ls=[]; alt_ls=[]
			datalist = []
			for vcfRecord in vcfFile:
				refBase = vcfRecord.refBase
				chrom_ls.append(vcfRecord.chr)
				snp_pos_ls.append(vcfRecord.pos)
				ref_ls.append(refBase)
				alt_ls.append(vcfRecord.altBase)

				data_row = []
				for columnIndex in columnIndexList:
					#vcfRecord.data_row holds one extra entry at index 0 (the reference individual), hence the +1.
					#each entry is a dictionary {'GT': base-call, 'DP': depth}, or None if missing.
					vcfCall = vcfRecord.data_row[columnIndex+1]
					if not vcfCall:
						#must be numeric: the former 'N' marker made the float conversion below fail
						data_row.append(-9)
						continue
					firstAllele = vcfCall['GT'][0]
					secondAllele = vcfCall['GT'][1]
					#count of non-reference alleles: 0, 1 or 2
					data_row.append(2 - int(firstAllele==refBase) - int(secondAllele==refBase))
				datalist.append(data_row)
			sys.stderr.write("%s loci in %i individuals outputted.\n"%(len(datalist),len(columnIndexList)))

			#builtin float is the dtype the removed np.float alias used to stand for
			data = np.array(datalist, dtype=float)
			return hsContigDataStruct(ind_id_ls=np.array(ind_id_ls), chrom_ls=np.array(chrom_ls), ref_ls=np.array(ref_ls), \
				snp_pos_ls=np.array(snp_pos_ls), alt_ls=np.array(alt_ls), data=data)
		finally:
			#close on every exit path, so a missing file or an error does not leave the session open
			session.close()

示例#6

0

显示文件

文件： hsFetchCountrySpeciesGenotypeMatrix.py 项目： mjmontague/vervet-web

	def run(self):
		"""
		2012.7.13
			Dump the raw genotype calls of every sample of one genotype VCF file as a
			tab-delimited matrix, annotated with species and country.

			Rows written to self.outputFname:
				1. header: Chromosome, position, ref, then every VCF sample ID
				2. tax_id of each sample's individual (first three columns are '-')
				3. country_id of each individual's site (first three columns are '-')
				4. one row per VCF record: chr, pos, the 'GT' of the reference entry
					(index 0 of the record's data_row), then each sample's 'GT',
					or 'NN' where the call is missing

			Exits with status 2 if the database has no matching genotype file. Writes
			nothing if the file is registered but not present on disk.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		# NOTE(review): the session begun here is never closed in this method -- confirm the caller takes care of it.
		session = self.db_vervet.session
		session.begin()
		if not self.dataDir:
			self.dataDir = self.db_vervet.data_dir

		genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
		if not genotypeFile:
			sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
			sys.exit(2)

		filename = os.path.join(self.dataDir, genotypeFile.path)
		if not os.path.isfile(filename):
			return

		from pymodule.VCFFile import VCFFile
		#allow 0 depth-> no missing data
		vcfFile = VCFFile(inputFname=filename, minDepth=0)

		header = ['Chromosome', 'position', 'ref']
		#'-' pads the three leading columns that only carry data in the per-locus rows
		speciesid_row = ['-','-','-']
		countryid_row = ['-','-','-']
		columnIndexList = []
		for i, sampleID in enumerate(vcfFile.getSampleIDList()):
			individual = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment.ind_sequence.individual
			header.append(sampleID)
			columnIndexList.append(i)
			speciesid_row.append(individual.tax_id)
			countryid_row.append(individual.site.country_id)

		counter = 0
		#the with-block guarantees the output is flushed and closed, even if a record fails half-way
		with open(self.outputFname, 'w') as outputFile:
			writer = csv.writer(outputFile, delimiter='\t')
			writer.writerow(header)
			writer.writerow(speciesid_row)
			writer.writerow(countryid_row)
			for vcfRecord in vcfFile:
				#index 0 of data_row is the reference individual (from the ref column of VCF)
				refCall = vcfRecord.data_row[0]
				data_row = [vcfRecord.chr, vcfRecord.pos, refCall['GT']]
				for columnIndex in columnIndexList:
					#entries are dictionaries {'GT': base-call, 'DP': depth}, or None if missing; +1 skips the reference.
					vcfCall = vcfRecord.data_row[columnIndex+1]
					if vcfCall:
						data_row.append(vcfCall['GT'])
					else:
						data_row.append('NN')
				writer.writerow(data_row)
				counter += 1
		sys.stderr.write("%s loci outputted.\n"%(counter))
示例#7

0

显示文件

文件： hsFetchGenotypeMatrix012.py 项目： mjmontague/vervet-web

	def run(self):
		"""
		2012.7.13
			Dump the genotypes of every sample of one genotype VCF file as a tab-delimited
			0/1/2 matrix.

			Rows written to self.outputFname:
				1. header: Chromosome, position, ref, alt, then every VCF sample ID
				2. one row per VCF record: chr, pos, refBase, altBase, then per sample the
					number of the call's first two GT entries that differ from refBase
					(0, 1 or 2), or 'N' where the call is missing

			Exits with status 2 if the database has no matching genotype file. Writes
			nothing if the file is registered but not present on disk.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		# NOTE(review): the session begun here is never closed in this method -- confirm the caller takes care of it.
		session = self.db_vervet.session
		session.begin()
		if not self.dataDir:
			self.dataDir = self.db_vervet.data_dir

		genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
		if not genotypeFile:
			sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
			sys.exit(2)

		filename = os.path.join(self.dataDir, genotypeFile.path)
		if not os.path.isfile(filename):
			return

		from pymodule.VCFFile import VCFFile
		vcfFile = VCFFile(inputFname=filename, minDepth=0)

		header = ['Chromosome', 'position', 'ref','alt']
		columnIndexList = []
		for i, sampleID in enumerate(vcfFile.getSampleIDList()):
			# NOTE(review): the looked-up site is not used since the tax_id/country filter was
			# disabled; the lookup is kept so that a sample the database cannot resolve still
			# fails here -- confirm whether it can be dropped.
			self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment.ind_sequence.individual.site
			header.append(sampleID)
			columnIndexList.append(i)

		counter = 0
		#the with-block guarantees the output is flushed and closed, even if a record fails half-way
		with open(self.outputFname, 'w') as outputFile:
			writer = csv.writer(outputFile, delimiter='\t')
			writer.writerow(header)
			for vcfRecord in vcfFile:
				refBase = vcfRecord.refBase
				data_row = [vcfRecord.chr, vcfRecord.pos, refBase, vcfRecord.altBase]
				for columnIndex in columnIndexList:
					#vcfRecord.data_row holds one extra entry at index 0 (the reference individual), hence the +1.
					#each entry is a dictionary {'GT': base-call, 'DP': depth}, or None if missing.
					vcfCall = vcfRecord.data_row[columnIndex+1]
					if not vcfCall:
						data_row.append('N')
						continue
					firstAllele = vcfCall['GT'][0]
					secondAllele = vcfCall['GT'][1]
					#count of non-reference alleles: 0, 1 or 2
					data_row.append(2 - int(firstAllele==refBase) - int(secondAllele==refBase))
				writer.writerow(data_row)
				counter += 1
		sys.stderr.write("%s loci outputted.\n"%(counter))

示例#8

0

显示文件

文件： hs10XIndDistanceMatrix.py 项目： mjmontague/vervet-web

    def run(self):
        """
        2012.7.13
            Write the pairwise genotype distance matrix of the individuals whose
            target_coverage is 10.

            Every kept sample is coded per VCF record as the number of the call's first
            two GT entries that differ from the 'GT' of the reference entry (index 0 of
            the record's data_row): 0, 1 or 2. The distance between two samples is the
            mean absolute difference of their codes, so it lies between 0 and 2.
            Loci where either sample has no call are left out of that pair's mean; a
            pair without any shared locus gets nan.

            The square matrix is written to self.outputFname with numpy.savetxt, rows and
            columns in VCF sample order, and the country_id list of the kept samples is
            printed to stdout.

            Exits with status 2 if the database has no matching genotype file. Writes
            nothing if the file is registered but not present on disk.
        """
        if self.debug:
            import pdb

            pdb.set_trace()
        # NOTE(review): the session begun here is never closed in this method -- confirm the caller takes care of it.
        session = self.db_vervet.session
        session.begin()
        if not self.dataDir:
            self.dataDir = self.db_vervet.data_dir

        genotypeFile = self.db_vervet.getGenotypeFile(
            genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format
        )
        if not genotypeFile:
            sys.stderr.write(
                "Error: genotype_method_id %s, chromosome %s does not exist.\n"
                % (self.genotypeMethodID, self.chromosome)
            )
            sys.exit(2)

        filename = os.path.join(self.dataDir, genotypeFile.path)
        if not os.path.isfile(filename):
            return

        import numpy as np
        from pymodule.VCFFile import VCFFile

        # allow 0 depth-> no missing data
        vcfFile = VCFFile(inputFname=filename, minDepth=0)

        columnIndexList = []
        countryidList = []
        for i, sampleID in enumerate(vcfFile.getSampleIDList()):
            individual = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment.ind_sequence.individual
            site = individual.site
            if individual.target_coverage == 10:
                columnIndexList.append(i)
                countryidList.append(site.country_id)

        genotypeMat = []
        for vcfRecord in vcfFile:
            # index 0 of data_row is the reference individual (from the ref column of VCF)
            refCall = vcfRecord.data_row[0]
            data_row = []
            for columnIndex in columnIndexList:
                # entries are dictionaries {'GT': base-call, 'DP': depth}, or None if missing; +1 skips the reference.
                vcfCall = vcfRecord.data_row[columnIndex + 1]
                if not vcfCall:
                    # nan keeps the matrix numeric; the former "NN" marker broke the array conversion
                    data_row.append(np.nan)
                    continue
                firstAllele = vcfCall["GT"][0]
                secondAllele = vcfCall["GT"][1]
                # count of non-reference alleles: 0, 1 or 2
                data_row.append(2 - int(firstAllele == refCall["GT"]) - int(secondAllele == refCall["GT"]))
            genotypeMat.append(data_row)
        sys.stderr.write("%s loci outputted.\n" % (len(genotypeMat)))

        # calculate distance Matrix
        lociCount = len(genotypeMat)
        sampleCount = len(columnIndexList)
        # the explicit reshape keeps the matrix two-dimensional even when there are no loci
        matArr = np.array(genotypeMat, dtype=float).reshape(lociCount, sampleCount)
        distArr = np.empty((sampleCount, sampleCount))
        distArr[:] = np.nan

        for i in range(sampleCount):
            # |code of sample i - code of every sample|, one column per sample
            diff = np.abs(matArr - matArr[:, i : i + 1])
            comparable = ~np.isnan(diff)
            comparableCount = comparable.sum(axis=0)
            total = np.where(comparable, diff, 0.0).sum(axis=0)
            hasSharedLoci = comparableCount > 0
            # normalise so that distance is between 0 and 2; without missing calls this is sum / number of loci
            distArr[i, hasSharedLoci] = total[hasSharedLoci] / comparableCount[hasSharedLoci]

        np.savetxt(self.outputFname, distArr)
        print(countryidList)