Example #1
	def addLocusFromVCF2DB(self, db_vervet, inputFname=None, ref_ind_seq_id=None, locus_type_id=None, minDepth=0):
		"""
		2012.5.2
			given a VCF file, find all the loci and submit them into db
		"""
		sys.stderr.write("Adding loci from %s into db ... "%(inputFname))
		vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
		
		counter = 0
		previous_reported_counter = ''
		for vcfRecord in vcfFile.parseIter():
			chr = vcfRecord.chr
			pos = vcfRecord.pos
			pos = int(pos)
			refBase = vcfRecord.data_row[0].get("GT")[0]
			refBaseDBEntry = self.getSequenceDBEntry(db_vervet, sequence=refBase, comment=None)
			altBase = vcfRecord.altBase
			altBaseDBEntry = self.getSequenceDBEntry(db_vervet, sequence=altBase, comment=None)
			locus = db_vervet.getLocus(chr=chr, start=pos, stop=pos, ref_seq=refBaseDBEntry, alt_seq=altBaseDBEntry, \
							ref_ind_seq_id=ref_ind_seq_id, \
							locus_type_id=locus_type_id)
			counter += 1
			if counter%500==0:
				sys.stderr.write("%s%s"%('\x08'*len(previous_reported_counter), counter))
				previous_reported_counter = repr(counter)
		sys.stderr.write("%s%s"%(len(previous_reported_counter), counter))
		sys.stderr.write(" Done.\n")
    def run(self):
        """
		"""

        if self.debug:
            import pdb
            pdb.set_trace()

        snpData = SNPData(input_fname=self.inputFname,
                          turn_into_array=1,
                          ignore_2nd_column=1)
        snpData = SNPData.removeMonomorphicCols(snpData, NA_set=set([]))
        if self.min_MAF and self.min_MAF > 0:
            snpData = SNPData.removeColsByMAF(snpData,
                                              min_MAF=self.min_MAF,
                                              NA_set=set([]))

        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        self.writer.makeupHeaderFromSampleIDList(
            sampleIDList=snpData.row_id_ls)
        self.writer.writeMetaAndHeader()

        counter = 0
        for j in xrange(len(snpData.col_id_ls)):
            snp_id = snpData.col_id_ls[j]
            chromosome, start = snp_id.split('_')[:2]
            genotype_ls = snpData.data_matrix[:, j]
            genotype_ls = utils.dict_map(number2di_nt, genotype_ls)
            genotype_ls_vcf = []
            alleleNucleotide2Number = {}
            alleleNumber2Nucleotide = {}
            for genotype in genotype_ls:
                if genotype == 'NA':
                    genotype_ls_vcf.append("./.")
                elif len(genotype) == 2:
                    for allele in genotype:
                        if allele not in alleleNucleotide2Number:
                            alleleNumber = len(alleleNucleotide2Number)
                            alleleNucleotide2Number[allele] = alleleNumber
                            alleleNumber2Nucleotide[alleleNumber] = allele
                    genotype_ls_vcf.append(
                        "%s/%s" % (alleleNucleotide2Number[genotype[0]],
                                   alleleNucleotide2Number[genotype[1]]))

                else:
                    genotype_ls_vcf.append("./.")
            # the first allele seen is assumed to be the reference allele; a column
            # where every genotype was missing has no alleles recorded, so skip it
            if 0 not in alleleNumber2Nucleotide:
                continue
            refAllele = alleleNumber2Nucleotide[0]
            if 1 not in alleleNumber2Nucleotide:
                altAllele = refAllele
            else:
                altAllele = alleleNumber2Nucleotide[1]
            row = [
                chromosome, start, ".", refAllele, altAllele, 999, 'PASS',
                "DP=100", "GT"
            ] + genotype_ls_vcf
            self.writer.writerow(row)
            counter += 1
        sys.stderr.write("  %s records.\n" % (counter))
        self.writer.close()
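A minimal standalone distillation of the allele-numbering scheme in the loop above (a hypothetical helper, assuming two-character diploid genotype strings such as 'AG'): alleles are numbered in order of first appearance, so the first allele seen becomes allele 0 (treated as REF) and the second becomes allele 1 (ALT).

def encodeGenotypes(genotype_ls):
    alleleNucleotide2Number = {}
    genotype_ls_vcf = []
    for genotype in genotype_ls:
        if genotype == 'NA' or len(genotype) != 2:
            genotype_ls_vcf.append('./.')
            continue
        for allele in genotype:
            if allele not in alleleNucleotide2Number:
                alleleNucleotide2Number[allele] = len(alleleNucleotide2Number)
        genotype_ls_vcf.append('%s/%s' % (alleleNucleotide2Number[genotype[0]],
                                          alleleNucleotide2Number[genotype[1]]))
    return genotype_ls_vcf, alleleNucleotide2Number

# encodeGenotypes(['AA', 'AG', 'NA', 'GG'])
# -> (['0/0', '0/1', './.', '1/1'], {'A': 0, 'G': 1})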
Example #3
	def countHomoHetCallsForEachSampleFromVCF(self, inputFname, outputFname, chromosome=None, chrLength=None, minDepth=1):
		"""
		2011-11-2
			given a VCF file, count the number of homo-ref, homo-alt, het calls
			
		"""
		sys.stderr.write("Count the number of homozygous-ref/alt & het from %s .\n"%(inputFname))
		vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
		
		sampleID2data = {}	#key is sampleID, value is a list of 3 numbers. 'NoOfHomoRef', 'NoOfHomoAlt', 'NoOfHet'
		
		no_of_total = 0.
		minStart = None
		for vcfRecord in vcfFile.parseIter():
			chr = vcfRecord.chr
			pos = vcfRecord.pos
			pos = int(pos)
			refBase = vcfRecord.data_row[0].get("GT")[0]
			
			for sample_id, sample_index in vcfFile.sample_id2index.iteritems():
				if sample_id=='ref':	#ignore the reference
					continue
				if sample_id not in sampleID2data:
					sampleID2data[sample_id] = [0, 0, 0]
				if not vcfRecord.data_row[sample_index]:	#None for this sample
					continue
				callForThisSample = vcfRecord.data_row[sample_index].get('GT')
				if not callForThisSample or callForThisSample=='NA':
					continue
				if callForThisSample[0]==refBase and callForThisSample[1]==refBase:
					#homozygous reference allele
					sampleID2data[sample_id][0]+=1
				elif callForThisSample[0]==callForThisSample[1] and callForThisSample[0]!=refBase:
					#homozygous alternative allele
					sampleID2data[sample_id][1]+=1
				elif callForThisSample[0]!=callForThisSample[1]:
					sampleID2data[sample_id][2]+=1
			
		import csv
		writer = csv.writer(open(outputFname, 'w'), delimiter='\t')
		writer.writerow(['#sampleID', 'chromosome', 'length', "NoOfTotal", 'NoOfHomoRef', 'NoOfHomoAlt', "FractionOfHomoAlt", 'NoOfHet', "FractionOfHet"])
		sampleIDLs = sampleID2data.keys()
		sampleIDLs.sort()
		for sampleID in sampleIDLs:
			count_data = sampleID2data.get(sampleID)
			noOfHomoRef, noOfHomoAlt, noOfHet = count_data[:3]
			no_of_calls = float(sum(count_data))
			if no_of_calls>0:
				fractionOfHomoAlt = noOfHomoAlt/no_of_calls
				fractionOfHet = noOfHet/no_of_calls
			else:
				fractionOfHomoAlt = -1
				fractionOfHet = -1
			writer.writerow([sampleID, chromosome, chrLength, int(no_of_calls), noOfHomoRef, noOfHomoAlt, \
							fractionOfHomoAlt, noOfHet, fractionOfHet])
		del writer
		sys.stderr.write("Done.\n")
	def run(self):
		"""
		2012.7.13
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		
		session.begin()
		if not self.data_dir:
			self.data_dir = self.db_vervet.data_dir
		data_dir = self.data_dir
		
		genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
		#query = VervetDB.GenotypeFile.query.filter_by(genotype_method_id=self.genotypeMethodID).filter_by(format=self.format)
		#for genotypeFile in query:
		if not genotypeFile:
			
			sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
			sys.exit(2)
		filename = os.path.join(data_dir, genotypeFile.path)
		if os.path.isfile(filename):
			counter= 0
			from pymodule import VCFFile
			
			vcfFile = VCFFile(inputFname=filename, minDepth=0)
			sampleIDList = vcfFile.getSampleIDList()
			writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
			header = ['Chromosome', 'position', 'ref']
			columnIndexList = []
			for i in xrange(len(sampleIDList)):
				sampleID = sampleIDList[i]
				individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
				site = individualAlignment.individual_sequence.individual.site
				#2012.8.29 get scientific name from the taxonomy db
				scientificName = self.db_taxonomy.returnScientificNameGivenTaxID(individualAlignment.individual_sequence.individual.tax_id)
				#if individualAlignment.individual_sequence.individual.tax_id==60711 and (site.country_id!=144 and site.country_id!=135 \
				#																and site.country_id!=136 and site.country_id!=148): 
				header.append('%s %s'%(sampleID, scientificName))
				columnIndexList.append(i)
			writer.writerow(header)
			for vcfRecord in vcfFile:
				data_row = [vcfRecord.chr, vcfRecord.pos]
				refCall = vcfRecord.data_row[0]
				data_row.append(refCall['GT'])
				#get alternative allele frequency
				AF_list = vcfRecord.info_tag2value.get('AF')	#info_tag2value['AF']; None if the AF tag is absent
				if AF_list:
					AF_list = map(float, AF_list.split(','))
				for columnIndex in columnIndexList:
					#for vcfCall in vcfRecord.data_row[1:]: #data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
					#it includes one extra sample in index 0, which is the reference individual (from the ref column of VCF).
					vcfCall = vcfRecord.data_row[columnIndex+1]
					if vcfCall:
						data_row.append(vcfCall['GT'])
					else:
						data_row.append('NA')
						
				writer.writerow(data_row)
				counter += 1
			sys.stderr.write("%s loci outputted.\n"%(counter))
			del writer
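A small standalone sketch of the INFO-field AF parsing above: the AF tag carries one comma-separated frequency per alternative allele, so splitting on ',' and converting to float yields one frequency per ALT allele.

def parseAFTag(af_value):
	#e.g. parseAFTag('0.25,0.05') returns [0.25, 0.05]
	return [float(x) for x in af_value.split(',')]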
	def outputPedigreeForPlink(self, DG=None, db_vervet=None, inputFname=None, outputFname=None, \
							treatEveryOneIndependent=None, sampleIDFormat=1,\
							addUngenotypedDuoParents=False):
		"""
		http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml
			either space or tab could be the delimiter.
		sampleIDFormat
			1: individual.ucla_id
			2: input sampleID
		argument addUngenotypedDuoParents
				for mendel error detection, if an ungenotyped parent in a duo (the other parent is genotyped) is not present
					in the genotype file (PED/TPED/BED), then plink won't check that duo for mendel inconsistency
		
		2013.07.18
			added argument addUngenotypedDuoParents (see above)
		
		2013.06.24 added argument sampleIDFormat
			1: individual.ucla_id
			2: alignment.read_group
		2013.1.2
			copied from run()
			
		"""
		sys.stderr.write("Outputting pedigree constrained by %s to %s, treatEveryOneIndependent=%s, sampleIDFormat=%s, addUngenotypedDuoParents=%s ... "%\
						(inputFname, outputFname, treatEveryOneIndependent, sampleIDFormat, addUngenotypedDuoParents))
		vcfFile = VCFFile(inputFname=inputFname)
		
		alignmentLs = []
		alignmentID2sampleData = {}
		individual_id2alignment = {}
		for sampleID in vcfFile.getSampleIDList():
			alignment = db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
			alignmentLs.append(alignment)
			if alignment.id in alignmentID2sampleData:
				sys.stderr.write("Error: alignment %s (%s) for sample %s already in alignmentID2sampleData, with sampleID=%s.\n"%\
								(alignment.id, alignment.read_group, sampleID, \
								alignmentID2sampleData.get(alignment.id).sampleID))
				raise
			alignmentID2sampleData[alignment.id] = PassingData(sampleID=sampleID, alignment=alignment)
			
			individual_id = alignment.individual_sequence.individual_id
			if individual_id in individual_id2alignment:
				sys.stderr.write("Error: alignment %s (%s) for sample %s already in alignmentID2sampleData, with sampleID=%s.\n"%\
								(alignment.id, alignment.read_group, sampleID, \
								alignmentID2sampleData.get(alignment.id).sampleID))
				raise
			individual_id2alignment[individual_id] = alignment
		#alignmentLs = db_vervet.getAlignmentsFromVCFFile(inputFname =inputFname)
		
		"""
		pedigreeGraphData = db_vervet.constructPedgreeGraphOutOfAlignments(alignmentLs)
		DG = pedigreeGraphData.DG
		individual_id2alignmentLs = pedigreeGraphData.individual_id2alignmentLs
		"""
		individual_id2individual = {}
		
		ungenotypedNodeID2Data = {}
		writer = csv.writer(open(outputFname, 'w'), delimiter=' ')
		counter = 0
		family_id = 1	#all individuals in one family
		currentNoOfFakes = 0
		for alignment in alignmentLs:
			nodeID = alignment.individual_sequence.individual_id
			individual = self.getIndividual(db_vervet=db_vervet, individual_id=nodeID, \
										individual_id2individual=individual_id2individual)
			
			if nodeID in DG:
				parents = DG.predecessors(nodeID)
				if len(parents)==2:
					
					parent1 = self.getIndividual(db_vervet=db_vervet, individual_id=parents[0], \
												individual_id2individual=individual_id2individual)
					parent2 = self.getIndividual(db_vervet=db_vervet, individual_id=parents[1], \
												individual_id2individual=individual_id2individual)
					parent1Sex = parent1.codeSexInNumber()
					parent2Sex = parent2.codeSexInNumber()
					#2013.07.18 if one and only one parent is genotyped, record the ungenotyped one as half of an ungenotyped duo
					if parents[0] not in individual_id2alignment and parents[1] in individual_id2alignment:
						if parents[0] not in ungenotypedNodeID2Data:
							ungenotypedNodeID2Data[parents[0]] = PassingData(individualDBEntry=parent1, sex=parent1Sex)
					elif parents[0] in individual_id2alignment and parents[1] not in individual_id2alignment:
						if parents[1] not in ungenotypedNodeID2Data:
							ungenotypedNodeID2Data[parents[1]] = PassingData(individualDBEntry=parent2, sex=parent2Sex)
						
					if parent1Sex==2:
						#swap so that parent1 becomes the father column and parent2 the mother column
						parent1, parent2 = parent2, parent1
					
					father_id = self.getProperSampleIDForPlinkOutput(individual=parent1, \
									alignmentID2sampleData=alignmentID2sampleData, \
									individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
					mother_id = self.getProperSampleIDForPlinkOutput(individual=parent2, \
									alignmentID2sampleData=alignmentID2sampleData, \
									individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
				elif len(parents)==1:
					parent1 = self.getIndividual(db_vervet=db_vervet, individual_id=parents[0], \
										individual_id2individual=individual_id2individual)
					parent1Sex = parent1.codeSexInNumber()
					
					
					if parent1Sex==2:
						parent2Sex = 1
						father_id = 0
						mother_id = self.getProperSampleIDForPlinkOutput(individual=parent1, \
									alignmentID2sampleData=alignmentID2sampleData, \
									individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
					else:
						parent2Sex = 2
						father_id = self.getProperSampleIDForPlinkOutput(individual=parent1, \
									alignmentID2sampleData=alignmentID2sampleData, \
									individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
						mother_id = 0
					#2013.07.18 parent1 (parents[0]) has to be in individual_id2alignment (genotyped) in order for the other
						#to qualify as an ungenotyped parent in a duo
					if parents[0] in individual_id2alignment:
						#if parents[0] not in ungenotypedNodeID2Data:
						#	ungenotypedNodeID2Data[parents[0]] = PassingData(individualDBEntry=parent1, sex=parent1Sex)
						fakeParentData = self.generateFakeIndividualID(pedigreeGraph=DG, currentNoOfFakes=currentNoOfFakes)
						currentNoOfFakes = fakeParentData.currentNoOfFakes
						fakeParent2ID = fakeParentData.individualID
						if fakeParent2ID not in individual_id2alignment:
							if fakeParent2ID not in ungenotypedNodeID2Data:
								ungenotypedNodeID2Data[fakeParent2ID] = PassingData(individualDBEntry=None, sex=parent2Sex)
				elif len(parents)==0:
					father_id = 0
					mother_id = 0
				else:
					sys.stderr.write("Error: number of parents (%s) for %s is %s.\n"%(repr(parents), nodeID, len(parents)))
					sys.exit(3)
			else:	# founders
				father_id = 0
				mother_id = 0
			
			if treatEveryOneIndependent:	#force the parents to be 0, everyone becomes founders
				father_id = 0
				mother_id = 0
			individual_id = self.getProperSampleIDForPlinkOutput(individual=individual, \
									alignmentID2sampleData=alignmentID2sampleData, \
									individual_id2alignment=individual_id2alignment, sampleIDFormat=sampleIDFormat)
			data_row = [family_id, individual_id, father_id, mother_id, \
					individual.codeSexInNumber(), 1]
			writer.writerow(data_row)
			counter += 1
		
		noOfUngenotypedParentsOutputted = 0
		if addUngenotypedDuoParents:
			for ungenotypedNodeID, pdata in ungenotypedNodeID2Data.iteritems():
				individual_id = self.getProperSampleIDForPlinkOutput(individual=pdata.individualDBEntry, \
									alignmentID2sampleData=alignmentID2sampleData, \
									individual_id2alignment=individual_id2alignment, \
									sampleIDFormat=sampleIDFormat, defaultSampleID=ungenotypedNodeID)
				data_row = [family_id, individual_id, 0, 0, pdata.sex, 1]
				writer.writerow(data_row)
				noOfUngenotypedParentsOutputted += 1
		sys.stderr.write("%s individuals and %s ungenotyped duo-parents outputted, number of fake parents %s, addUngenotypedDuoParents=%s.\n"%\
						(counter, noOfUngenotypedParentsOutputted, currentNoOfFakes, addUngenotypedDuoParents))
		del writer
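For reference, each data_row written above follows the six-column PLINK pedigree layout (space-delimited, per the PLINK URL in the docstring): family ID, individual ID, father ID (0 if unknown/founder), mother ID (0 if unknown/founder), sex (1=male, 2=female), phenotype (a constant 1 here). A minimal sketch with hypothetical IDs:

import csv, sys

writer = csv.writer(sys.stdout, delimiter=' ')
writer.writerow([1, 'ucla1001', 'ucla1002', 'ucla1003', 2, 1])	#female offspring of two known parents
writer.writerow([1, 'ucla1002', 0, 0, 1, 1])	#founder father, no recorded parents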
	def splitNamVCFIntoMultipleSingleChrVCF(self, inputFname, outputDir, minDepth=1, includeIndels=False, maxContigNumber=1000):
		"""
		2012.5.10
			Two things in Nam's VCF file are to be modified. 
				1. extract VRC UCLAID from its sample ID
				2. replace vervet1_scaffolds_Contig137 with simply "Contig137"
		"""
		sys.stderr.write("Converting %s from VCF to EigenStrat ...\n"%(inputFname))
		from pymodule.VCFFile import VCFFile
		
		vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
		#replace Variant/PooledTissues/2002053/genome.algn.split.part17/5tissues.pooled.rmdup.bam with just monkey ID
		import re
		
		newSampleIDHeader = []
		for sampleID in vcfFile.sampleIDHeader:
			search_result = self.UCLAID_Pattern.search(sampleID)
			UCLAID = search_result.group('UCLAID')
			newSampleIDHeader.append(UCLAID)
		#new header for every output contig
		newHeader = vcfFile.header[:vcfFile.sampleStartingColumn] + newSampleIDHeader
		
		
		chr2outVCFFile = {}
		counter = 0
		real_counter = 0
		for vcfRecord in vcfFile.parseIter():
			counter += 1
			if not includeIndels and (len(vcfRecord.refBase)!=1 or len(vcfRecord.altBase)!=1):
				#it's an indel if refBase or altBase is not just one base
				continue
			
			contig_id_pattern_result = self.contig_id_pattern.search(vcfRecord.chr)
			chr = contig_id_pattern_result.group('contigID')
			if maxContigNumber:
				contigNumber = int(self.contig_number_pattern.search(chr).group('contigNumber'))
				if contigNumber>maxContigNumber:
					continue
			real_counter += 1
			vcfRecord.chr = chr
			pos = vcfRecord.pos
			if chr not in chr2outVCFFile:
				outputFname = os.path.join(outputDir, '%s.vcf'%(chr))
				outVCFFile = VCFFile(outputFname=outputFname)
				outVCFFile.metaInfoLs = vcfFile.metaInfoLs
				outVCFFile.header = newHeader
				outVCFFile.writeMetaAndHeader()
				chr2outVCFFile[chr] = outVCFFile
			outVCFFile = chr2outVCFFile.get(chr)
			
			# set genotype whose depth is below minDepth to ./. (=missing)
			for i in xrange(1, len(vcfRecord.data_row)):	#[0] is the ref base
				callData = vcfRecord.data_row[i]
				if callData is None or callData.get('DP',0)<minDepth:
					sampleColumnIndex = i+vcfFile.sampleStartingColumn-1
					vcfRecord.row[sampleColumnIndex] = './.'
			outVCFFile.writeVCFRecord(vcfRecord)
		
		vcfFile.close()
		#close all output files
		for chr, outVCFFile in chr2outVCFFile.iteritems():
			outVCFFile.close()
		
		sys.stderr.write("%s (out of %s) loci from %s chromosomes.\n"%(real_counter, counter, len(chr2outVCFFile)))
	def convertAlignmentReadGroup2UCLAIDInVCF(self, inputFname, outputFname, minDepth=1, includeIndels=False,\
											maxContigNumber=None):
		"""
		2012.5.10
		"""
		sys.stderr.write("Converting %s from VCF to EigenStrat ...\n"%(inputFname))
		
		vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
		#replace Variant/PooledTissues/2002053/genome.algn.split.part17/5tissues.pooled.rmdup.bam with just monkey ID
		
		newSampleIDHeader = []
		for sampleID in vcfFile.sampleIDHeader:
			readGroupData = VervetDB.VervetDB.parseAlignmentReadGroupWithoutDB(sampleID)
			UCLAID = readGroupData.individual_code
			newSampleIDHeader.append(UCLAID)
		#new header for every output contig
		newHeader = vcfFile.header[:vcfFile.sampleStartingColumn] + newSampleIDHeader
		
		counter = 0
		real_counter = 0
		outVCFFile = VCFFile(outputFname=outputFname)
		outVCFFile.metaInfoLs = vcfFile.metaInfoLs
		outVCFFile.header = newHeader
		outVCFFile.writeMetaAndHeader()
		for vcfRecord in vcfFile.parseIter():
			counter += 1
			if not includeIndels and (len(vcfRecord.refBase)!=1 or len(vcfRecord.altBase)!=1):
				#it's an indel if refBase or altBase is not just one base
				continue
			
			chr = vcfRecord.chr
			if maxContigNumber:
				contigNumber = int(self.contig_number_pattern.search(chr).group('contigNumber'))
				if contigNumber>maxContigNumber:
					continue
			real_counter += 1
			# set genotype whose depth is below minDepth to ./. (=missing)
			for i in xrange(1, len(vcfRecord.data_row)):	#[0] is the ref base
				callData = vcfRecord.data_row[i]
				if callData is None or callData.get('DP',0)<minDepth:
					sampleColumnIndex = i+vcfFile.sampleStartingColumn-1
					vcfRecord.row[sampleColumnIndex] = './.'
			outVCFFile.writeVCFRecord(vcfRecord)
		
		vcfFile.close()
		#close all output files
		outVCFFile.close()
		
		sys.stderr.write("%s (out of %s) loci.\n"%(real_counter, counter))
	def extractSamples(self, db_vervet=None, inputFname=None, outputFname=None, \
					tax_id_set=None, site_id_set=None, country_id_set=None, \
					min_coverage=None, max_coverage=None, outputFormat=1, is_contaminated=None,\
					**keywords):
		"""
		2013.07.03 added argument is_contaminated (whether to fetch contaminated samples or not)
		2013.04.30 added argument min_coverage, max_coverage
		2012.10.10
			added argument outputFormat. 
		2012.10.5
			
		"""
		sys.stderr.write("Extracting samples from %s, %s sites & %s countries & %s taxonomies, min_coverage=%s, max_coverage=%s, outputFormat=%s, is_contaminated=%s ...\n"%\
							(inputFname,\
							getattr(site_id_set, '__len__', returnZeroFunc)(),\
							getattr(country_id_set, '__len__', returnZeroFunc)(),\
							getattr(tax_id_set, '__len__', returnZeroFunc)(), min_coverage, max_coverage,\
							outputFormat, is_contaminated ))
		vcfFile = VCFFile(inputFname=inputFname)
		
		oldHeader = vcfFile.header
		oldHeaderLength = len(oldHeader)
		newHeader = oldHeader[:vcfFile.sampleStartingColumn]	#everything before the sample columns stays the same
		no_of_samples = 0
		col_index2sampleID = {}	#this structure stores the selected samples and their column index 
		for col_index, individual_name in vcfFile.get_col_index_individual_name_ls():
			individualAlignment = db_vervet.parseAlignmentReadGroup(individual_name).individualAlignment
			if individualAlignment is not None:
				filteredAlignmentList = db_vervet.filterAlignments(alignmentLs=[individualAlignment], min_coverage=min_coverage, \
						max_coverage=max_coverage, individual_site_id=None, \
						sequence_filtered=None, individual_site_id_set=site_id_set, \
						mask_genotype_method_id=None, parent_individual_alignment_id=None,\
						country_id_set=country_id_set, tax_id_set=tax_id_set, excludeContaminant=False, \
						is_contaminated=is_contaminated, excludeTissueIDSet=None,\
						local_realigned=None, reduce_reads=None, report=False)
				if filteredAlignmentList:	#non-empty, passed the filter
					newHeader.append(individual_name)
					no_of_samples += 1
					col_index2sampleID[col_index] = individual_name
			else:
				sys.stderr.write("Warning: no individualAlignment for sample %s.\n"%(individual_name))
				sys.exit(3)
		
		no_of_snps = 0
		if outputFormat==1:
			outVCFFile = VCFFile(outputFname=outputFname)
			outVCFFile.metaInfoLs = vcfFile.metaInfoLs
			outVCFFile.header = newHeader
			outVCFFile.writeMetaAndHeader()
			
			newHeaderLength = len(newHeader)
			for vcfRecord in vcfFile:
				data_row =vcfRecord.row[:vcfFile.sampleStartingColumn]
				for i in xrange(vcfFile.sampleStartingColumn, oldHeaderLength):
					if i in col_index2sampleID:
						data_row.append(vcfRecord.row[i])
				outVCFFile.writer.writerow(data_row)
				no_of_snps += 1
			outVCFFile.close()
		elif outputFormat in [2,3]:
			outf = open(outputFname, 'w')
			if outputFormat==2:
				outf.write("sampleID\n")
			for col_index, sampleID in col_index2sampleID.iteritems():
				outf.write("%s\n"%(sampleID))
			outf.close()
		vcfFile.close()
		sys.stderr.write("%s samples X %s SNPs.\n"%(no_of_samples, no_of_snps))
	def replicateVCFGenotypeColumns(self, inputFname, outputFname=None, replicateIndividualTag=None, sampleID2FamilyCount=None,\
								minDepth=0):
		"""
		2012.10.5 remove argument sampleStartingColumn
		2012.5.10
			VCFFile has been changed considerably and can act as a writer now.
		2012.3.29
			
		"""
		sys.stderr.write("Replicating some genotype columns in %s ...\n"%(inputFname))
		vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
		
		outVCFFile = VCFFile(outputFname=outputFname)
		outVCFFile.metaInfoLs = vcfFile.metaInfoLs
		
		"""
		outf = open(outputFname, 'w')
		writer = csv.writer(outf, delimiter='\t')
		#write all the headers up till the last line (which describes the samples and etc.)
		for metaInfo in vcfFile.metaInfoLs:
			outf.write(metaInfo)
		"""
		
		#modify the sample-id header line 
		sampleID2DataIndexLs = {}
		oldHeader = vcfFile.header
		oldHeaderLength = len(oldHeader)
		newHeader = oldHeader[:vcfFile.sampleStartingColumn]	#everything before the sample columns stays the same
		no_of_samples = 0
		for i in xrange(vcfFile.sampleStartingColumn, oldHeaderLength):
			#for sample_id in vcfFile.metaInfoLs[-1][vcfFile.sampleStartingColumn:]:
			sample_id = oldHeader[i].strip()
			newHeader.append('%s%s%s'%(sample_id, replicateIndividualTag, 1))	#1 because it's the 1st copy
			no_of_samples += 1
			sampleID2DataIndexLs[sample_id] = [i]	#1st copy for this sample
		
		#add additional column headers based on each one's occurrence
		extraColIndex2sampleID = {}
		for sample_id, familyCount in sampleID2FamilyCount.iteritems():
			for i in xrange(1, familyCount):
			#if familyCount>1:
				if sample_id in sampleID2DataIndexLs:
					no_of_samples += 1
					extraColIndex = len(newHeader)
					extraColIndex2sampleID[extraColIndex] = sample_id
					sampleID2DataIndexLs[sample_id].append(extraColIndex)
					replicate_order = len(sampleID2DataIndexLs[sample_id])
					newHeader.append("%s%s%s"%(sample_id, replicateIndividualTag, replicate_order))
		outVCFFile.header = newHeader
		outVCFFile.writeMetaAndHeader()
		
		newHeaderLength = len(newHeader)
		no_of_snps = 0
		for vcfRecord in vcfFile.parseIter():
			data_row =vcfRecord.row
			#2013.09.13 replace all "./." with the fully-expanded NA form, e.g. "./.:.:.:.", depending on the fields in the FORMAT column
			for i in xrange(vcfRecord.sampleStartingColumn, len(data_row)):
				if data_row[i]=='./.':	#2013.09.15 expand this NA genotype for TrioCaller
					field_value_ls = []
					for format_field in vcfRecord.format_column_ls:
						if format_field=='GT':
							field_value_ls.append('./.')
						elif format_field=='PL':	#for TrioCaller
							field_value_ls.append('.,.,.')
						else:
							field_value_ls.append('.')
					#field_value_ls = ['./.'] + ['.']*(len(vcfRecord.format_column_name2index)-1)
					data_row[i] = ':'.join(field_value_ls)
			for i in xrange(oldHeaderLength, newHeaderLength):	#add more genotype copies for those extra columns
				sample_id = extraColIndex2sampleID.get(i)
				sourceIndex = sampleID2DataIndexLs.get(sample_id)[0]
				data_row.append(data_row[sourceIndex])
			outVCFFile.writer.writerow(data_row)
			no_of_snps += 1
		outVCFFile.close()
		vcfFile.close()
		sys.stderr.write("%s samples X %s SNPs.\n"%(no_of_samples, no_of_snps))
	def run(self):
		"""
		2011-7-11
		"""
		if self.run_type!=1:
			self.needSplitChrIntervalData = False	#2013.06.21 turn this off before setup_run() to not construct chr2IntervalDataLs
		else:
			self.needSplitChrIntervalData = True
		pdata = self.setup_run()
		workflow = pdata.workflow
		db_vervet = self.db
		
		if self.run_type in [2,3]:
			inputData = self.registerAllInputFiles(workflow, self.inputDir, input_site_handler=self.input_site_handler, \
									checkEmptyVCFByReading=self.checkEmptyVCFByReading,\
									pegasusFolderName=self.pegasusFolderName,\
									maxContigID=self.maxContigID, \
									minContigID=self.minContigID,  db_vervet=db_vervet, \
									needToKnowNoOfLoci=abs(1-self.notToKnowNoOfLoci),\
									minNoOfLociInVCF=self.minNoOfLociInVCF)	#ignore files with too few loci
			inputF = inputData.jobDataLs[0].vcfFile
			vcfFile = VCFFile(inputFname=inputF.abspath)
			alignmentLs = db_vervet.getAlignmentsFromVCFSampleIDList(vcfFile.getSampleIDList())
			del vcfFile
		
		cumulativeMedianDepth = db_vervet.getCumulativeAlignmentMedianDepth(alignmentLs=pdata.alignmentLs, \
											defaultSampleAlignmentDepth=self.defaultSampleAlignmentDepth)
		
		registerReferenceData = pdata.registerReferenceData
		
		
		if self.run_type==1:
			#chr2size = set(['Contig149'])	#temporary when testing Contig149
			#chr2size = set(['1MbBAC'])	#temporary when testing the 1Mb-BAC (formerly vervet_path2)
			#2012.6.12
			#self.outputAlignmentDepthAndOthersForFilter(db_vervet=db_vervet, outputFname=self.alnStatForFilterFname, \
			#									ref_ind_seq_id=self.ref_ind_seq_id, \
			#									foldChange=self.depthFoldChange, minGQ=30)	#minGQ doesn't matter anymore.
			self.addGenotypeCallJobs(workflow=workflow, alignmentDataLs=pdata.alignmentDataLs, chr2IntervalDataLs=self.chr2IntervalDataLs, \
						registerReferenceData=registerReferenceData, \
						site_handler=self.site_handler, input_site_handler=self.input_site_handler,\
						needFastaIndexJob=self.needFastaIndexJob, needFastaDictJob=self.needFastaDictJob, \
						intervalSize=self.intervalSize, intervalOverlapSize=self.intervalOverlapSize, \
						site_type=self.site_type, data_dir=self.data_dir,\
						outputDirPrefix="",\
						genotypeCallerType=self.genotypeCallerType,\
						cumulativeMedianDepth=cumulativeMedianDepth,\
						transferOutput=True)
		elif self.run_type in [2, 3]:
			self.addTrioCallerJobsONVCFFiles(workflow=workflow, alignmentLs=alignmentLs, inputData=inputData, \
						samtools=workflow.samtools, \
						genotyperJava=workflow.genotyperJava,  SelectVariantsJava=workflow.SelectVariantsJava, \
						GenomeAnalysisTKJar=workflow.GenomeAnalysisTKJar, \
						addOrReplaceReadGroupsJava=workflow.addOrReplaceReadGroupsJava, AddOrReplaceReadGroupsJar=workflow.AddOrReplaceReadGroupsJar, \
						CreateSequenceDictionaryJava=workflow.CreateSequenceDictionaryJava, CreateSequenceDictionaryJar=workflow.CreateSequenceDictionaryJar, \
						MergeSamFilesJar=workflow.MergeSamFilesJar, \
						BuildBamIndexFilesJava=workflow.BuildBamIndexFilesJava, BuildBamIndexJar=workflow.BuildBamIndexJar, \
						mv=workflow.mv, CallVariantBySamtools=workflow.CallVariantBySamtools, \
						trioCallerPath=self.trioCallerPath, trioCallerWrapper=workflow.trioCallerWrapper, \
						replicateIndividualTag=self.replicateIndividualTag, treatEveryOneIndependent=self.treatEveryOneIndependent,\
						bgzip_tabix=workflow.bgzip_tabix, vcf_convert=workflow.vcf_convert, \
						vcf_isec=workflow.vcf_isec, vcf_concat=workflow.vcf_concat, \
						concatGATK=workflow.concatGATK, concatSamtools=workflow.concatSamtools,\
						ligateVcf=self.ligateVcf, ligateVcfExecutableFile=self.ligateVcfExecutableFile,\
						registerReferenceData=registerReferenceData, \
						namespace=workflow.namespace, version=workflow.version, site_handler=self.site_handler, input_site_handler=self.input_site_handler,\
						needFastaIndexJob=self.needFastaIndexJob, needFastaDictJob=self.needFastaDictJob, \
						outputDirPrefix="", \
						intervalSize=self.intervalSize, intervalOverlapSize=self.intervalOverlapSize, \
						site_type=self.site_type, data_dir=self.data_dir,\
						onlyKeepBiAllelicSNP=self.onlyKeepBiAllelicSNP, maxSNPMissingRate=self.maxSNPMissingRate,\
						alnStatForFilterF=None, cumulativeMedianDepth=cumulativeMedianDepth,\
						run_type=self.run_type, transferOutput=True)
		
		self.end_run()
Example #11
	def run(self):
		"""
		2012.7.13
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		
		session.begin()
		if not self.data_dir:
			self.data_dir = self.db_vervet.data_dir
		data_dir = self.data_dir
		
		realPath = os.path.realpath(self.inputFname)
		logMessage = "file %s.\n"%(self.inputFname)
		if NextGenSeq.isFileNameVCF(realPath, includeIndelVCF=True) and \
				not NextGenSeq.isVCFFileEmpty(realPath, checkContent=self.checkEmptyVCFByReading):
			vcfFile = VCFFile(inputFname=self.inputFname)
			
			individualAlignmentLs = self.getAlignmentLsFromVCF(db_vervet=self.db_vervet, vcfFile=vcfFile)
			
			genotypeMethod = self.db_vervet.getGenotypeMethod(short_name=self.genotypeMethodShortName, \
															individualAlignmentLs=individualAlignmentLs,\
															no_of_individuals=len(individualAlignmentLs), no_of_loci=None,\
															data_dir=self.data_dir)
			self.checkIfAlignmentListMatchMethodDBEntry(individualAlignmentLs, genotypeMethod, session)
			
			pdata = self.getNoOfLociFromVCFFile(vcfFile)
			chromosome2noOfLoci = pdata.chromosome2noOfLoci
			no_of_loci = pdata.no_of_loci
			if no_of_loci>0:	#file with zero loci could have identical md5sum
				try:
					md5sum = utils.get_md5sum(realPath)
				except:
					sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
					import traceback
					traceback.print_exc()
					self.cleanUpAndExitOnFailure(exitCode=4)
			else:
				md5sum = None
			"""
			db_entry = VervetDB.GenotypeFile.query.filter_by(md5sum=md5sum).first()
			if db_entry:
				sys.stderr.write("Warning: another file %s with the identical md5sum %s as this file %s is already in db.\n"%\
									(db_entry.path, md5sum, realPath))
				session.rollback()
				#2012.8.3 when the jobs are clustered into one merged job and it failed halfway
				# and retried elsewhere, the redundancy check should not exit with non-zero. otherwise the merged job would fail again. 
				self.cleanUpAndExitOnFailure(exitCode=0)
			"""
			no_of_individuals = len(individualAlignmentLs)
			no_of_chromosomes = len(chromosome2noOfLoci)
			if no_of_chromosomes == 1:	#2012.8.30 use 1st chromosome
				chromosome = chromosome2noOfLoci.keys()[0]
			else:
				chromosome = None
			genotypeFile = self.db_vervet.getGenotypeFile(genotype_method=genotypeMethod,\
										chromosome=chromosome, format=self.format, path=None, file_size=None, md5sum=md5sum,\
										original_path=realPath, no_of_individuals=no_of_individuals, no_of_loci=no_of_loci,\
										data_dir=self.data_dir, no_of_chromosomes=no_of_chromosomes)
			if genotypeFile.id and genotypeFile.path:
				isPathInDB = self.db_vervet.isPathInDBAffiliatedStorage(relativePath=genotypeFile.path, data_dir=self.data_dir)
				if isPathInDB==-1:
					sys.stderr.write("Error while updating genotypeFile.path with the new path, %s.\n"%(genotypeFile.path))
					self.cleanUpAndExitOnFailure(exitCode=isPathInDB)
				elif isPathInDB==1:	#successful exit, entry already in db
					sys.stderr.write("Warning: file %s is already in db.\n"%\
										(genotypeFile.path))
					session.rollback()
					self.cleanUpAndExitOnFailure(exitCode=0)
				else:	#not in db affiliated storage, keep going.
					pass
			#move the file and update the db_entry's path as well
			inputFileBasename = os.path.basename(self.inputFname)
			relativePath = genotypeFile.constructRelativePath(sourceFilename=inputFileBasename)
			exitCode = self.db_vervet.moveFileIntoDBAffiliatedStorage(db_entry=genotypeFile, filename=inputFileBasename, \
									inputDir=os.path.split(self.inputFname)[0], dstFilename=os.path.join(self.data_dir, relativePath), \
									relativeOutputDir=None, shellCommand='cp -rL', \
									srcFilenameLs=self.srcFilenameLs, dstFilenameLs=self.dstFilenameLs,\
									constructRelativePathFunction=genotypeFile.constructRelativePath)
			
			if exitCode!=0:
				sys.stderr.write("Error: moveFileIntoDBAffiliatedStorage() exits with %s code.\n"%(exitCode))
				session.rollback()
				self.cleanUpAndExitOnFailure(exitCode=exitCode)
			
			#copy the tbi (tabix) index file if it exists
			tbiFilename = '%s.tbi'%(realPath)
			if os.path.isfile(tbiFilename):
				srcFilename = tbiFilename
				dstFilename = os.path.join(self.data_dir, '%s.tbi'%(genotypeFile.path))
				utils.copyFile(srcFilename=srcFilename, dstFilename=dstFilename)
				logMessage += "tbi file %s has been copied to %s.\n"%(srcFilename, dstFilename)
			## 2012.7.17 commented out because md5sum is calculated above
			#db_vervet.updateDBEntryMD5SUM(db_entry=genotypeFile, data_dir=data_dir)
			#2012.7.17 record the size of db_entry.path (folder or file)
			self.db_vervet.updateDBEntryPathFileSize(db_entry=genotypeFile, data_dir=self.data_dir)
			
			vcfFile.close()
			logMessage += "%s individuals, %s loci, md5sum=%s.\n"%(no_of_individuals, no_of_loci, md5sum)
		else:
			logMessage += " is empty (no loci) or not VCF file.\n"
		self.outputLogMessage(logMessage)
		
		if self.commit:
			try:
				session.flush()
				session.commit()
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				self.cleanUpAndExitOnFailure(exitCode=3)
		else:
			session.rollback()
			#delete all target files but exit gracefully (exit 0)
			self.cleanUpAndExitOnFailure(exitCode=0)
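utils.get_md5sum comes from pymodule and is not shown here; a minimal standard-library stand-in (an assumption about its behavior: hex MD5 digest of the file content, read in chunks so a large VCF never has to fit in memory) could look like:

import hashlib

def get_md5sum(path, chunk_size=1 << 20):
	m = hashlib.md5()
	f = open(path, 'rb')
	try:
		chunk = f.read(chunk_size)
		while chunk:
			m.update(chunk)
			chunk = f.read(chunk_size)
	finally:
		f.close()
	return m.hexdigest()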