Python MatrixFile.writerow 예제들, pymodule.MatrixFile.writerow Python 예제들

예제 #1

0

파일 보기

파일: SplitPlinkLMendelFileSNPIDIntoChrPosition.py 프로젝트: bopopescu/gwasmodules

    def run(self):
        """
        Rewrite the input matrix file row by row into a tab-delimited output.

        Every row read from self.inputFname is passed through
        self.processRow() and the returned row is written to
        self.outputFname, below a fixed six-column header
        (SNPID, oldChromosome, Chromosome, Start, Stop, N).
        The number of processed rows is reported on stderr.

        2013.07.24
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        reader = MatrixFile(inputFname=self.inputFname)
        writer = None
        try:
            reader.constructColName2IndexFromHeader()
            writer = MatrixFile(inputFname=self.outputFname,
                                openMode='w',
                                delimiter='\t')
            writer.writeHeader(
                ["SNPID", "oldChromosome", "Chromosome", "Start", "Stop", "N"])

            counter = 0
            for row in reader:
                writer.writerow(self.processRow(row))
                counter += 1
            sys.stderr.write("%s lines processed.\n" % (counter))
        finally:
            # Close explicitly (instead of relying on "del" and garbage
            # collection) so the output is flushed even if a row fails.
            reader.close()
            if writer is not None:
                writer.close()

예제 #2

0

파일 보기

	def run(self):
		"""
		Mark low-quality genotype calls of self.sampleID as missing in a VCF.

		For every record of the input VCF, the reads around the site are fetched
		from the alignment file and summarised by
		self.returnLocusLowMapQualityIndicator(). A locus is treated as low
		quality when the sum of two indicators is positive:
			- out-of-depth indicator: 1 if the read count is outside
				[alignmentMedianDepth/alignmentDepthFold,
				alignmentMedianDepth*alignmentDepthFold], else 0;
			- low-mapping-quality indicator, as returned by the helper.
		For such loci the genotype of self.sampleID is set to "./." before the
		record is written to the output VCF.

		One statistics row per locus is written to self.missingStatFname and a
		summary line is written to stderr.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)

		reader = VCFFile(inputFname=self.inputFname)

		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

		# the output VCF carries the same meta information and header as the input
		writer = VCFFile(outputFname=self.outputFname, openMode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()

		statWriter = MatrixFile(self.missingStatFname, openMode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason',
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)

		counter = 0
		real_counter = 0
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold

		for vcfRecord in reader:
			locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
			# start and end in fetch() are 0-based.
			alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
			locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,
												minMapQGoodRead=self.minMapQGoodRead,
												minFractionOfGoodRead=self.minFractionOfGoodRead)
			locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
			depth = locusLowMapQData.totalNoOfReads
			if depth>=minDepth and depth<=maxDepth:
				locusOutOfDepthIndicator = 0	#good
			else:
				locusOutOfDepthIndicator = 1

			locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
			data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,
						1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead,
						locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
			statWriter.writerow(data_row)
			if locusLowQualityIndicator>0:
				real_counter += 1
				# modify the VCF record: set the genotype of this sample to missing
				vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
			#2014.1.4 output VCF record
			writer.writeVCFRecord(vcfRecord)
			counter += 1
		reader.close()
		# the alignment file was previously left open
		alignmentFile.close()
		statWriter.close()
		writer.close()

		# an empty input VCF must not crash the summary with a ZeroDivisionError
		if counter>0:
			fraction = real_counter/float(counter)
		else:
			fraction = -1
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, fraction))

예제 #3

0

파일 보기

	def run(self):
		"""
		Output pairwise genotype concordance among entries sharing one SNP position.

		self.readInSNPID2GenotypeVectorLs() supplies a mapping from a SNP position
		key (whose first two elements are chromosome and position) to a list of
		genotype vectors. For every position with more than one vector, each
		unordered pair of vectors is compared sample by sample on the 'GT' field:
		pairs where either call is 'NA' are skipped, and calls match when their
		SNP.nt2number codes are equal. One row
		(chromosome, position, noOfMatches, noOfTotal, concordance) is written per
		pair; concordance is -1 when no sample could be compared.

		A summary is written to stderr.
		"""
		import itertools

		if self.debug:
			import pdb
			pdb.set_trace()

		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)

		snp_pos2genotypeVectorLs = self.readInSNPID2GenotypeVectorLs(self.inputFname).snp_pos2returnData

		writer = MatrixFile(self.outputFname, openMode='w', delimiter='\t')
		header = ['chromosome', 'position', 'noOfMatches', 'noOfTotal', 'concordance']
		writer.writeHeader(header)

		counter = 0
		real_counter = 0
		no_of_pairs = 0
		for key in sorted(snp_pos2genotypeVectorLs.keys()):
			counter += 1
			chromosome, position = key[:2]
			genotypeVectorLs = snp_pos2genotypeVectorLs.get(key)
			if len(genotypeVectorLs)>1:
				real_counter += 1
				# combinations() yields the same (k, l) pairs with k<l, in the same order,
				# as two nested index loops would
				for genotypeVector0, genotypeVector1 in itertools.combinations(genotypeVectorLs, 2):
					no_of_pairs += 1
					noOfMatches = 0
					noOfTotal = 0
					for j in xrange(len(genotypeVector0)):
						call1 = genotypeVector0[j]['GT']
						call2 = genotypeVector1[j]['GT']
						if call1!='NA' and call2!='NA':
							noOfTotal += 1
							if SNP.nt2number[call1]==SNP.nt2number[call2]:
								noOfMatches += 1
					if noOfTotal>0:
						concordance = float(noOfMatches)/float(noOfTotal)
					else:
						concordance = -1
					writer.writerow([chromosome, position, noOfMatches, noOfTotal, concordance])
		writer.close()

		# input without any SNP must not crash the summary with a ZeroDivisionError
		if counter>0:
			fraction = real_counter/float(counter)
		else:
			fraction = -1
		sys.stderr.write("%s (out of %s, %s) snps have >1 same-position entries. %s pairs.\n"%(real_counter, counter,
												fraction, no_of_pairs))

예제 #4

0

파일 보기

파일: OutputVCFAlignmentDepthRange.py 프로젝트: mjmontague/vervet-web

	def outputAlignmentDepthAndOthersForFilter(self, db_vervet=None, inputFname=None, \
						ref_ind_seq_id=None, depthFoldChange=2, minGQ=30, \
						outputFname=None, outputFileFormat=1):
		"""
		Write one tab-delimited row per alignment describing its depth.

		Alignments come from db_vervet.getAlignmentsFromVCFFile() when inputFname
		is given, otherwise from db_vervet.getAlignments(); either way they are
		then passed through db_vervet.filterAlignments() using settings stored on
		self.

		outputFileFormat==1:
			columns alignmentID, medianDepth, individualID
		any other value:
			columns alignmentID, minDepth, maxDepth, minGQ, where
			minDepth = median_depth/depthFoldChange (set to 0 when within 0.001
			of 0) and maxDepth = median_depth*depthFoldChange.

		NOTE(review): the ref_ind_seq_id argument is not used in this method;
			self.ref_ind_seq_id is passed to getAlignments() instead -- confirm
			this is intended.

		2012.6.12
			added argument db_vervet, moved from FilterVCFPipeline.py
		2011-9-2
		"""
		sys.stderr.write("Outputting alignment (from %s) coverage to %s ..."%(inputFname, outputFname))
		if not inputFname:
			candidateAlignments = db_vervet.getAlignments(
				ref_ind_seq_id=self.ref_ind_seq_id,
				alignment_method_id=self.alignment_method_id,
				data_dir=self.data_dir,
				local_realigned=self.local_realigned,
				outdated_index=self.alignment_outdated_index,
				completedAlignment=self.completedAlignment,
				reduce_reads=self.reduce_reads)
		else:
			candidateAlignments = db_vervet.getAlignmentsFromVCFFile(inputFname=inputFname)

		keptAlignments = db_vervet.filterAlignments(
			data_dir=self.data_dir,
			alignmentLs=candidateAlignments,
			sequence_filtered=self.sequence_filtered,
			mask_genotype_method_id=None,
			parent_individual_alignment_id=None,
			excludeContaminant=self.excludeContaminant,
			local_realigned=self.local_realigned,
			reduce_reads=self.reduce_reads,
			completedAlignment=self.completedAlignment,
			alignment_method_id=self.alignment_method_id,
			outdated_index=self.alignment_outdated_index)

		depthWriter = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
		medianDepthOnly = (outputFileFormat==1)
		if medianDepthOnly:
			depthWriter.writeHeader(['alignmentID', 'medianDepth', "individualID"])
		else:
			depthWriter.writeHeader(['alignmentID', 'minDepth', 'maxDepth', 'minGQ'])

		noOfEntries = 0
		for alignment in keptAlignments:
			alignmentID = alignment.read_group
			if medianDepthOnly:
				outputRow = [alignmentID, alignment.median_depth,
							alignment.individual_sequence.individual.id]
			else:
				lowerBound = alignment.median_depth/float(depthFoldChange)
				# a lower bound this close to 0 is output as exactly 0
				if abs(lowerBound-0)<=0.001:
					lowerBound = 0
				upperBound = alignment.median_depth*float(depthFoldChange)
				outputRow = [alignmentID, lowerBound, upperBound, minGQ]
			depthWriter.writerow(outputRow)
			noOfEntries += 1
		depthWriter.close()
		sys.stderr.write("%s entries fetched.\n"%(noOfEntries))

예제 #5

0

파일 보기

	def outputGenotypeMarkedMissingStat(self, outputFname=None, \
									individual_index2no_of_genotype_marked_missing=None,\
									individualIDList=None):
		"""
		Write the number of genotypes marked missing for each individual.

		Nothing is written unless outputFname is non-empty and
		individual_index2no_of_genotype_marked_missing is not None.
		Output is tab-delimited with columns individualID and
		noOfGenotypesMarkedMissing; each individual ID is looked up in
		individualIDList by the index used as key in the mapping.

		2013.07.24
		"""
		if not outputFname or individual_index2no_of_genotype_marked_missing is None:
			return
		statWriter = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
		statWriter.writeHeader(["individualID", "noOfGenotypesMarkedMissing"])
		for sampleIndex, noOfMissing in individual_index2no_of_genotype_marked_missing.iteritems():
			statWriter.writerow([individualIDList[sampleIndex], noOfMissing])
		statWriter.close()

예제 #6

0

파일 보기

파일: CheckTwoVCFOverlap.py 프로젝트: mjmontague/vervet-web

	def outputOverlapSites(self, overlapping_sites_set=None, outputFname=None):
		"""
		Write the overlapping sites, sorted, to a tab-delimited file.

		overlapping_sites_set is a set of (chromosome, pos) tuples.
		Output is 3-column with a header row (chromosome, position, random).
		The last column is always 0 to mimic output of
		CalculateSNPMismatchRateOfTwoVCF.py:
			chromosome	position	0

		2011-12.9
		"""
		sys.stderr.write("Outputting overlap %s sites ..."%(len(overlapping_sites_set)))
		header = ['chromosome', 'position', 'random']
		overlapping_sites_list = sorted(overlapping_sites_set)
		writer = MatrixFile(outputFname, openMode='w', delimiter='\t')
		writer.writerow(header)
		for chromosome, pos in overlapping_sites_list:
			writer.writerow([chromosome, pos, 0])
		# the writer was previously never closed; close it so the rows are flushed
		writer.close()
		sys.stderr.write("%s sites.\n"%(len(overlapping_sites_list)))

예제 #7

0

파일 보기

파일: CalculateLociAndGenomeCoveredAtEachSwitchFrequencyThreshold.py 프로젝트: bopopescu/gwasmodules

    def run(self):
        """
        Output what remains covered at each maximum switch frequency.

        self.readInStats() supplies data_matrix (rows of switchFrequency,
        regionSpan, noOfLoci) plus totalSpan and totalNoOfLoci. The rows are
        sorted in place in descending order; walking down the sorted rows,
        each row's span and locus count are subtracted from the running
        remainder, and one output row holds the switch frequency, the
        remaining span and locus count, and both as fractions of the totals.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        targetFolder = os.path.split(self.outputFname)[0]
        if targetFolder and not os.path.isdir(targetFolder):
            os.makedirs(targetFolder)

        switchPointData = self.readInStats(inputFname=self.inputFname)

        sys.stderr.write("Processing data ...")
        writer = MatrixFile(self.outputFname, openMode='w')
        writer.writeHeader([
            "maxSwitchFrequency", "genomeCovered", 'genomeCoveredFraction',
            "noOfLoci", 'noOfLociFraction'
        ])

        statRows = switchPointData.data_matrix
        totalSpan = switchPointData.totalSpan
        totalNoOfLoci = switchPointData.totalNoOfLoci

        # highest switch frequency first
        statRows.sort(reverse=True)

        # running remainders; they start at the totals and shrink row by row
        remainingSpan = totalSpan
        remainingNoOfLoci = totalNoOfLoci
        for switchFrequency, regionSpan, noOfLoci in statRows:
            remainingSpan = remainingSpan - regionSpan
            remainingNoOfLoci = remainingNoOfLoci - noOfLoci
            writer.writerow([
                switchFrequency,
                remainingSpan,
                remainingSpan / float(totalSpan),
                remainingNoOfLoci,
                remainingNoOfLoci / float(totalNoOfLoci),
            ])
        writer.close()
        sys.stderr.write(".\n")

예제 #8

0

파일 보기

class ComputeLiftOverLocusProbability(parentClass):
    # Re-use the module docstring as the class docstring.
    __doc__ = __doc__
    option_default_dict = parentClass.option_default_dict.copy()
    # The backslash-newline inside the help strings below continues the
    # literal; the tab that starts the following line is part of the string.
    option_default_dict.update({
         ('locusIntervalDeltaOutputFname', 1, ): ['', '', 1, 'file that would contain delta of intervals from old and new coordinate system. \
	Used to check if normal distribution on each chromosome. Output format: oldChromosome, oldStart, oldStop, newChromosome, newStart, newStop, intervalDelta.', ],
         ('startPosition', 0, int):[None, '', 1, 'probability for loci whose start positions are bigger than this argument would be computed.\
	Model parameters are estimated using all input data. This argument is used to avoid edge/boundary effect.'],
         ('stopPosition', 0, int):[None, '', 1, 'probability for loci whose stop positions are less than this argument would be computed.\
	Model parameters are estimated using all input data. This argument is used to avoid edge/boundary effect.'],
         })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass all arguments straight to parentClass.__init__().
        """
        parentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def setup(self, **keywords):
        """
        Initialise the counters and maps and write both output headers.

        State created here:

        noOfTotalIntervals = 0
        noOfCrossChromosomeIntervals = 0

        targetChromosome2mapData: target chromosome => map data
            intervalDeltaList   => median
            orientation  (queryStrand)
                0=forward
                1=backward
            mean    => using 80% of data (sort the delta list, then take
                       10% to 90% of the list)
            stddev  => if stddev is zero, use 1.

        locusKey2mapData: locusKey (oldChromosome, oldStart, oldStop) => map data
            targetCoordinate (newChromosome, newStart, newStop).
            leftIntervalDelta: None = boundary
            rightIntervalDelta: None = boundary, 10E10 = cross chromosome

            probability: max( P(SNP_i_left_interval), P(SNP_i_right_interval)).
                P(interval):
                    If one interval is on the same chromosome,
                        P(target-chromosome)*P(interval delta size)
                    If not, P(chromosome-cross event).

        Not implemented: for a whole genome input (rather than a window),
            an RBTree of windows should be used to counter regional effect.

        2013.11.24
            run before anything is run
        """
        AbstractMatrixFileWalker.setup(self, **keywords)

        # floats, so that later divisions are not integer divisions
        self.noOfTotalIntervals = 0.0
        self.noOfCrossChromosomeIntervals = 0.0

        self.targetChromosome2mapData = {}
        self.locusKey2mapData = {}
        self.previousLocusData = None

        # write header for the main output
        header = [
            'oldChromosome', 'oldStart', 'oldStop', 'oldStrand',
            'newChromosome', 'newStart', 'newStop', 'mapPvalue'
        ]
        self.writer.writerow(header)
        self.invariantPData.headerOutputted = True  # avoid double header output

        # open the other writer and write header
        self.sideOutput = MatrixFile(self.locusIntervalDeltaOutputFname,
                                     openMode='w',
                                     delimiter='\t')
        header = [
            'oldChromosome', 'oldStart', 'oldStop', 'oldStrand',
            'newChromosome', 'newStart', 'newStop', 'intervalDelta'
        ]
        self.sideOutput.writeHeader(header)

    def processRow(self, row=None, pdata=None):
        """
        Register one lifted-over locus and the interval delta to its predecessor.

        The columns queryStrand, queryChromosome, queryStart, queryStop,
        newChr, newRefStart and newRefStop are looked up by name through
        pdata.col_name2index. Starting from the second row, an interval delta
        is computed (10E10 when the previous and current locus are on
        different new chromosomes), written to the side output, and stored as
        the right delta of the previous locus and the left delta of the
        current one.

        Always returns 1.

        2012.10.7
        """
        returnValue = 1
        self.col_name2index = getattr(pdata, 'col_name2index', None)
        queryStrandIndex = self.col_name2index.get("queryStrand")

        queryChromosomeIndex = self.col_name2index.get("queryChromosome")
        queryStartIndex = self.col_name2index.get("queryStart")
        queryStopIndex = self.col_name2index.get("queryStop")

        newChrIndex = self.col_name2index.get("newChr")
        newRefStartIndex = self.col_name2index.get("newRefStart")
        newRefStopIndex = self.col_name2index.get("newRefStop")

        queryStrand = row[queryStrandIndex]
        queryChromosome = row[queryChromosomeIndex]
        queryStart = int(row[queryStartIndex])
        queryStop = int(row[queryStopIndex])

        newChr = row[newChrIndex]
        newRefStart = int(row[newRefStartIndex])
        newRefStop = int(row[newRefStopIndex])

        # create current locus data
        locusKey = (queryChromosome, queryStart, queryStop)
        currentLocusData = PassingData(locusKey=locusKey, queryStrand=queryStrand,
                                       queryChromosome=queryChromosome,
                                       queryStart=queryStart, queryStop=queryStop,
                                       newChr=newChr, newRefStart=newRefStart,
                                       newRefStop=newRefStop)

        # insert entry into locusKey2mapData
        self.locusKey2mapData[locusKey] = PassingData(locusData=currentLocusData,
                                                      leftIntervalDelta=None,
                                                      rightIntervalDelta=None,
                                                      mapProbability=None)
        if self.previousLocusData is not None:
            # calculate interval delta
            if self.previousLocusData.newChr != currentLocusData.newChr:
                # sentinel value marking a cross-chromosome interval
                intervalDelta = 10E10
                self.noOfCrossChromosomeIntervals += 1
            else:
                # NOTE(review): both spans are computed from the current locus
                # only (start - stop); the previous locus is not involved.
                # Confirm whether the distance between the previous and the
                # current locus was intended here.
                querySpan = currentLocusData.queryStart - currentLocusData.queryStop
                targetSpan = currentLocusData.newRefStart - currentLocusData.newRefStop
                if queryStrand == '+':
                    intervalDelta = targetSpan - querySpan
                else:
                    intervalDelta = targetSpan + querySpan
                # insert it into self.targetChromosome2mapData
                if currentLocusData.newChr not in self.targetChromosome2mapData:
                    self.targetChromosome2mapData[currentLocusData.newChr] = PassingData(
                        intervalDeltaList=[],
                        orientation=queryStrand,
                        mean=None,
                        stddev=None,
                        probability=None)
                self.targetChromosome2mapData[
                    currentLocusData.newChr].intervalDeltaList.append(intervalDelta)

            # output to the side
            self.sideOutput.writerow([
                currentLocusData.queryChromosome,
                currentLocusData.queryStart, currentLocusData.queryStop,
                currentLocusData.queryStrand,
                currentLocusData.newChr, currentLocusData.newRefStart,
                currentLocusData.newRefStop, intervalDelta])

            # assign it as right interval delta of previous locus
            self.locusKey2mapData[
                self.previousLocusData.locusKey].rightIntervalDelta = intervalDelta

            # assign it as left interval delta of current locus.
            self.locusKey2mapData[locusKey].leftIntervalDelta = intervalDelta

            self.noOfTotalIntervals += 1

        self.previousLocusData = currentLocusData
        return returnValue

    def calculateLocusMapProbabilityGivenIntervalDelta(
            self,
            intervalDelta=None,
            targetChromosomeMapData=None,
            crossChromosomeProbability=None):
        """
        Return the map probability implied by one interval delta.

        - intervalDelta is None: 1.
        - intervalDelta == 10E10 (cross-chromosome sentinel):
            crossChromosomeProbability.
        - otherwise: targetChromosomeMapData.probability times the two-sided
            p-value of intervalDelta under a normal distribution with
            targetChromosomeMapData.mean and targetChromosomeMapData.stddev.

        2013.11.25
        """
        mapProbability = 1
        if intervalDelta is not None:
            if intervalDelta == 10E10:
                mapProbability *= crossChromosomeProbability
            else:
                lessThanGivenValueProb = norm.cdf(
                    intervalDelta,
                    loc=targetChromosomeMapData.mean,
                    scale=targetChromosomeMapData.stddev)
                if intervalDelta > targetChromosomeMapData.mean:  # two-sided p-value
                    deltaProb = 2 * (1 - lessThanGivenValueProb)
                else:
                    deltaProb = 2 * lessThanGivenValueProb
                mapProbability *= targetChromosomeMapData.probability * deltaProb

        return mapProbability

    def reduce(self, **keywords):
        """
        Estimate the per-chromosome model and output a map p-value per locus.

        When at least one interval was seen: for every target chromosome the
        probability of an interval falling on it is estimated, together with
        the mean and standard deviation of the middle part (10% to 90%) of its
        sorted interval deltas (stddev falls back to 1 when it is zero or
        cannot be estimated). Every locus, in sorted key order, then gets the
        larger of its left and right interval probabilities; a missing side
        counts as 0.

        When no interval was seen, every locus is output with p-value 0.001.

        Finally the side output is closed and
        AbstractMatrixFileWalker.reduce() is called.

        2012.10.15
            run after all files have been walked through
        """
        counter = 0
        real_counter = 0

        locusKeyList = sorted(self.locusKey2mapData.keys())

        sys.stderr.write("%s target chromosomes, %s cross-chromosome intervals, %s total intervals .\n "%\
            (len(self.targetChromosome2mapData), self.noOfCrossChromosomeIntervals, self.noOfTotalIntervals))

        if self.noOfTotalIntervals > 0:
            sys.stderr.write(
                "Running estimates for each target chromosome ... ")
            # estimates for each chromosome
            self.crossChromosomeProbability = float(
                self.noOfCrossChromosomeIntervals) / self.noOfTotalIntervals
            for targetChromosome in self.targetChromosome2mapData:
                mapData = self.targetChromosome2mapData.get(targetChromosome)
                # overall probability for an interval to be on this chromosome
                if len(mapData.intervalDeltaList) == 0:  # just one crossing event
                    mapData.probability = 1 / float(self.noOfTotalIntervals)
                else:
                    mapData.probability = len(
                        mapData.intervalDeltaList) / float(self.noOfTotalIntervals)
                # estimate mean and stddev
                mapData.intervalDeltaList.sort()
                startIndex = max(0, int(len(mapData.intervalDeltaList) * 0.1))
                stopIndex = max(
                    int(len(mapData.intervalDeltaList) * 0.9) + 1, 1)
                if startIndex >= stopIndex:
                    stopIndex = startIndex + 1
                robustDataList = mapData.intervalDeltaList[startIndex:stopIndex]

                stddev = 1
                if len(robustDataList) > 0:
                    mapData.mean = numpy.mean(robustDataList)
                    if len(robustDataList) > 1:
                        stddev = numpy.std(robustDataList)
                else:
                    mapData.mean = 0
                if stddev == 0:
                    stddev = 1
                mapData.stddev = stddev
            sys.stderr.write(".\n")

            # output
            sys.stderr.write("Output %s SNPs with map p-value ..." %
                             (len(locusKeyList)))
            for locusKey in locusKeyList:
                counter += 1
                locusMapData = self.locusKey2mapData.get(locusKey)
                locusData = locusMapData.locusData
                if locusMapData.leftIntervalDelta is not None:
                    leftProbability = self.calculateLocusMapProbabilityGivenIntervalDelta(
                        intervalDelta=locusMapData.leftIntervalDelta,
                        targetChromosomeMapData=self.targetChromosome2mapData.get(locusData.newChr),
                        crossChromosomeProbability=self.crossChromosomeProbability)
                else:
                    leftProbability = 0
                if locusMapData.rightIntervalDelta is not None:
                    rightProbability = self.calculateLocusMapProbabilityGivenIntervalDelta(
                        intervalDelta=locusMapData.rightIntervalDelta,
                        targetChromosomeMapData=self.targetChromosome2mapData.get(locusData.newChr),
                        crossChromosomeProbability=self.crossChromosomeProbability)
                else:
                    rightProbability = 0
                mapProbability = max(leftProbability, rightProbability)
                data_row = [locusData.queryChromosome,
                            locusData.queryStart, locusData.queryStop, locusData.queryStrand,
                            locusData.newChr, locusData.newRefStart, locusData.newRefStop,
                            mapProbability]
                self.writer.writerow(data_row)
                real_counter += 1
            sys.stderr.write("\n")
        else:  # single SNP (give a low probability)
            sys.stderr.write(
                "Zero intervals, output %s SNPs with 0.001 map p-value ..." %
                (len(locusKeyList)))
            for locusKey in locusKeyList:
                counter += 1
                locusMapData = self.locusKey2mapData.get(locusKey)
                locusData = locusMapData.locusData
                mapProbability = 0.001
                data_row = [locusData.queryChromosome,
                            locusData.queryStart, locusData.queryStop, locusData.queryStrand,
                            locusData.newChr, locusData.newRefStart, locusData.newRefStop,
                            mapProbability]
                self.writer.writerow(data_row)
                real_counter += 1
            sys.stderr.write("\n")

        if counter > 0:
            fraction = float(real_counter) / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s/%s (%.3f) outputted.\n" %
                         (real_counter, counter, fraction))

        self.sideOutput.close()
        # close the self.invariantPData.writer
        AbstractMatrixFileWalker.reduce(self, **keywords)

예제 #9

0

파일 보기

	def outputSwitchPointInfo(self, querySNPID2NewReferenceCoordinateLs=None, outputFname=None):
		"""
		Output per-old-chromosome switch point statistics.

		A switch point is counted, among loci with exactly one new coordinate
		and in sorted old-coordinate order, when
			- two adjacent loci map to different new chromosomes, or
			- they stay on the same new chromosome but the direction of
				newRefStart (increasing vs. decreasing) flips.
		Loci with more than one new coordinate are counted in noOfLoci but
		otherwise skipped.

		Output columns (tab-delimited): oldChromosome, noOfSwitchPoints,
		regionSpan, noOfLociWithUniqueHit, noOfSwitchesPerLocus, noOfLoci.
		noOfSwitchesPerLocus is -1 when a chromosome has no unique-hit locus.
		noOfLoci is the number of loci of that old chromosome; the global number
		of old-coordinate keys was written here before, which is the same value
		only when the input covers a single old chromosome.

		2013.07.11
			output the switch point (adjacent sites mapped to two different chromosomes) information
		"""
		sys.stderr.write("Converting querySNPID2NewReferenceCoordinateLs to oldCoordinateKey2newCoordinateDataLs ... ")
		oldCoordinateKey2newCoordinateDataLs = {}
		counter = 0
		for newRefCoordinateLs in querySNPID2NewReferenceCoordinateLs.values():
			oldCoordinateKey = None
			counter += len(newRefCoordinateLs)
			for newRefCoordinate in newRefCoordinateLs:
				# all new coordinates of one query SNP are grouped under the
				# old coordinate of the first one
				if oldCoordinateKey is None:
					oldCoordinateKey = (newRefCoordinate.queryChromosome, newRefCoordinate.queryStart,
									newRefCoordinate.queryStop)
				oldCoordinateKey2newCoordinateDataLs.setdefault(oldCoordinateKey, []).append(newRefCoordinate)
		sys.stderr.write(" %s old coordinate keys with %s new coordinates.\n"%(len(oldCoordinateKey2newCoordinateDataLs),
																		counter))

		sys.stderr.write("Finding switch points ...")
		counter = 0
		real_counter = 0
		noOfRecordsWithMultiNewCoords = 0

		oldChromosome2SwitchData = {}

		for oldCoordinateKey in sorted(oldCoordinateKey2newCoordinateDataLs.keys()):
			counter += 1
			newRefCoordinateLs = oldCoordinateKey2newCoordinateDataLs.get(oldCoordinateKey)

			oldChromosome = oldCoordinateKey[0]

			if oldChromosome not in oldChromosome2SwitchData:
				oldChromosome2SwitchData[oldChromosome] = PassingData(noOfLociWithUniqueHit=0, noOfLoci=0,
														spanStart=oldCoordinateKey[1],
														spanStop=oldCoordinateKey[2], noOfSwitchPoints=0,
														previousNewChromosome=None, previousNewRefStart=None,
														previousNewRefStop=None,
														previousOrientationOnNewChromosome=None)

			switchData = oldChromosome2SwitchData[oldChromosome]
			switchData.noOfLoci += 1

			if len(newRefCoordinateLs)>1:
				noOfRecordsWithMultiNewCoords += 1
				continue

			switchData.noOfLociWithUniqueHit += 1
			newRefCoordinate = newRefCoordinateLs[0]

			if switchData.previousNewChromosome is not None:
				if newRefCoordinate.newChr!=switchData.previousNewChromosome:
					switchData.noOfSwitchPoints += 1
					#reset the orientation
					switchData.previousOrientationOnNewChromosome = None
				else:	#on the same chromosome
					currentOrientation = (newRefCoordinate.newRefStart - switchData.previousNewRefStart)>=0
					if switchData.previousOrientationOnNewChromosome is not None:
						if currentOrientation!=switchData.previousOrientationOnNewChromosome:
							switchData.noOfSwitchPoints += 1
					switchData.previousOrientationOnNewChromosome = currentOrientation

			#adjust the spanStop
			if newRefCoordinate.queryStop > switchData.spanStop:
				switchData.spanStop = newRefCoordinate.queryStop

			switchData.previousNewChromosome = newRefCoordinate.newChr
			switchData.previousNewRefStart = newRefCoordinate.newRefStart
			switchData.previousNewRefStop = newRefCoordinate.newRefStop
			real_counter += 1
		if counter>0:
			fraction = real_counter/float(counter)
		else:
			fraction = -1
		sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter,
																	fraction, noOfRecordsWithMultiNewCoords))

		sys.stderr.write("Outputting switch points of %s old chromosomes ..."%(len(oldChromosome2SwitchData)))
		statFile = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
		header = ['oldChromosome', "noOfSwitchPoints", "regionSpan", "noOfLociWithUniqueHit", "noOfSwitchesPerLocus", "noOfLoci"]
		statFile.writeHeader(header)
		noOfTotalSwitchPoints = 0
		noOfTotalLoci = 0
		for oldChromosome, switchData in oldChromosome2SwitchData.items():
			if switchData.noOfLociWithUniqueHit>0:
				switchPointFraction = switchData.noOfSwitchPoints/float(switchData.noOfLociWithUniqueHit)
			else:
				switchPointFraction = -1
			# per-chromosome locus count, matching the per-chromosome meaning of the other columns
			data_row = [oldChromosome, switchData.noOfSwitchPoints, switchData.spanStop-switchData.spanStart+1,
					switchData.noOfLociWithUniqueHit, switchPointFraction, switchData.noOfLoci]
			statFile.writerow(data_row)
			noOfTotalSwitchPoints += switchData.noOfSwitchPoints
			noOfTotalLoci += switchData.noOfLociWithUniqueHit
		statFile.close()
		sys.stderr.write(' %s total switch points, %s total loci with unique hit.\n'%(noOfTotalSwitchPoints, noOfTotalLoci))

예제 #10

0

파일 보기

파일: CheckTwoVCFOverlap.py 프로젝트: mjmontague/vervet-web

	def calculateOverlappingSites(self, vcfFile1=None, vcfFile2=None, outputFname=None, overlappingSitesOutputFname=None,\
						chromosome=None, chrLength=None):
		"""
		Count the sites shared by two VCF files and write a one-row summary.

		Locus IDs are the first two columns of every row of vcfFile1.reader and
		vcfFile2.reader. The summary (tab-delimited, one header row and one data
		row) holds the site counts of both inputs, the number of overlapping
		sites, the overlap as a fraction of the union and of each input, and the
		overlap normalized by a sample-size constant times chrLength. Each
		fraction is -1 when its denominator is 0; the normalized value is -1
		when chrLength is None or 0 (this used to raise an exception).

		If overlappingSitesOutputFname is given, the overlapping sites are also
		written there via self.outputOverlapSites().

		Returns PassingData with overlapping_sample_id_set and
		overlapping_sites_set.

		2013.09.10
			added argument overlappingSitesOutputFname
		2013.07.17 vcf files are no longer pre-loaded. read in locus ids first.
		2012.8.16
		"""
		writer = MatrixFile(outputFname, openMode='w', delimiter='\t')
		header = ['#chromosome', 'length', '#sitesInInput1', '#sitesInInput2', '#overlapping', 'overlappingOverTotal',
				'overlappingOverInput1', 'overlappingOverInput2', '#segregatingSitesNormalized', ]

		vcf1_locus_id_list = [(row[0], row[1]) for row in vcfFile1.reader]
		vcf2_locus_id_list = [(row[0], row[1]) for row in vcfFile2.reader]

		no_of_sites_of_input1 = len(vcf1_locus_id_list)
		no_of_sites_of_input2 = len(vcf2_locus_id_list)
		overlapping_sites_set = set(vcf1_locus_id_list)&set(vcf2_locus_id_list)
		if overlappingSitesOutputFname:
			self.outputOverlapSites(overlapping_sites_set=overlapping_sites_set, outputFname=overlappingSitesOutputFname)

		no_of_overlapping_sites = len(overlapping_sites_set)
		no_of_total_sites = no_of_sites_of_input1+no_of_sites_of_input2-no_of_overlapping_sites
		if no_of_total_sites>0:
			overlapping_fraction = no_of_overlapping_sites/float(no_of_total_sites)
		else:
			overlapping_fraction = -1

		if no_of_sites_of_input1>0:
			overlappingOverInput1 = no_of_overlapping_sites/float(no_of_sites_of_input1)
		else:
			overlappingOverInput1 = -1

		if no_of_sites_of_input2>0:
			overlappingOverInput2 = no_of_overlapping_sites/float(no_of_sites_of_input2)
		else:
			overlappingOverInput2 = -1

		no_of_samples = len(vcfFile1.sample_id2index)
		no_of_samples_in_vcf2 = len(vcfFile2.sample_id2index)
		overlapping_sample_id_set = set(vcfFile1.sample_id2index.keys()) & set(vcfFile2.sample_id2index.keys())

		if no_of_samples!=no_of_samples_in_vcf2:
			sys.stderr.write("Warning: sample size in %s is %s, in %s is %s. not matching.\n"%\
							(vcfFile1.inputFname, no_of_samples, vcfFile2.inputFname, no_of_samples_in_vcf2))

		#exclude the ref sample in the 1st column
		if no_of_samples>1:
			normalizingConstant = float(utils.sumOfReciprocals(no_of_samples*2-1))
		else:
			normalizingConstant = 1
		# chrLength defaults to None and may be 0; use the same -1 sentinel as
		# the fractions above instead of raising TypeError/ZeroDivisionError
		if chrLength:
			noOfSegregatesSitesNormalized = no_of_overlapping_sites/(normalizingConstant*chrLength)
		else:
			noOfSegregatesSitesNormalized = -1

		writer.writerow(header)
		writer.writerow([chromosome, chrLength, no_of_sites_of_input1, no_of_sites_of_input2, no_of_overlapping_sites,
						overlapping_fraction, overlappingOverInput1, overlappingOverInput2,
						noOfSegregatesSitesNormalized])
		# close explicitly rather than relying on "del" to flush the output
		writer.close()
		return PassingData(overlapping_sample_id_set=overlapping_sample_id_set,overlapping_sites_set=overlapping_sites_set)

예제 #11

0

파일 보기

파일: CheckTwoVCFOverlap.py 프로젝트: mjmontague/vervet-web

	def calculatePerSampleMismatchFraction(self, vcfFile1=None, vcfFile2=None, outputFname=None, overlapping_sample_id_set=None,\
										NA_call_encoding_set = set(['.', 'NA'])):
		"""
		Output, for each shared sample, how often its calls agree between two VCFs.

		Both VCF files are reset and fully parsed. At every locus present in both
		files, the two genotype calls of each sample in overlapping_sample_id_set
		are compared. A pair is counted only when neither call is in
		NA_call_encoding_set, and counts as a match when nt2number maps both
		calls to the same value (so unphased 'AT' and 'TA' are the same).

		Output is tab-delimited with columns sample_id, no_of_matches,
		no_of_non_NA_pairs, matchFraction; samples are in sorted order and
		matchFraction is -1 for a sample without any comparable pair.

		NA_call_encoding_set is only read, never modified, so sharing the default
		set object between calls is harmless.

		2013.08.13 bugfix, derive overlapping_sites_set by itself, rather than use calculateOverlappingSites()
		2013.07.17 vcf files are no longer pre-loaded.
		2012.8.16
		"""
		sys.stderr.write("Finding matches for each sample at overlapping sites ...")
		writer = MatrixFile(outputFname, openMode='w', delimiter='\t')
		header = ['sample_id', 'no_of_matches', 'no_of_non_NA_pairs', 'matchFraction']
		no_of_samples_to_compare = len(overlapping_sample_id_set)

		vcfFile1._resetInput()
		vcfFile1.parseFile()
		vcfFile2._resetInput()
		vcfFile2.parseFile()

		overlapping_sites_set = set(vcfFile1.locus_id_ls) & set(vcfFile2.locus_id_ls)
		sys.stderr.write(" %s overlapping loci, "%(len(overlapping_sites_set)))

		overlapping_sample_id_list = sorted(overlapping_sample_id_set)

		# the column of a sample does not depend on the locus: look it up once
		# per sample instead of once per (locus, sample)
		col_index_pair_ls = [(vcfFile1.sample_id2index.get(sample_id), vcfFile2.sample_id2index.get(sample_id))
							for sample_id in overlapping_sample_id_list]

		no_of_matches_per_sample_ls = [0]*no_of_samples_to_compare
		no_of_non_NA_pairs_per_sample_ls = [0]*no_of_samples_to_compare

		for locus_id in overlapping_sites_set:
			genotype_row1 = vcfFile1.genotype_call_matrix[vcfFile1.locus_id2row_index[locus_id]]
			genotype_row2 = vcfFile2.genotype_call_matrix[vcfFile2.locus_id2row_index[locus_id]]
			for j, (col_index1, col_index2) in enumerate(col_index_pair_ls):
				call1 = genotype_row1[col_index1]
				call2 = genotype_row2[col_index2]
				if call1 not in NA_call_encoding_set and call2 not in NA_call_encoding_set:
					no_of_non_NA_pairs_per_sample_ls[j] += 1
					#2013.07.03 bugfix, 'AT' and 'TA' should be same. no phase
					if nt2number[call1]==nt2number[call2]:
						no_of_matches_per_sample_ls[j] += 1

		writer.writerow(header)
		for sample_id, no_of_matches, no_of_non_NA_pairs in zip(overlapping_sample_id_list,
												no_of_matches_per_sample_ls, no_of_non_NA_pairs_per_sample_ls):
			if no_of_non_NA_pairs>0:
				matchFraction = no_of_matches/float(no_of_non_NA_pairs)
			else:
				matchFraction = -1
			writer.writerow([sample_id, no_of_matches, no_of_non_NA_pairs, matchFraction])
		# close explicitly rather than relying on "del" to flush the output
		writer.close()
		sys.stderr.write("%s samples.\n"%(no_of_samples_to_compare))