def run(self):
        """
		2013.07.24
		"""

        if self.debug:
            import pdb
            pdb.set_trace()

        #inf = utils.openGzipFile(self.inputFname)
        reader = MatrixFile(inputFname=self.inputFname)
        reader.constructColName2IndexFromHeader()
        writer = MatrixFile(inputFname=self.outputFname,
                            openMode='w',
                            delimiter='\t')
        header = ["SNPID", "oldChromosome", "Chromosome", "Start", "Stop", "N"]
        writer.writeHeader(header)

        counter = 0
        for row in reader:
            new_row = self.processRow(row)
            writer.writerow(new_row)
            counter += 1
        sys.stderr.write("%s lines processed.\n" % (counter))

        del reader
        del writer
示例#2
0
	def getMendelErrorIndividualLocusData(self, mendelErrorFname=None, individualID2Index=None):
		"""Map each mendel-error locus (snp id, column 3) to the list of
		indices of the individuals whose genotype carried the error.

		Exits the process with code 3 when a KID id is absent from
		individualID2Index.
		2013.1.29
		"""
		sys.stderr.write("Getting data on loci involved in mendel-errors from %s ..."%(mendelErrorFname))
		locus_id2individual_index_ls = {}
		reader = MatrixFile(inputFname=mendelErrorFname)
		reader.constructColName2IndexFromHeader()
		callCount = 0
		for dataRow in reader:
			kidID = dataRow[reader.getColIndexGivenColHeader('KID')]
			if kidID not in individualID2Index:
				sys.stderr.write("Individual %s not in individualID2Index.\n"%(kidID))
				sys.exit(3)
			kidIndex = individualID2Index.get(kidID)
			locusID = dataRow[3]
			locus_id2individual_index_ls.setdefault(locusID, []).append(kidIndex)
			callCount += 1
		del reader
		sys.stderr.write(" %s calls of %s loci, involved in mendel errors.\n"%\
						(callCount, len(locus_id2individual_index_ls)))
		return locus_id2individual_index_ls
示例#3
0
    def setup(self, **keywords):
        """Initialize per-run state and open both output streams.

        State kept across rows:
          - targetChromosome2mapData: per target chromosome, interval-delta
            statistics (median / mean over the central 80% / stddev, with a
            zero stddev replaced by 1).
          - locusKey2mapData: per locus key (oldChromosome, oldStart,
            oldStop), its target coordinate plus left/right interval deltas
            (None = boundary; 10E10 = cross-chromosome), from which a map
            probability is later taken as the max over the two intervals.
        Counters are floats so later divisions stay exact.

        2013.11.24 -- run before anything else.
        """
        AbstractMatrixFileWalker.setup(self, **keywords)

        # float counters: used later as division denominators
        self.noOfTotalIntervals = 0.0
        self.noOfCrossChromosomeIntervals = 0.0

        self.targetChromosome2mapData = {}
        self.locusKey2mapData = {}
        self.previousLocusData = None

        # main output header; flag it written so the walker does not emit
        # a second header
        self.writer.writerow([
            'oldChromosome', 'oldStart', 'oldStop', 'oldStrand',
            'newChromosome', 'newStart', 'newStop', 'mapPvalue'])
        self.invariantPData.headerOutputted = True

        # secondary output: one interval-delta record per locus
        self.sideOutput = MatrixFile(self.locusIntervalDeltaOutputFname,
                                     openMode='w',
                                     delimiter='\t')
        self.sideOutput.writeHeader([
            'oldChromosome', 'oldStart', 'oldStop', 'oldStrand',
            'newChromosome', 'newStart', 'newStop', 'intervalDelta'])
	def appendInfo(self, inputFname=None, db_vervet=None, outputFname=None,\
				inversePCValue=True):
		"""Append per-individual metadata to a PC-value matrix.

		inputFname: matrix whose first column is a sample ID (read group)
			and whose remaining columns are PC values.
		db_vervet: db handle; parseAlignmentReadGroup() resolves a sample
			ID to its individual alignment.
		outputFname: tab-delimited output path.
		inversePCValue: if True, negate every PC value (axis flip).

		#2012.9.25 skip samples whose individual_alignment entry could not be parsed.
		2012.9.5
		"""
		sys.stderr.write("Appending info to %s ..."%(inputFname))
		reader = MatrixFile(inputFname)
		header = reader.next()
		newHeader = ['individualID']
		for i in xrange(1, len(header)):
			newHeader.append('PC%s'%(i))
		# the "|string" suffix marks string-typed columns for downstream tools
		newHeader.extend(['sex|string', 'country|string', 'site-id', 'site-name|string', 'latitude', 'longitude', 'ucla_id|string', \
						'tax_id|string',\
						'species|string', 'collectionYear', 'medianDepth'])
		writer = csv.writer(open(outputFname, 'w'), delimiter='\t')
		writer.writerow(newHeader)
		counter = 0
		for row in reader:
			row = row[:len(header)]	#don't take extra columns
			sampleID = row[0]
			individualAlignment = db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
			if individualAlignment is None:
				#2012.9.25
				#sampleID does not begin with an alignment ID (probably "ref"); skip it
				sys.stderr.write("Warning: sampleID %s is not parsable to get alignment out of it. Skip.\n"%(sampleID))
				continue
			individual = individualAlignment.individual_sequence.individual
			data_row = ['%s_%s'%(individual.code, individualAlignment.id)]
			
			floatValue_row = row[1:]
			if inversePCValue:
				# negate via numpy so the whole vector flips at once
				floatValue_row = map(float, floatValue_row)
				floatValue_row = numpy.array(floatValue_row)
				floatValue_row = -floatValue_row
			data_row.extend(list(floatValue_row))
			scientifcName = self.db_taxonomy.returnScientificNameGivenTaxID(individual.tax_id)
			if scientifcName is None:
				scientifcName = ""
			if individual.collection_date:
				collectionYear = individual.collection_date.year
			else:
				collectionYear = ''
			data_row.extend([individual.sex, individual.site.country.name, individual.site.id, individual.site.short_name, \
							individual.latitude, individual.longitude, individual.ucla_id, \
							individual.tax_id, scientifcName, collectionYear, individualAlignment.median_depth])
			writer.writerow(data_row)
			counter += 1
		del writer
		sys.stderr.write("%s rows outputted.\n"%(counter))
示例#5
0
	def run(self):
		"""Mask one sample's genotypes in a VCF based on read evidence.

		For every VCF record, fetch overlapping reads from the BAM and
		judge the locus by (a) fraction of good-mapping-quality reads and
		(b) depth within [median/fold, median*fold]; if either test fails,
		set this sample's genotype to missing ("./.").  A per-locus stat
		row is always written to missingStatFname.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)
		
		reader = VCFFile(inputFname=self.inputFname)
		
		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")
		
		# output VCF reuses the input's meta-info & header
		writer = VCFFile(outputFname=self.outputFname, openMode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()
		
		statWriter = MatrixFile(self.missingStatFname, openMode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)
		
		counter = 0
		real_counter = 0
		# acceptable depth window around the alignment's median depth
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold
		
		for vcfRecord in reader:
			locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
			alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)	#start and end in fetch() are 0-based.
			locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
												minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
			locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
			depth = locusLowMapQData.totalNoOfReads
			if depth>=minDepth and depth <=maxDepth:
				locusOutOfDepthIndicator = 0 	#good
			else:
				locusOutOfDepthIndicator = 1
			
			# non-zero when either the depth or the mapping-quality test failed
			locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
			data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
						1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
						locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
			statWriter.writerow(data_row)
			if locusLowQualityIndicator>0:
				real_counter += 1
				#modify the VCF record
				#get sample ID column, then set its genotype missing
				vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
			#2014.1.4 output VCF record
			writer.writeVCFRecord(vcfRecord)
			counter += 1
		reader.close()
		statWriter.close()
		writer.close()
		# NOTE(review): raises ZeroDivisionError if the input VCF has no records
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
												real_counter/float(counter)))
示例#6
0
    def run(self):
        """Split the SNP ID column ("chromosome_position") of the input
        into a two-column chromosome/position table."""
        if self.debug:
            import pdb
            pdb.set_trace()

        reader = MatrixFile(self.inputFname)
        # input has no header; the SNP ID lives in the 2nd column
        snpIDColumnIndex = 1
        writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
        writer.writerow(['chromosome', 'position'])

        totalCount = 0
        outputCount = 0
        for dataRow in reader:
            chromosome, position = dataRow[snpIDColumnIndex].split('_')
            writer.writerow([chromosome, position])
            outputCount += 1
            totalCount += 1

        del reader
        del writer
        sys.stderr.write("%s/%s lines outputted.\n" % (outputCount, totalCount))
	def outputOverlapSites(self, overlapping_sites_set=None, outputFname=None):
		"""Write the overlapping (chromosome, pos) tuples, sorted, as a
		tab-delimited 3-column table.  The third column is a constant 0 so
		the output mimics CalculateSNPMismatchRateOfTwoVCF.py:
			chromosome	position	0
		2011-12.9
		"""
		sys.stderr.write("Outputting overlap %s sites ..."%(len(overlapping_sites_set)))
		writer = MatrixFile(outputFname, openMode='w', delimiter='\t')
		writer.writerow(['chromosome', 'position', 'random'])
		sortedSiteList = sorted(overlapping_sites_set)
		for chromosome, pos in sortedSiteList:
			writer.writerow([chromosome, pos, 0])
		sys.stderr.write("%s sites.\n"%(len(sortedSiteList)))
示例#8
0
	def run(self):
		"""Pad every row of a TPED file with 0-genotypes so all individuals
		listed in the TFAM file are represented."""
		if self.debug:
			import pdb
			pdb.set_trace()

		reader = MatrixFile(inputFname=self.inputFname)	#a TPED file
		writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')

		tfamIndividualData = self.getIndividualID2IndexFromTFAMFile(tfamFname=self.tfamFname)
		totalNoOfIndividuals = len(tfamIndividualData.individualID2Index)

		lineCount = 0
		noOfExtraIndividuals = None
		for dataRow in reader:
			# columns 0-3 are chromosome, snp_id, genetic & physical distance;
			# the rest hold two genotype columns per individual
			noOfExtraIndividuals = totalNoOfIndividuals - len(dataRow[4:])/2
			writer.writerow(dataRow + [0]*2*noOfExtraIndividuals)
			lineCount += 1

		del reader
		del writer
		sys.stderr.write("%s rows (loci) and added %s extra individuals.\n"%(lineCount, noOfExtraIndividuals))
示例#9
0
	def run(self):
		"""Compute pairwise genotype concordance among genotype vectors
		sharing the same SNP position in the input.

		For each (chromosome, position) key with more than one genotype
		vector, every unordered vector pair is compared call-by-call
		(skipping 'NA' calls) and a row [chromosome, position,
		noOfMatches, noOfTotal, concordance] is written; concordance is
		-1 when no comparable call pair exists.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		
		
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)
		
		snp_pos2genotypeVectorLs =self.readInSNPID2GenotypeVectorLs(self.inputFname).snp_pos2returnData
		
		
		
		writer = MatrixFile(self.outputFname, openMode='w', delimiter='\t')
		header = ['chromosome', 'position', 'noOfMatches', 'noOfTotal', 'concordance']
		writer.writeHeader(header)
		
		
		counter = 0
		real_counter = 0
		no_of_pairs = 0
		snp_pos_ls = snp_pos2genotypeVectorLs.keys()
		snp_pos_ls.sort()
		for i in xrange(len(snp_pos_ls)):
			counter += 1
			key = snp_pos_ls[i]
			chromosome, position = snp_pos_ls[i][:2]
			genotypeVectorLs = snp_pos2genotypeVectorLs.get(key)
			if len(genotypeVectorLs)>1:
				real_counter += 1
				# all unordered pairs of genotype vectors at this position
				for k in xrange(0, len(genotypeVectorLs)-1):
					for l in xrange(k+1, len(genotypeVectorLs)):
						no_of_pairs +=1
						noOfMatches = 0
						noOfTotal = 0
						genotypeVector0 = genotypeVectorLs[k]
						genotypeVector1 = genotypeVectorLs[l]
						for j in xrange(len(genotypeVector0)):
							call1 = genotypeVector0[j]['GT']
							call2 = genotypeVector1[j]['GT']
							if call1!='NA' and call2!='NA':
								noOfTotal += 1
								# compare via numeric encoding of the calls
								if SNP.nt2number[call1]==SNP.nt2number[call2]:
									noOfMatches += 1
						if noOfTotal>0:
							concordance = float(noOfMatches)/float(noOfTotal)
						else:
							concordance = -1	#no comparable (non-NA) call pair
						data_row = [chromosome, position,noOfMatches, noOfTotal, concordance ]
						writer.writerow(data_row)
		writer.close()
		# NOTE(review): raises ZeroDivisionError when the input has no loci
		sys.stderr.write("%s (out of %s, %s) snps have >1 same-position entries. %s pairs.\n"%(real_counter, counter, \
												real_counter/float(counter), no_of_pairs))
	def setup(self, **keywords):
		"""Delegate to the parent walker, then build
		invariantPData.individualCode2readGroup from the read-group file.
		Returns 1 on completion."""
		AbstractMatrixFileWalker.setup(self, **keywords)
		
		self.invariantPData.individualCode2readGroup = {}
		reader = MatrixFile(inputFname=self.readGroupFname)
		reader.constructColName2IndexFromHeader()
		if self.readGroupHeader:
			readGroupColumnIndex = reader.getColIndexGivenColHeader(self.readGroupHeader)
		else:
			readGroupColumnIndex = 0	#default: first column
		for dataRow in reader:
			readGroup = dataRow[readGroupColumnIndex]
			alignment = self.db_vervet.parseAlignmentReadGroup(readGroup).individualAlignment
			if alignment:
				individualCode = alignment.individual_sequence.individual.code
				self.invariantPData.individualCode2readGroup[individualCode] = readGroup
		del reader
		return 1
	def getSampleID2IndividualData_UNGC(self, inputFname=None):
		"""Parse a UNGC (UCLA Neuroscience Genomics Core) sample sheet.

		Expected comma-delimited columns include "sample ID", "sample name"
		and "Index" (the library index), e.g.:
			FCID	Lane	sample ID	sample code	sample name	Index	Description	SampleProject
			D1HYNACXX	2	UNGC Human Sample 1	S1	AS001A	ATTACTCG	TruSeq DNA PCR Free beta kit	2013-029A

		Returns {sampleID (spaces -> '_'): PassingData(sampleName=...,
		libraryIndexList=[one entry per sheet row])}.
		Raises ValueError when one sampleID maps to two different sample names.
		2013.04.04
		"""
		sys.stderr.write("Getting  sampleID2IndividualData from %s ..."%(inputFname))
		sampleID2IndividualData = {}
		
		reader = MatrixFile(inputFname, openMode='r', delimiter=',')
		reader.constructColName2IndexFromHeader()
		sampleIDIndex = reader.getColIndexGivenColHeader("sample ID")
		sampleNameIndex = reader.getColIndexGivenColHeader("sample name")
		libraryIndexIndex = reader.getColIndexGivenColHeader("Index")
		
		for row in reader:
			sampleID = row[sampleIDIndex].replace(' ', '_')	#2013.04.04 stupid quirks
			sampleName = row[sampleNameIndex]
			libraryIndex = row[libraryIndexIndex]
			if sampleID not in sampleID2IndividualData:
				sampleID2IndividualData[sampleID] = PassingData(sampleName=sampleName, libraryIndexList=[])
			if sampleName!=sampleID2IndividualData[sampleID].sampleName:
				errorMessage = "sampleID %s is associated with two different sample names (%s, %s).\n"%\
								(sampleID, sampleName, sampleID2IndividualData[sampleID].sampleName)
				sys.stderr.write("Error: " + errorMessage)
				# BUGFIX: was a bare `raise` with no active exception, which
				# itself errors out (no exception to re-raise); raise an
				# explicit, catchable exception instead.
				raise ValueError(errorMessage)
			sampleID2IndividualData[sampleID].libraryIndexList.append(libraryIndex)
		
		sys.stderr.write("%s entries.\n"%(len(sampleID2IndividualData)))
		return sampleID2IndividualData
示例#12
0
    def getMonkeyID2Coverage(self, inputFname):
        """Read a ranked-monkey table and return {UCLAID: target coverage},
        where target coverage is max(pre-set-coverage, future coverage).

        An empty pre-set-coverage counts as 0; future coverage is read only
        when its column is present in the row.
        2012.9.4 copied from vervet/src/misc.py
        2012.2.10 input comes from SequencingStrategy.assignVRCSequencePriorityBasedOnPedigree()
        """
        sys.stderr.write("Reading the list of ranked monkeys from %s ..." % (inputFname))
        reader = MatrixFile(inputFname)
        reader.constructColName2IndexFromHeader()

        monkeyIDIndex = reader.getColIndexGivenColHeader("UCLAID")
        preSetCoverageIndex = reader.getColIndexGivenColHeader("pre-set-coverage")
        futureCoverageIndex = reader.getColIndexGivenColHeader("future coverage")
        monkeyID2coverage = {}
        for dataRow in reader:
            preSetCoverage = dataRow[preSetCoverageIndex]
            preSetCoverage = float(preSetCoverage) if preSetCoverage else 0
            if len(dataRow) > futureCoverageIndex:
                futureCoverage = float(dataRow[futureCoverageIndex])
            else:
                futureCoverage = 0
            monkeyID2coverage[dataRow[monkeyIDIndex]] = max(futureCoverage, preSetCoverage)
        del reader
        sys.stderr.write(" %s monkeys are to-be-sequenced.\n" % (len(monkeyID2coverage)))
        return monkeyID2coverage

        """
示例#13
0
	def outputGenotypeMarkedMissingStat(self, outputFname=None, \
									individual_index2no_of_genotype_marked_missing=None,\
									individualIDList=None):
		"""Write, per individual, how many of its genotypes were marked
		missing.  A no-op unless both outputFname and the count dict are
		provided.  2013.07.24
		"""
		if outputFname and individual_index2no_of_genotype_marked_missing is not None:
			writer = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
			writer.writeHeader(["individualID", "noOfGenotypesMarkedMissing"])
			for individualIndex, missingCount in individual_index2no_of_genotype_marked_missing.iteritems():
				writer.writerow([individualIDList[individualIndex], missingCount])
			writer.close()
	def constructPedigreeGraphFromPOEdgeFile(self, inputFname=None):
		"""Build a parent->child DiGraph from the output of
		vervet/src/pedigree/DiscoverParentOffspringFromPlinkIBD.py;
		edge weight is the distToPOVector column.

		Returns PassingData(DG=..., childNodeSet=...).
		2012.8.23
		"""
		sys.stderr.write("Constructing pedigree-graph out of %s ..."%(inputFname))
		DG = nx.DiGraph()
		childNodeSet = set()
		reader = MatrixFile(inputFname)
		reader.constructColName2IndexFromHeader()
		
		parentIDIndex = reader.getColIndexGivenColHeader("parentID")
		childIDIndex = reader.getColIndexGivenColHeader("childID")
		distToPOVectorIndex = reader.getColIndexGivenColHeader("distToPOVector")
		edgeCount = 0
		for dataRow in reader:
			childID = dataRow[childIDIndex]
			parentID = dataRow[parentIDIndex]
			childNodeSet.add(childID)
			DG.add_edge(parentID, childID, weight=float(dataRow[distToPOVectorIndex]))
			edgeCount += 1
		del reader
		sys.stderr.write("%s children, %s nodes. %s edges. %s connected components.\n"%(\
						len(childNodeSet), DG.number_of_nodes(), DG.number_of_edges(), \
						nx.number_connected_components(DG.to_undirected())))
		return PassingData(DG=DG, childNodeSet=childNodeSet)
示例#15
0
    def getLocusID2MissingFraction(self, inputFname=None):
        """Read per-locus missing statistics and return
        {(chromosome, start, start): lowest stat seen for that locus}.

        The locusID column is expected to look like "chromosome_start...";
        the statistic comes from the "occurrence_byFixedValue" column.
        2014.01.08
        """
        sys.stderr.write("Reading in the missing statistics from %s ... " %
                         (inputFname))
        locusID2Stat = {}

        reader = MatrixFile(inputFname=inputFname)
        reader.constructColName2IndexFromHeader()
        locusIDIndex = reader.getColIndexGivenColHeader("locusID")
        statIndex = reader.getColIndexGivenColHeader("occurrence_byFixedValue")
        rowCount = 0
        for dataRow in reader:
            chromosome, start = dataRow[locusIDIndex].split('_')[:2]
            key = (chromosome, int(start), int(start))
            stat = float(dataRow[statIndex])
            # keep the lowest statistic per locus
            previousStat = locusID2Stat.get(key)
            if previousStat is None or stat < previousStat:
                locusID2Stat[key] = stat
            rowCount += 1
        del reader
        sys.stderr.write(
            " %s unique loci with missing fraction out of %s total loci.\n" %
            (len(locusID2Stat), rowCount))
        return locusID2Stat
	def outputAlignmentDepthAndOthersForFilter(self, db_vervet=None, inputFname=None, \
						ref_ind_seq_id=None, depthFoldChange=2, minGQ=30, \
						outputFname=None, outputFileFormat=1):
		"""Write per-alignment coverage data used to filter VCFs.

		Alignments are taken from the VCF (inputFname) when given,
		otherwise fetched from the db; either way they then pass through
		db_vervet.filterAlignments().
		outputFileFormat==1 rows: alignmentID, medianDepth, individualID.
		Otherwise: alignmentID, minDepth, maxDepth, minGQ, where min/max
		depth are medianDepth divided/multiplied by depthFoldChange.
		2012.6.12 added argument db_vervet, moved from FilterVCFPipeline.py
		2011-9-2
		"""
		sys.stderr.write("Outputting alignment (from %s) coverage to %s ..."%(inputFname, outputFname))
		if inputFname:
			alignmentLs = db_vervet.getAlignmentsFromVCFFile(inputFname=inputFname)
		else:
			alignmentLs = db_vervet.getAlignments(ref_ind_seq_id=self.ref_ind_seq_id, \
										alignment_method_id=self.alignment_method_id, data_dir=self.data_dir,\
										local_realigned=self.local_realigned, outdated_index=self.alignment_outdated_index,\
										completedAlignment=self.completedAlignment, \
										reduce_reads=self.reduce_reads)
			"""
			TableClass = VervetDB.IndividualAlignment
			query = TableClass.query.filter(TableClass.median_depth!=None)
			if ref_ind_seq_id:
				query = query.filter(TableClass.ref_ind_seq_id==ref_ind_seq_id)
			alignmentLs = query.order_by(TableClass.id)
			"""
			
		alignmentLs = db_vervet.filterAlignments(data_dir=self.data_dir, alignmentLs=alignmentLs, sequence_filtered=self.sequence_filtered, \
						mask_genotype_method_id=None, parent_individual_alignment_id=None,\
						excludeContaminant=self.excludeContaminant,local_realigned=self.local_realigned,\
						reduce_reads=self.reduce_reads,\
						completedAlignment=self.completedAlignment,\
						alignment_method_id=self.alignment_method_id, \
						outdated_index=self.alignment_outdated_index)
		# NOTE: this project's MatrixFile takes the output path via inputFname
		writer = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
		if outputFileFormat==1:
			header = ['alignmentID', 'medianDepth', "individualID"]
		else:
			header = ['alignmentID', 'minDepth', 'maxDepth', 'minGQ']
		writer.writeHeader(header)
		
		counter = 0
		for row in alignmentLs:
			read_group = row.read_group
			if outputFileFormat==1:
				data_row = [read_group, row.median_depth, row.individual_sequence.individual.id]
			else:
				minDepth = row.median_depth/float(depthFoldChange)
				if abs(minDepth-0)<=0.001:	#if it's too close to 0, assign 0.
					minDepth = 0
				data_row = [read_group, minDepth, row.median_depth*float(depthFoldChange), minGQ]
			writer.writerow(data_row)
			counter += 1
		writer.close()
		sys.stderr.write("%s entries fetched.\n"%(counter))
	def readInDataToPlot(self, input_fname, sampling_probability=1.0):
		"""Load a matrix file into the GUI widgets and plot it.

		The first two columns are kept as editable strings; every
		remaining column is converted to float (conversion failures are
		reported and that part is left unconverted).

		sampling_probability: probability of keeping each data row;
			values outside [0, 1] are reset to 1.0 (keep everything).
		2015.01.23 added argument sampling_probability to sub-sample data
		2013.07.11 use MatrixFile to read in the file
		2009-5-20 add the column index into the column header for easy picking
		2009-3-13 wrap the float conversion part into try...except to report what goes wrong
		"""
		if sampling_probability>1 or sampling_probability<0:
			sampling_probability=1.0
		reader = MatrixFile(inputFname=input_fname)
		self.column_header=reader.next()
		# prefix each header with its column index for easy picking
		for i in range(len(self.column_header)):
			self.column_header[i] = '%s %s'%(i, self.column_header[i])
		no_of_cols = len(self.column_header)
		self.column_types = [str]*2 + [float]*(no_of_cols-2)
		self.column_editable_flag_ls = [True, True] + [False]*(no_of_cols-2)
		self.list_2d = []
		for row in reader:
			if sampling_probability>0 and sampling_probability<1:
				if random.random()>sampling_probability:	#skip
					continue
			float_part = row[2:]
			try:
				float_part = map(float, float_part)
			except:
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				traceback.print_exc()
			new_row = row[:2]+float_part
			self.list_2d.append(new_row)
		reader.close()
		self.setupColumns(self.treeview_matrix)
		#update status to reflect the input filename
		self.app1.set_title(os.path.basename(input_fname))
		self.app1_appbar1.push(input_fname)
		self.plotXY(self.ax, self.canvas, self.liststore, self.plot_title)
    def run(self):
        """Turn per-region switch-point stats into a cumulative table:
        for each maxSwitchFrequency threshold, the genome span and locus
        count retained (plus their fractions of the totals) after dropping
        every region whose switch frequency is at or above it.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        switchPointData = self.readInStats(inputFname=self.inputFname)

        sys.stderr.write("Processing data ...")
        writer = MatrixFile(self.outputFname, openMode='w')
        writer.writeHeader([
            "maxSwitchFrequency", "genomeCovered", 'genomeCoveredFraction',
            "noOfLoci", 'noOfLociFraction'
        ])

        data_matrix = switchPointData.data_matrix
        totalSpan = switchPointData.totalSpan
        totalNoOfLoci = switchPointData.totalNoOfLoci

        # highest switch frequency first; each region is then peeled off
        # the running totals one at a time
        data_matrix.sort(reverse=True)
        cumulativeRegionSpan = totalSpan
        cumulativeNoOfLoci = totalNoOfLoci
        for switchFrequency, regionSpan, noOfLoci in data_matrix:
            cumulativeRegionSpan -= regionSpan
            cumulativeNoOfLoci -= noOfLoci
            writer.writerow([
                switchFrequency, cumulativeRegionSpan,
                cumulativeRegionSpan/float(totalSpan),
                cumulativeNoOfLoci,
                cumulativeNoOfLoci/float(totalNoOfLoci)])
        writer.close()
        sys.stderr.write(".\n")
	def constructPedigreeGraphFromPlinkIBD(self, inputFname=None, maxDistanceToPOVector=0.04, drawDistribution=False, outputFnamePrefix=None):
		"""Build a parent->child DiGraph from plink IBD output.

		A pair is called parent-offspring when its (Z0, Z1, Z2) vector is
		within maxDistanceToPOVector (Euclidean) of the ideal PO vector
		(0, 1, 0); edge weight is that distance.  The monkey with the
		larger integer ID is taken as the child -- NOTE(review): this
		assumes IDs are comparable by age; confirm for the ID scheme.
		Optionally draws a histogram of all distances to
		<outputFnamePrefix>_IBDVector2POVectorDist_hist.png.

		Returns PassingData(DG=..., childNodeSet=...).
		2012.8.14
		"""
		sys.stderr.write("Constructing pedigree-graph out of plink-ibd %s ..."%(inputFname))
		DG=nx.DiGraph()
		childNodeSet = set()
		reader = MatrixFile(inputFname)
		reader.constructColName2IndexFromHeader()
		
		monkey1IDIndex = reader.getColIndexGivenColHeader("IID1")
		monkey2IDIndex = reader.getColIndexGivenColHeader("IID2")
		Z0Index = reader.getColIndexGivenColHeader("Z0")
		Z1Index = reader.getColIndexGivenColHeader("Z1")
		Z2Index = reader.getColIndexGivenColHeader("Z2")
		
		# ideal IBD-sharing vector for a parent-offspring pair
		poVector = numpy.array([0,1,0.0])
		counter = 0
		real_counter = 0
		
		data_ls = []
		for row in reader:
			monkey1ID = int(row[monkey1IDIndex])	#turn it into integer so could compare age
			monkey2ID = int(row[monkey2IDIndex])
			Z0 = float(row[Z0Index])
			Z1 = float(row[Z1Index])
			Z2 = float(row[Z2Index])
			ZVector = numpy.array([Z0, Z1, Z2])
			dist = numpy.linalg.norm(poVector-ZVector)
			if drawDistribution and outputFnamePrefix:
				data_ls.append(dist)
			if dist<=maxDistanceToPOVector:
				# larger ID becomes the child
				if monkey1ID>monkey2ID:
					childID = monkey1ID
					parentID = monkey2ID
				else:
					childID = monkey2ID
					parentID = monkey1ID
				DG.add_edge(parentID, childID, weight=dist)
				childNodeSet.add(childID)
				real_counter += 1
			counter += 1
		del reader
		sys.stderr.write("%s out of %s lines become PO pairs. %s children, %s nodes. %s edges. %s connected components.\n"%(\
							real_counter, counter, len(childNodeSet), DG.number_of_nodes(), DG.number_of_edges(), \
							nx.number_connected_components(DG.to_undirected())))
		if drawDistribution and outputFnamePrefix:
			outputFname = '%s_IBDVector2POVectorDist_hist.png'%(outputFnamePrefix)
			yh_matplotlib.drawHist(data_ls, title='', \
								xlabel_1D="dist(ZVector,POVector)", xticks=None, \
								outputFname=outputFname, min_no_of_data_points=10, \
								needLog=True, \
								dpi=200, min_no_of_bins=25)
		return PassingData(DG=DG, childNodeSet=childNodeSet)
示例#20
0
	def run(self):
		"""Transform a plink matrix row-by-row according to self.run_type:
			2: change chromosome ID to X (processRow_ChangeChromosomeIDToX)
			3: add position start base (processRow_addPositionStartBase)
			4: mark genotypes involved in mendel errors missing, then
			   write per-individual marked-missing counts
			other: generic processRow()
		2013.07.24
		"""
		
		if self.debug:
			import pdb
			pdb.set_trace()
		
		#inf = utils.openGzipFile(self.inputFname)
		reader = MatrixFile(inputFname=self.inputFname)
		writer = csv.writer(open(self.outputFname, 'w'), delimiter='\t')
		counter = 0
		if self.run_type==4:	#2013.2.1
			# run_type 4 needs the TFAM individual order and the
			# mendel-error locus->individual map up front
			tfamIndividualData = self.getIndividualID2IndexFromTFAMFile(tfamFname=self.tfamFname)
			individualID2Index = tfamIndividualData.individualID2Index
			individualIDList = tfamIndividualData.individualIDList
			locus_id2individual_index_ls = self.getMendelErrorIndividualLocusData(mendelErrorFname=self.mendelErrorFname, \
												individualID2Index=individualID2Index)
			individual_index2no_of_genotype_marked_missing = {}
		else:
			individualID2Index = None
			individualIDList = None
			locus_id2individual_index_ls = None
			individual_index2no_of_genotype_marked_missing = None
		for row in reader:
			if self.run_type==2:
				new_row = self.processRow_ChangeChromosomeIDToX(row)
			elif self.run_type==3:
				new_row = self.processRow_addPositionStartBase(row)
			elif self.run_type==4:
				new_row = self.markGenotypeMissingIfInvolvedInMendelError(row=row, \
											locus_id2individual_index_ls=locus_id2individual_index_ls,\
											individual_index2no_of_genotype_marked_missing=individual_index2no_of_genotype_marked_missing)
				
			else:
				new_row = self.processRow(row)
			writer.writerow(new_row)
			counter += 1
		sys.stderr.write("%s lines modified.\n"%(counter))
		
		del reader
		del writer
		# no-op unless run_type 4 populated the count dict
		self.outputGenotypeMarkedMissingStat(outputFname=self.markMissingStatFname, \
								individual_index2no_of_genotype_marked_missing=individual_index2no_of_genotype_marked_missing, \
								individualIDList=individualIDList)
示例#21
0
	def getIndividualID2IndexFromTFAMFile(self, tfamFname=None):
		"""Read a plink TFAM file (individual ID in column 2) and return
		PassingData(individualID2Index=..., individualIDList=...), where
		the index is the 0-based order of first appearance.
		2013.07.24 return individualIDList as well
		2013.1.29
		"""
		sys.stderr.write("Getting individualID2Index from tfam file %s ..."%(tfamFname))
		individualID2Index = {}
		individualIDList = []
		reader = MatrixFile(inputFname=tfamFname)
		lineCount = 0
		for dataRow in reader:
			individualID = dataRow[1]
			individualID2Index[individualID] = len(individualID2Index)
			individualIDList.append(individualID)
			lineCount += 1
		del reader
		sys.stderr.write(" %s individuals.\n"%(len(individualID2Index)))
		return PassingData(individualID2Index=individualID2Index, individualIDList=individualIDList)
示例#22
0
    def getLocusNewID2mapPvalue(self, liftOverLocusMapPvalueFname=None):
        """Read a liftOver locus-map file (columns oldChromosome, oldStart,
        oldStop, oldStrand, newChromosome, newStart, newStop, mapPvalue)
        and return {(newChromosome, newStart, newStop): lowest mapPvalue}.
        2014.01.04
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." %
                         (liftOverLocusMapPvalueFname))
        locusNewID2mapPvalue = {}
        reader = MatrixFile(inputFname=liftOverLocusMapPvalueFname)
        reader.constructColName2IndexFromHeader()
        strandIndex = reader.getColIndexGivenColHeader("oldStrand")
        newChromosomeIndex = reader.getColIndexGivenColHeader("newChromosome")
        newStartIndex = reader.getColIndexGivenColHeader("newStart")
        newStopIndex = reader.getColIndexGivenColHeader("newStop")
        mapPvalueIndex = reader.getColIndexGivenColHeader("mapPvalue")
        rowCount = 0
        for dataRow in reader:
            strand = dataRow[strandIndex]
            key = (dataRow[newChromosomeIndex], int(dataRow[newStartIndex]),
                   int(dataRow[newStopIndex]))
            mapPvalue = float(dataRow[mapPvalueIndex])
            # keep the lowest p-value per new coordinate
            previousPvalue = locusNewID2mapPvalue.get(key)
            if previousPvalue is None or mapPvalue < previousPvalue:
                locusNewID2mapPvalue[key] = mapPvalue
            rowCount += 1
        del reader
        sys.stderr.write(
            "%s unique loci with map p-value out of %s total loci.\n" %
            (len(locusNewID2mapPvalue), rowCount))
        return locusNewID2mapPvalue
示例#23
0
    def getMonkeyPair2IBDVector(self, inputFname=None):
        """Read plink IBD output and return, per unordered monkey pair,
        PassingData(IBD=PI_HAT, IBDVector=[Z0,Z1,Z2] as "%.2f" strings,
        IBDVectorStr=comma-joined vector), plus the set of monkey IDs seen.

        A duplicate pair overwrites the earlier entry with a warning.
        2012.9.10 return monkeyIDSet as well
        2012.9.6
        """
        sys.stderr.write("Getting monkey pair 2 IBD vector from %s  ..." % (inputFname))
        reader = MatrixFile(inputFname)
        reader.constructColName2IndexFromHeader()
        monkey1IDIndex = reader.getColIndexGivenColHeader("IID1")
        monkey2IDIndex = reader.getColIndexGivenColHeader("IID2")
        IBDIndex = reader.getColIndexGivenColHeader("PI_HAT")
        Z0Index = reader.getColIndexGivenColHeader("Z0")
        Z1Index = reader.getColIndexGivenColHeader("Z1")
        Z2Index = reader.getColIndexGivenColHeader("Z2")
        monkeyPair2IBDVector = {}
        monkeyIDSet = set()
        pairCount = 0
        for dataRow in reader:
            monkey1ID = dataRow[monkey1IDIndex]
            monkey2ID = dataRow[monkey2IDIndex]
            pairKey = tuple(sorted([monkey1ID, monkey2ID]))
            IBDVector = ["%.2f" % (float(dataRow[columnIndex]))
                         for columnIndex in (Z0Index, Z1Index, Z2Index)]
            data = PassingData(IBD=float(dataRow[IBDIndex]), IBDVector=IBDVector,
                               IBDVectorStr=",".join(IBDVector))
            if pairKey in monkeyPair2IBDVector:
                sys.stderr.write(
                    "WARNING: key %s has value %s in monkeyPair2IBDVector already. value overwritten with %s.\n"
                    % (repr(pairKey), monkeyPair2IBDVector.get(pairKey), data)
                )
            monkeyPair2IBDVector[pairKey] = data
            monkeyIDSet.add(monkey1ID)
            monkeyIDSet.add(monkey2ID)
            pairCount += 1
        sys.stderr.write(
            " %s pairs of IBD vectors for %s unique monkeys.\n" % (len(monkeyPair2IBDVector), len(monkeyIDSet))
        )
        return PassingData(monkeyPair2IBDVector=monkeyPair2IBDVector, monkeyIDSet=monkeyIDSet)
    def readInStats(self, inputFname=None):
        """Read per-region switch-point statistics and return
        PassingData(data_matrix=[[switchFrequency, regionSpan, noOfLoci],
        ...], totalSpan=..., totalNoOfLoci=...).  Rows with any empty
        field are skipped.  2013.07.15
        """
        sys.stderr.write("Reading stats from %s ..." % (inputFname))

        data_matrix = []

        reader = MatrixFile(inputFname)
        reader.constructColName2IndexFromHeader()
        switchFrequencyIndex = reader.getColIndexGivenColHeader(
            "noOfSwitchPoints_by_noOfLociWithUniqueHit")
        regionSpanIndex = reader.getColIndexGivenColHeader("regionSpan")
        noOfLociIndex = reader.getColIndexGivenColHeader("#sitesInInput2")

        totalSpan = 0
        totalNoOfLoci = 0
        rowCount = 0
        for dataRow in reader:
            rowCount += 1
            switchFrequency = dataRow[switchFrequencyIndex]
            regionSpan = dataRow[regionSpanIndex]
            noOfLoci = dataRow[noOfLociIndex]
            if not (switchFrequency and regionSpan and noOfLoci):
                continue  # skip rows with an empty field
            regionSpan = int(float(regionSpan))
            noOfLoci = int(float(noOfLoci))
            data_matrix.append([float(switchFrequency), regionSpan, noOfLoci])
            totalSpan += regionSpan
            totalNoOfLoci += noOfLoci
        reader.close()
        sys.stderr.write(" %s valid entries (from %s rows) with totalSpan=%s, totalNoOfLoci=%s.\n"%\
            (len(data_matrix), rowCount, totalSpan, totalNoOfLoci))
        return PassingData(data_matrix=data_matrix,
                           totalSpan=totalSpan,
                           totalNoOfLoci=totalNoOfLoci)
示例#25
0
class ComputeLiftOverLocusProbability(parentClass):
    # expose the module-level usage text as this class' docstring
    __doc__ = __doc__
    option_default_dict = parentClass.option_default_dict.copy()
    option_default_dict.update({
         ('locusIntervalDeltaOutputFname', 1, ): ['', '', 1, 'file that would contain delta of intervals from old and new coordinate system. \
	Used to check if normal distribution on each chromosome. Output format: oldChromosome, oldStart, oldStop, newChromosome, newStart, newStop, intervalDelta.', ],\
         ('startPosition', 0, int):[None, '', 1, 'probability for loci whose start positions are bigger than this argument would be computed.\
	Model parameters are estimated using all input data. This argument is used to avoid edge/boundary effect.'],\
         ('stopPosition', 0, int):[None, '', 1, 'probability for loci whose stop positions are less than this argument would be computed.\
	Model parameters are estimated using all input data. This argument is used to avoid edge/boundary effect.'],\

         })

    def __init__(self, inputFnameLs=None, **keywords):
        """Delegate all construction to the parent walker class."""
        parentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def setup(self, **keywords):
        """
		noOfTotalIntervals = 0
		noOfCrossChromosomeIntervals = 0
		
		targetChromosome 2 mapData
			intervalDeltaList	=> median
			orientation  (queryStrand)
				0=forward
				1=backward
			mean	=> using 80% of data (sort the delta list, then take 10% to 90% of the list)
			stddev	=> if stddev is zero, use 1.
		
		locusKey (oldChromosome, oldStart, oldStop) 2 mapData
			targetCoordinate (newChromosome, newStart, newStop).
			leftIntervalDelta: None = boundary
			rightIntervalDelta: None = boundary, 10E10 = cross chromosome
			
			probability: max( P(SNP_i_left_interval), P(SNP_i_right_interval)).
				P(interval):
					If one interval is on the same chromosome,  P(target-chromosome)*P(interval delta size)
					If not, P(chromosome-cross event). 
			
		Not implemented: for a whole genome input (rather than a window),
			an RBTree of windows should be used to counter regional effect.
		
		2013.11.24
			run before anything is run
		"""
        AbstractMatrixFileWalker.setup(self, **keywords)

        # counters kept as floats so later divisions yield fractions
        self.noOfTotalIntervals = 0.0
        self.noOfCrossChromosomeIntervals = 0.0  #make it float for division

        self.targetChromosome2mapData = {}
        self.locusKey2mapData = {}
        self.previousLocusData = None

        #write header for the main output
        header = [
            'oldChromosome', 'oldStart', 'oldStop', 'oldStrand',
            'newChromosome', 'newStart', 'newStop', 'mapPvalue'
        ]
        self.writer.writerow(header)
        self.invariantPData.headerOutputted = True  #avoid double header output

        #open the other writer (per-interval delta output) and write header
        self.sideOutput = MatrixFile(self.locusIntervalDeltaOutputFname,
                                     openMode='w',
                                     delimiter='\t')
        header = [
            'oldChromosome', 'oldStart', 'oldStop', 'oldStrand',
            'newChromosome', 'newStart', 'newStop', 'intervalDelta'
        ]
        self.sideOutput.writeHeader(header)

    def processRow(self, row=None, pdata=None):
        """Record one lifted-over locus and the interval delta to the
		previous locus.

		Populates self.locusKey2mapData (left/right interval deltas per
		locus) and, for same-chromosome intervals, appends the delta to
		self.targetChromosome2mapData.  Cross-chromosome intervals are
		flagged with the sentinel delta 10E10.

		2012.10.7
		"""
        returnValue = 1
        self.col_name2index = getattr(pdata, 'col_name2index', None)
        queryStrandIndex = self.col_name2index.get("queryStrand")

        queryChromosomeIndex = self.col_name2index.get("queryChromosome")
        queryStartIndex = self.col_name2index.get("queryStart")
        queryStopIndex = self.col_name2index.get("queryStop")

        newChrIndex = self.col_name2index.get("newChr")
        newRefStartIndex = self.col_name2index.get("newRefStart")
        newRefStopIndex = self.col_name2index.get("newRefStop")

        queryStrand = row[queryStrandIndex]
        queryChromosome = row[queryChromosomeIndex]
        queryStart = int(row[queryStartIndex])
        queryStop = int(row[queryStopIndex])

        newChr = row[newChrIndex]
        newRefStart = int(row[newRefStartIndex])
        newRefStop = int(row[newRefStopIndex])

        #create current locus data
        locusKey = (queryChromosome, queryStart, queryStop)
        currentLocusData = PassingData(locusKey=locusKey, queryStrand=queryStrand, queryChromosome=queryChromosome,\
           queryStart=queryStart, queryStop=queryStop, \
           newChr=newChr, newRefStart=newRefStart, newRefStop=newRefStop)

        #insert entry into locusKey2mapData
        self.locusKey2mapData[locusKey] = PassingData(locusData = currentLocusData, leftIntervalDelta=None,\
                rightIntervalDelta=None, mapProbability=None)
        if self.previousLocusData is not None:
            #calculate interval delta
            if self.previousLocusData.newChr != currentLocusData.newChr:
                # sentinel marking a cross-chromosome interval
                intervalDelta = 10E10
                self.noOfCrossChromosomeIntervals += 1
            else:
                # NOTE(review): both spans use only the CURRENT locus
                # (start - stop of the same record), not the gap to the
                # previous locus — confirm this is intended.
                querySpan = currentLocusData.queryStart - currentLocusData.queryStop
                targetSpan = currentLocusData.newRefStart - currentLocusData.newRefStop
                if queryStrand == '+':
                    intervalDelta = targetSpan - querySpan
                else:
                    intervalDelta = targetSpan + querySpan
                # insert it into self.targetChromosome2mapData
                if currentLocusData.newChr not in self.targetChromosome2mapData:
                    self.targetChromosome2mapData[currentLocusData.newChr] = PassingData(intervalDeltaList=[],\
                                  orientation=queryStrand,\
                                  mean=None,\
                                  stddev=None,\
                                  probability=None)
                self.targetChromosome2mapData[
                    currentLocusData.newChr].intervalDeltaList.append(
                        intervalDelta)

            #output to the side
            self.sideOutput.writerow([currentLocusData.queryChromosome,\
              currentLocusData.queryStart, currentLocusData.queryStop, currentLocusData.queryStrand, \
              currentLocusData.newChr, currentLocusData.newRefStart, currentLocusData.newRefStop, intervalDelta])

            #assign it as right interval delta of previous locus
            self.locusKey2mapData[self.previousLocusData.
                                  locusKey].rightIntervalDelta = intervalDelta

            # assign it as left interval delta of current locus.
            self.locusKey2mapData[locusKey].leftIntervalDelta = intervalDelta

            self.noOfTotalIntervals += 1

        self.previousLocusData = currentLocusData
        return returnValue

    def calculateLocusMapProbabilityGivenIntervalDelta(
            self,
            intervalDelta=None,
            targetChromosomeMapData=None,
            crossChromosomeProbability=None):
        """Return the map probability contributed by one interval delta.

		Cross-chromosome intervals (sentinel 10E10) contribute
		crossChromosomeProbability; otherwise a two-sided normal-tail
		p-value of the delta (mean/stddev from targetChromosomeMapData)
		is multiplied by that chromosome's overall probability.

		2013.11.25
		"""
        mapProbability = 1
        if intervalDelta is not None:
            if intervalDelta == 10E10:
                mapProbability *= crossChromosomeProbability
            else:
                lessThanGivenValueProb = norm.cdf(
                    intervalDelta,
                    loc=targetChromosomeMapData.mean,
                    scale=targetChromosomeMapData.stddev)
                if intervalDelta > targetChromosomeMapData.mean:  #two-sided p-value
                    deltaProb = 2 * (1 - lessThanGivenValueProb)
                else:
                    deltaProb = 2 * lessThanGivenValueProb
                mapProbability *= targetChromosomeMapData.probability * deltaProb

        return mapProbability

    def reduce(self, **keywords):
        """Estimate per-chromosome delta distributions, then output a
		map p-value for every locus (max of its left/right interval
		probabilities).  With zero intervals (single SNP input), every
		locus gets a fixed 0.001.

		2012.10.15
			run after all files have been walked through
		"""
        counter = 0
        real_counter = 0

        # sorted() works on dict views in both Python 2 and 3
        # (.keys() returns an unsortable view under Python 3)
        locusKeyList = sorted(self.locusKey2mapData)

        sys.stderr.write("%s target chromosomes, %s cross-chromosome intervals, %s total intervals .\n "%\
            (len(self.targetChromosome2mapData), self.noOfCrossChromosomeIntervals, self.noOfTotalIntervals))

        if self.noOfTotalIntervals > 0:
            sys.stderr.write(
                "Running estimates for each target chromosome ... ")
            #estimates for each chromosome
            self.crossChromosomeProbability = float(
                self.noOfCrossChromosomeIntervals) / self.noOfTotalIntervals
            for targetChromosome in self.targetChromosome2mapData:
                mapData = self.targetChromosome2mapData.get(targetChromosome)
                #overall probability for an interval to be on this chromosome
                if len(mapData.intervalDeltaList
                       ) == 0:  #just one crossing event
                    mapData.probability = 1 / float(self.noOfTotalIntervals)
                else:
                    mapData.probability = len(
                        mapData.intervalDeltaList) / float(
                            self.noOfTotalIntervals)
                #estimate mean and stddev from the central 80% of the deltas
                mapData.intervalDeltaList.sort()
                startIndex = max(0, int(len(mapData.intervalDeltaList) * 0.1))
                stopIndex = max(
                    int(len(mapData.intervalDeltaList) * 0.9) + 1, 1)
                if startIndex >= stopIndex:
                    stopIndex = startIndex + 1
                robustDataList = mapData.intervalDeltaList[
                    startIndex:stopIndex]

                stddev = 1
                if len(robustDataList) > 0:
                    mapData.mean = numpy.mean(robustDataList)
                    if len(robustDataList) > 1:
                        stddev = numpy.std(robustDataList)
                else:
                    mapData.mean = 0
                if stddev == 0:
                    # avoid a zero scale in norm.cdf
                    stddev = 1
                mapData.stddev = stddev
            sys.stderr.write(".\n")

            #output
            sys.stderr.write("Output %s SNPs with map p-value ..." %
                             (len(locusKeyList)))
            for locusKey in locusKeyList:
                counter += 1
                locusMapData = self.locusKey2mapData.get(locusKey)
                locusData = locusMapData.locusData
                if locusMapData.leftIntervalDelta is not None:
                    leftProbability = self.calculateLocusMapProbabilityGivenIntervalDelta(intervalDelta=locusMapData.leftIntervalDelta, \
                             targetChromosomeMapData=self.targetChromosome2mapData.get(locusData.newChr),\
                             crossChromosomeProbability=self.crossChromosomeProbability)
                else:
                    leftProbability = 0
                if locusMapData.rightIntervalDelta is not None:
                    rightProbability = self.calculateLocusMapProbabilityGivenIntervalDelta(intervalDelta=locusMapData.rightIntervalDelta, \
                           targetChromosomeMapData=self.targetChromosome2mapData.get(locusData.newChr),\
                           crossChromosomeProbability=self.crossChromosomeProbability)
                else:
                    rightProbability = 0
                mapProbability = max(leftProbability, rightProbability)
                data_row = [locusData.queryChromosome,\
                 locusData.queryStart, locusData.queryStop, locusData.queryStrand, \
                 locusData.newChr, locusData.newRefStart, locusData.newRefStop, mapProbability]
                self.writer.writerow(data_row)
                real_counter += 1
            sys.stderr.write("\n")
        else:  #single SNP (give a low probability)
            sys.stderr.write(
                "Zero intervals, output %s SNPs with 0.001 map p-value ..." %
                (len(locusKeyList)))
            for locusKey in locusKeyList:
                counter += 1
                locusMapData = self.locusKey2mapData.get(locusKey)
                locusData = locusMapData.locusData
                mapProbability = 0.001
                data_row = [locusData.queryChromosome,\
                 locusData.queryStart, locusData.queryStop, locusData.queryStrand, \
                 locusData.newChr, locusData.newRefStart, locusData.newRefStop, mapProbability]
                self.writer.writerow(data_row)
                real_counter += 1
            sys.stderr.write("\n")

        if counter > 0:
            fraction = float(real_counter) / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s/%s (%.3f) outputted.\n" %
                         (real_counter, counter, fraction))

        self.sideOutput.close()
        #close the self.invariantPData.writer
        AbstractMatrixFileWalker.reduce(self, **keywords)
示例#26
0
	def calculatePerSampleMismatchFraction(self, vcfFile1=None, vcfFile2=None, outputFname=None, overlapping_sample_id_set=None,\
										NA_call_encoding_set=None):
		"""
		For every sample shared by both VCF files, count matching genotype
		calls over the loci present in both files and write one row per
		sample (sample_id, no_of_matches, no_of_non_NA_pairs, matchFraction)
		to outputFname.  matchFraction is -1 when a sample has no non-NA pair.
		
		2013.08.13 bugfix, derive overlapping_sites_set by itself, rather than use calculateOverlappingSites()
		2013.07.17 vcf files are no longer pre-loaded.
		2012.8.16
		"""
		#avoid a shared mutable default argument
		if NA_call_encoding_set is None:
			NA_call_encoding_set = set(['.', 'NA'])
		sys.stderr.write("Finding matches for each sample at overlapping sites ...")
		writer = MatrixFile(outputFname, openMode='w', delimiter='\t')
		header = ['sample_id', 'no_of_matches', 'no_of_non_NA_pairs', 'matchFraction']
		no_of_samples_to_compare = len(overlapping_sample_id_set)
		
		vcfFile1._resetInput()
		vcfFile1.parseFile()
		vcfFile2._resetInput()
		vcfFile2.parseFile()
		
		overlapping_sites_set = set(vcfFile1.locus_id_ls) & set(vcfFile2.locus_id_ls)
		sys.stderr.write(" %s overlapping loci, "%(len(overlapping_sites_set)))
		
		overlapping_sample_id_list = sorted(overlapping_sample_id_set)
		no_of_matches_per_sample_ls = [0]*no_of_samples_to_compare
		no_of_non_NA_pairs_per_sample_ls = [0]*no_of_samples_to_compare
		
		for locus_id in overlapping_sites_set:
			row_index1 = vcfFile1.locus_id2row_index[locus_id]
			row_index2 = vcfFile2.locus_id2row_index[locus_id]
			for j, sample_id in enumerate(overlapping_sample_id_list):
				col_index1 = vcfFile1.sample_id2index.get(sample_id)
				col_index2 = vcfFile2.sample_id2index.get(sample_id)
				call1 = vcfFile1.genotype_call_matrix[row_index1][col_index1]
				call2 = vcfFile2.genotype_call_matrix[row_index2][col_index2]
				if call1 not in NA_call_encoding_set and call2 not in NA_call_encoding_set:
					no_of_non_NA_pairs_per_sample_ls[j] += 1
					#compare via nt2number so unphased 'AG'/'GA' (and 'AT'/'TA') count as identical
					#	(2012.1.17 & 2013.07.03 bugfixes)
					if nt2number[call1]==nt2number[call2]:
						no_of_matches_per_sample_ls[j] += 1
		matchFractionLs = [-1]*no_of_samples_to_compare
		#range() instead of py2-only xrange(): works under both Python 2 and 3
		for j in range(no_of_samples_to_compare):
			if no_of_non_NA_pairs_per_sample_ls[j]>0:
				matchFractionLs[j] = no_of_matches_per_sample_ls[j]/float(no_of_non_NA_pairs_per_sample_ls[j])
		
		writer.writerow(header)
		for i in range(no_of_samples_to_compare):
			data_row = [overlapping_sample_id_list[i], no_of_matches_per_sample_ls[i], no_of_non_NA_pairs_per_sample_ls[i],\
					matchFractionLs[i]]
			writer.writerow(data_row)
		del writer
		sys.stderr.write("%s samples.\n"%(no_of_samples_to_compare))
示例#27
0
	def outputSwitchPointInfo(self, querySNPID2NewReferenceCoordinateLs=None, outputFname=None):
		"""
		Output per-old-chromosome switch-point statistics to outputFname.
		A switch point is a pair of adjacent loci (ordered by old coordinates)
		whose new coordinates jump to a different chromosome or reverse
		orientation on the same chromosome.  Loci mapped to more than one
		new coordinate are discarded from the switch-point count.
		2013.07.11
			output the switch point (adjacent sites mapped to two different chromosomes) information
		"""
		
		sys.stderr.write("Converting querySNPID2NewReferenceCoordinateLs to oldCoordinateKey2newCoordinateDataLs ... ")
		oldCoordinateKey2newCoordinateDataLs = {}
		counter = 0
		#.items() rather than py2-only .iteritems(): works under both Python 2 and 3
		for querySNPID, newRefCoordinateLs in querySNPID2NewReferenceCoordinateLs.items():
			oldCoordinateKey = None
			counter += len(newRefCoordinateLs)
			for newRefCoordinate in newRefCoordinateLs:
				if oldCoordinateKey is None:
					oldCoordinateKey = (newRefCoordinate.queryChromosome, newRefCoordinate.queryStart, newRefCoordinate.queryStop)
				if oldCoordinateKey not in oldCoordinateKey2newCoordinateDataLs:
					oldCoordinateKey2newCoordinateDataLs[oldCoordinateKey] = []
				oldCoordinateKey2newCoordinateDataLs[oldCoordinateKey].append(newRefCoordinate)
		sys.stderr.write(" %s old coordinate keys with %s new coordinates.\n"%(len(oldCoordinateKey2newCoordinateDataLs),\
																		counter))
		
		sys.stderr.write("Finding switch points ...")
		counter = 0
		real_counter = 0
		noOfRecordsWithMultiNewCoords = 0

		oldChromosome2SwitchData = {}
		
		#sorted() works on dict views in both Python 2 and 3
		#	(.keys() returns an unsortable view under Python 3)
		oldCoordinateKeyLs = sorted(oldCoordinateKey2newCoordinateDataLs)
		for oldCoordinateKey in oldCoordinateKeyLs:
			counter += 1
			newRefCoordinateLs = oldCoordinateKey2newCoordinateDataLs.get(oldCoordinateKey)
			
			oldChromosome = oldCoordinateKey[0]
			
			if oldChromosome not in oldChromosome2SwitchData:
				oldChromosome2SwitchData[oldChromosome] = PassingData(noOfLociWithUniqueHit=0, noOfLoci=0, \
														spanStart=oldCoordinateKey[1], \
														spanStop=oldCoordinateKey[2], noOfSwitchPoints=0,\
														previousNewChromosome=None, previousNewRefStart=None,\
														previousNewRefStop=None,\
														previousOrientationOnNewChromosome=None)
			
			switchData = oldChromosome2SwitchData[oldChromosome]
			switchData.noOfLoci += 1
			
			#ambiguously-mapped loci are counted but excluded from switch detection
			if len(newRefCoordinateLs)>1:
				noOfRecordsWithMultiNewCoords += 1
				continue
			
			switchData.noOfLociWithUniqueHit += 1
			newRefCoordinate = newRefCoordinateLs[0]
			
			if switchData.previousNewChromosome is not None:
				if newRefCoordinate.newChr!=switchData.previousNewChromosome:
					switchData.noOfSwitchPoints += 1
					#reset the orientation
					switchData.previousOrientationOnNewChromosome = None
					
				else:	#on the same chromosome
					#True when new coordinates move forward relative to the previous locus
					currentOrientation = (newRefCoordinate.newRefStart - switchData.previousNewRefStart)>=0
					if switchData.previousOrientationOnNewChromosome is not None:
						if currentOrientation !=switchData.previousOrientationOnNewChromosome:
							switchData.noOfSwitchPoints += 1
					switchData.previousOrientationOnNewChromosome = currentOrientation
					
			#adjust the spanStop
			if newRefCoordinate.queryStop > switchData.spanStop:
				switchData.spanStop = newRefCoordinate.queryStop
					
			
			switchData.previousNewChromosome = newRefCoordinate.newChr
			switchData.previousNewRefStart = newRefCoordinate.newRefStart
			switchData.previousNewRefStop = newRefCoordinate.newRefStop
			real_counter  += 1
		if counter >0:
			fraction = real_counter/float(counter)
		else:
			fraction = -1
		sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
																	fraction, noOfRecordsWithMultiNewCoords))
		
		
		sys.stderr.write("Outputting switch points of %s old chromosomes ..."%(len(oldChromosome2SwitchData)))
		statFile = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
		header = ['oldChromosome', "noOfSwitchPoints", "regionSpan", "noOfLociWithUniqueHit", "noOfSwitchesPerLocus", "noOfLoci"]
		statFile.writeHeader(header)
		noOfTotalSwitchPoints = 0
		noOfTotalLoci = 0
		for oldChromosome, switchData in oldChromosome2SwitchData.items():
			if switchData.noOfLociWithUniqueHit>0:
				switchPointFraction = switchData.noOfSwitchPoints/float(switchData.noOfLociWithUniqueHit)
			else:
				switchPointFraction = -1
			data_row = [oldChromosome, switchData.noOfSwitchPoints, switchData.spanStop-switchData.spanStart+1, \
					switchData.noOfLociWithUniqueHit, switchPointFraction, len(oldCoordinateKey2newCoordinateDataLs)]
			statFile.writerow(data_row)
			noOfTotalSwitchPoints += switchData.noOfSwitchPoints
			noOfTotalLoci += switchData.noOfLociWithUniqueHit
		statFile.close()
		sys.stderr.write(' %s total switch points, %s total loci with unique hit.\n'%(noOfTotalSwitchPoints, noOfTotalLoci))
示例#28
0
	def calculateOverlappingSites(self, vcfFile1=None, vcfFile2=None, outputFname=None, overlappingSitesOutputFname=None,\
						chromosome=None, chrLength=None):
		"""
		Count loci present in both VCF files and write a one-row summary
		(counts, overlap fractions, normalized segregating-site count) to
		outputFname.  Returns a PassingData with the overlapping sample-id
		set and the overlapping site set.  All ratios fall back to -1 when
		their denominator is zero.
		
		2013.09.10
			added argument overlappingSitesOutputFname
		2013.07.17 vcf files are no longer pre-loaded. read in locus ids first. 
		2012.8.16
		"""
		writer = MatrixFile(outputFname, openMode='w', delimiter='\t')
		header = ['#chromosome', 'length', '#sitesInInput1', '#sitesInInput2', '#overlapping', 'overlappingOverTotal', \
				'overlappingOverInput1', 'overlappingOverInput2', '#segregatingSitesNormalized', ]
		
		#loci are identified by (chromosome, position)
		vcf1_locus_id_list = []
		for row in vcfFile1.reader:
			vcf1_locus_id_list.append((row[0], row[1]))
		vcf2_locus_id_list = []
		for row in vcfFile2.reader:
			vcf2_locus_id_list.append((row[0], row[1]))
		
		no_of_sites_of_input1 = len(vcf1_locus_id_list)
		no_of_sites_of_input2 = len(vcf2_locus_id_list)
		overlapping_sites_set = set(vcf1_locus_id_list)&set(vcf2_locus_id_list)
		if overlappingSitesOutputFname:
			self.outputOverlapSites(overlapping_sites_set=overlapping_sites_set, outputFname=overlappingSitesOutputFname)
		
		no_of_overlapping_sites = len(overlapping_sites_set)
		no_of_total_sites = no_of_sites_of_input1+no_of_sites_of_input2-no_of_overlapping_sites
		if no_of_total_sites>0:
			overlapping_fraction = no_of_overlapping_sites/float(no_of_total_sites)
		else:
			overlapping_fraction = -1
		
		if no_of_sites_of_input1>0:
			overlappingOverInput1 = no_of_overlapping_sites/float(no_of_sites_of_input1)
		else:
			overlappingOverInput1 = -1
		
		if no_of_sites_of_input2>0:
			overlappingOverInput2 = no_of_overlapping_sites/float(no_of_sites_of_input2)
		else:
			overlappingOverInput2 = -1
		
		no_of_samples = len(vcfFile1.sample_id2index)
		no_of_samples_in_vcf2 = len(vcfFile2.sample_id2index)
		overlapping_sample_id_set = set(vcfFile1.sample_id2index.keys()) & set(vcfFile2.sample_id2index.keys())
		
		if no_of_samples!=no_of_samples_in_vcf2:
			sys.stderr.write("Warning: sample size in %s is %s, in %s is %s. not matching.\n"%\
							(vcfFile1.inputFname, no_of_samples, vcfFile2.inputFname, no_of_samples_in_vcf2))
		
		#exclude the ref sample in the 1st column
		if no_of_samples>1:
			normalizingConstant = float(utils.sumOfReciprocals(no_of_samples*2-1))
		else:
			normalizingConstant = 1
		#guard against chrLength being None or 0, consistent with the -1 fallbacks above
		if chrLength:
			noOfSegregatesSitesNormalized = no_of_overlapping_sites/(normalizingConstant*chrLength)
		else:
			noOfSegregatesSitesNormalized = -1
		
		writer.writerow(header)
		writer.writerow([chromosome, chrLength, no_of_sites_of_input1, no_of_sites_of_input2, no_of_overlapping_sites, \
						overlapping_fraction, overlappingOverInput1, overlappingOverInput2, \
						noOfSegregatesSitesNormalized])
		del writer
		return PassingData(overlapping_sample_id_set=overlapping_sample_id_set,overlapping_sites_set=overlapping_sites_set) 
示例#29
0
    def readInCoordinateMap(self, coordinateMapFname=None):
        """
		2013.07.11
			querySNPID      queryStrand     queryChromosome queryStart      queryStop       queryRefBase    queryAltBase    queryAlignmentSpan
			queryAlignmentStart     queryAlignmentStop      newChr  newRefStart     newRefStop      newRefBase      targetAlignmentSpan
			targetAlignmentStart    targetAlignmentStop
		"""
        sys.stderr.write("Reading in the coordinate map from %s ..." %
                         (coordinateMapFname))
        oldCoordinate2newCoordinateDataLs = {}
        reader = MatrixFile(inputFname=coordinateMapFname)
        reader.constructColName2IndexFromHeader()
        oldChromosomeIndex = reader.getColIndexGivenColHeader(
            "queryChromosome")
        oldStartIndex = reader.getColIndexGivenColHeader("queryStart")
        strandIndex = reader.getColIndexGivenColHeader("queryStrand")
        oldRefBaseIndex = reader.getColIndexGivenColHeader("queryRefBase")
        oldAltBaseIndex = reader.getColIndexGivenColHeader("queryAltBase")

        newChromosomeIndex = reader.getColIndexGivenColHeader("newChr")
        newStartIndex = reader.getColIndexGivenColHeader("newRefStart")
        newStopIndex = reader.getColIndexGivenColHeader("newRefStop")
        newRefBaseIndex = reader.getColIndexGivenColHeader("newRefBase")
        counter = 0
        for row in reader:
            oldChromosome = row[oldChromosomeIndex]
            oldStart = int(row[oldStartIndex])
            strand = row[strandIndex]
            oldRefBase = row[oldRefBaseIndex]
            oldAltBase = row[oldAltBaseIndex]

            newChromosome = row[newChromosomeIndex]
            newStart = int(row[newStartIndex])
            newStop = int(row[newStopIndex])
            newRefBase = row[newRefBaseIndex]

            key = (oldChromosome, oldStart)
            if key not in oldCoordinate2newCoordinateDataLs:
                oldCoordinate2newCoordinateDataLs[key] = []
            oldCoordinate2newCoordinateDataLs[key].append(PassingData(strand=strand, oldRefBase=oldRefBase, \
                     oldAltBase=oldAltBase, newChromosome=newChromosome, newStart=newStart,\
                     newStop=newStop, newRefBase=newRefBase))
            counter += 1
        del reader
        sys.stderr.write("%s old coordinates with %s new coordinates.\n" %
                         (len(oldCoordinate2newCoordinateDataLs), counter))
        return oldCoordinate2newCoordinateDataLs
示例#30
0
#!/usr/bin/env python
"""Sum the total span (in bases) covered by the intervals of a BED file.

Usage: thisScript.py input.bed[.gz]
Expects tab-delimited rows of at least (chromosome, start, stop); lines
whose first field starts with '#' are skipped as headers/comments.
"""

import os, sys
# dead example-path assignments removed: the value always came from argv
inputFname = sys.argv[1]
sys.path.insert(0, os.path.expanduser('~/lib/python'))
sys.path.insert(0, os.path.join(os.path.expanduser('~/script')))
from pymodule import utils
from pymodule import MatrixFile
reader = MatrixFile(inputFname=inputFname, openMode='r', delimiter='\t')
span = 0

for row in reader:
    # startswith avoids the IndexError row[0][0] raised on an empty first field
    if row[0].startswith('#'):
        continue
    subSpan = int(row[2]) - int(row[1]) + 1
    span += subSpan

print("span is %s \n"%(span))