def openWriteBeagleFiles(self,
                             pedigreeFamilyData=None,
                             outputFnamePrefix=None):
        """
        2013.05.02
            
        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M rs12082861 C C C C
            M rs4912233 T C C C
            M rs12732823 G A A A
            M rs17451521 C C C C
            M rs12033358 C T T T
        
        The likelihood version is
            marker alleleA alleleB 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1002_711_2001039_GA_vs_524
            Contig791:1086 C A 0.9693 0.0307 0.0000 0.6660 0.3338 0.0003 0.0000
            Contig791:1649 G C 0.9406 0.0594 0.0000 0.9693 0.0307 0.0000 0.0000
            Contig791:4084 A C 0.9980 0.0020 0.0000 0.9844 0.0156 0.0000 0.0000
        
        The markers file has this format (markerID, position, alleleA, alleleB)
            Contig791:1086 1086 C A
        """
        sys.stderr.write(
            "Opening beagle files (outputFnamePrefix =%s) to write ..." %
            (outputFnamePrefix))
        familySize2BeagleFileHandler = {}
        familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList
        counter = 0
        for familySize, sampleIDList in familySize2SampleIDList.items():
            if familySize not in familySize2BeagleFileHandler:
                tmpOutputFnamePrefix = '%s_familySize%s' % (outputFnamePrefix,
                                                            familySize)
                writer = MatrixFile(path='%s.bgl' % (tmpOutputFnamePrefix),
                                    mode='w',
                                    delimiter=' ')
                familySize2BeagleFileHandler[familySize] = writer
                if familySize == 1:
                    headerRow = ['marker', 'alleleA', 'alleleB']
                else:
                    headerRow = ['I', 'id']
                for sampleID in sampleIDList:
                    if familySize == 1:
                        #likelihood format has sample name replicated three times, rather than 2 times
                        headerRow.extend([sampleID] * 3)
                    else:
                        headerRow.extend([sampleID] * 2)
                writer.writeHeader(headerRow)
                counter += 1
        markersFile = MatrixFile(path='%s.markers' % (outputFnamePrefix),
                                 mode='w',
                                 delimiter=' ')

        counter += 1
        sys.stderr.write("%s files outputted.\n" % (counter))

        return PassingData(
            familySize2BeagleFileHandler=familySize2BeagleFileHandler,
            markersFile=markersFile)
	def run(self):
		if self.debug:
			import pdb
			pdb.set_trace()
		
		
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)
		
		reader = VCFFile(inputFname=self.inputFname)
		
		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")
		
		writer = VCFFile(outputFname=self.outputFname, mode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()
		
		statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)
		
		counter = 0
		real_counter = 0
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold
		
		for vcfRecord in reader:
			locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
			alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)	#start and end in fetch() are 0-based.
			locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
												minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
			locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
			depth = locusLowMapQData.totalNoOfReads
			if depth>=minDepth and depth <=maxDepth:
				locusOutOfDepthIndicator = 0 	#good
			else:
				locusOutOfDepthIndicator = 1
			
			locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
			data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
						1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
						locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
			statWriter.writerow(data_row)
			if locusLowQualityIndicator>0:
				real_counter += 1
				#modify the VCF record
				#get sample ID column, then set its genotype missing
				vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
			#2014.1.4 output VCF record
			writer.writeVCFRecord(vcfRecord)
			counter += 1
		reader.close()
		statWriter.close()
		writer.close()
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
												real_counter/float(counter)))
예제 #3
0
    def run(self):
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        snp_pos2genotypeVectorLs = self.readInSNPID2GenotypeVectorLs(
            self.inputFname).snp_pos2returnData

        writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
        header = [
            'chromosome', 'position', 'noOfMatches', 'noOfTotal', 'concordance'
        ]
        writer.writeHeader(header)

        counter = 0
        real_counter = 0
        no_of_pairs = 0
        snp_pos_ls = sorted(snp_pos2genotypeVectorLs)
        for i in range(len(snp_pos_ls)):
            counter += 1
            key = snp_pos_ls[i]
            chromosome, position = snp_pos_ls[i][:2]
            genotypeVectorLs = snp_pos2genotypeVectorLs.get(key)
            if len(genotypeVectorLs) > 1:
                real_counter += 1
                for k in range(0, len(genotypeVectorLs) - 1):
                    for l in range(k + 1, len(genotypeVectorLs)):
                        no_of_pairs += 1
                        noOfMatches = 0
                        noOfTotal = 0
                        genotypeVector0 = genotypeVectorLs[k]
                        genotypeVector1 = genotypeVectorLs[l]
                        for j in range(len(genotypeVector0)):
                            call1 = genotypeVector0[j]['GT']
                            call2 = genotypeVector1[j]['GT']
                            if call1 != 'NA' and call2 != 'NA':
                                noOfTotal += 1
                                if SNP.nt2number[call1] == SNP.nt2number[
                                        call2]:
                                    noOfMatches += 1
                        if noOfTotal > 0:
                            concordance = float(noOfMatches) / float(noOfTotal)
                        else:
                            concordance = -1
                        data_row = [
                            chromosome, position, noOfMatches, noOfTotal,
                            concordance
                        ]
                        writer.writerow(data_row)
        writer.close()
        sys.stderr.write("%s (out of %s, %s) snps have >1 same-position entries. %s pairs.\n"%(real_counter, counter, \
                                                real_counter/float(counter), no_of_pairs))
    def run(self):
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        switchPointData = self.readInStats(inputFname=self.inputFname)

        sys.stderr.write("Processing data ...")
        writer = MatrixFile(self.outputFname, mode='w')
        header = [
            "maxSwitchFrequency", "genomeCovered", 'genomeCoveredFraction',
            "noOfLoci", 'noOfLociFraction'
        ]
        writer.writeHeader(header)

        data_matrix = switchPointData.data_matrix
        totalSpan = switchPointData.totalSpan
        totalNoOfLoci = switchPointData.totalNoOfLoci

        #sort it based on switchFrequency
        data_matrix.sort(reverse=True)
        maxSwitchFrequencyLs = []
        cumulativeRegionSpanLs = []
        cumulativeNoOfLociLs = []
        for i in range(len(data_matrix)):
            switchFrequency, regionSpan, noOfLoci = data_matrix[i]
            maxSwitchFrequencyLs.append(switchFrequency)
            if i == 0:
                cumulativeRegionSpan = totalSpan - regionSpan

                cumulativeNoOfLoci = totalNoOfLoci - noOfLoci
            else:
                cumulativeRegionSpan = cumulativeRegionSpanLs[i -
                                                              1] - regionSpan
                cumulativeNoOfLoci = cumulativeNoOfLociLs[i - 1] - noOfLoci
            cumulativeRegionSpanLs.append(cumulativeRegionSpan)
            cumulativeNoOfLociLs.append(cumulativeNoOfLoci)
            writer.writerow([switchFrequency, cumulativeRegionSpan, cumulativeRegionSpan/float(totalSpan),\
                cumulativeNoOfLoci, cumulativeNoOfLoci/float(totalNoOfLoci)])
        writer.close()
        sys.stderr.write(".\n")
예제 #5
0
    def run(self):
        """
		"""

        if self.debug:
            import pdb
            pdb.set_trace()

        reader = MatrixFile(path=self.inputFname)
        reader.constructColName2IndexFromHeader()

        meanMendelErrorIndex = reader.getColIndexGivenColHeader(
            "meanMendelError")
        noOfLociIndex = reader.getColIndexGivenColHeader("sampled_base_count")
        sumOfMendelErrorIndex = reader.getColIndexGivenColHeader(
            "sumOfMendelError")

        plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
        familyStructureData = plinkPedigreeFile.getFamilyStructurePlinkWay()

        twoParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData, \
                        parentSetSize=2)
        singleParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData, \
                        parentSetSize=1)
        zeroParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData, \
                        parentSetSize=0)

        writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
        header = ["ID", "noOfTotalLoci", \
          "noOfTwoParentFamilies", "noOfParentsInTwoParentFamilies", "noOfKidsInTwoParentFamilies", "noOfIndividualsInTwoParentFamilies", \
          "noOfSingleParentFamilies", "noOfParentsInSingleParentFamilies", "noOfKidsInSingleParentFamilies",  "noOfIndividualsInSingleParentFamilies", \
          "noOfZeroParentFamilies", "noOfParentsInZeroParentFamilies", "noOfKidsInZeroParentFamilies", "noOfIndividualsInZeroParentFamilies", \
          "noOfTotalMendelErrors", \
          "noOfMendelErrorsPerLocusPerNuclearFamily", "noOfMendelErrorsPerNuclearFamily"]
        writer.writeHeader(header)
        for row in reader:
            meanMendelError = float(row[meanMendelErrorIndex])
            noOfLoci = int(row[noOfLociIndex])
            sumOfMendelError = int(row[sumOfMendelErrorIndex])
            noOfNuclearFamilies = twoParentFamilyCountData.noOfFamilies
            if noOfNuclearFamilies > 0:
                noOfMendelErrorsPerLocusPerNuclearFamily = meanMendelError / float(
                    noOfNuclearFamilies)
                noOfMendelErrorsPerNuclearFamily = sumOfMendelError / float(
                    noOfNuclearFamilies)
            else:
                noOfMendelErrorsPerLocusPerNuclearFamily = -1
                noOfMendelErrorsPerNuclearFamily = -1
            data_row = [row[0], noOfLoci, \
              noOfNuclearFamilies, twoParentFamilyCountData.noOfParents, twoParentFamilyCountData.noOfKids, \
               twoParentFamilyCountData.noOfIndividuals,\
              singleParentFamilyCountData.noOfFamilies,  singleParentFamilyCountData.noOfParents,  singleParentFamilyCountData.noOfKids,\
               singleParentFamilyCountData.noOfIndividuals,\
              zeroParentFamilyCountData.noOfFamilies, zeroParentFamilyCountData.noOfParents,  zeroParentFamilyCountData.noOfKids,\
               zeroParentFamilyCountData.noOfIndividuals,\
              sumOfMendelError, \
              noOfMendelErrorsPerLocusPerNuclearFamily,noOfMendelErrorsPerNuclearFamily ]
            writer.writerow(data_row)

        plinkPedigreeFile.close()
        reader.close()
        writer.close()