def getAssociationLandscapeDataFromHDF5File(inputFname=None, associationTableName='association',
        landscapeTableName='landscape', min_MAF=0.1):
    """
    Read association results and the landscape (bridges between loci) from one HDF5 file.

    2012.11.20
        input is in HDF5MatrixFile format (output of variation/src/association_peak/).
        It contains two hdf5 groups: one named associationTableName,
        the other named landscapeTableName.

    Arguments:
        inputFname: path of the HDF5 file.
        associationTableName: table handed to getGenomeWideResultFromHDF5MatrixFile().
        landscapeTableName: table whose rows describe the bridges.
        min_MAF: wrapped in a PassingData and passed on as pdata.

    Returns a PassingData with:
        genome_wide_result: result of getGenomeWideResultFromHDF5MatrixFile()
        HDF5AttributeNameLs: names of the landscape table attributes; each attribute
            is also copied onto the returned object under its own name.
        bridge_ls: list of [start_obj, stop_obj, no_of_loci, deltaX]
        locusLandscapeNeighborGraph: nx.Graph, one edge per bridge, keyed by the
            .index of the start/stop data objects.
    """
    genome_wide_result = getGenomeWideResultFromHDF5MatrixFile(
        inputFname=inputFname,
        min_value_cutoff=None,
        do_log10_transformation=False,
        pdata=PassingData(min_MAF=min_MAF),
        construct_chr_pos2index=False,
        construct_data_obj_id2index=False,
        construct_locus_db_id2index=True,
        report=True,
        tableName=associationTableName,
    )
    returnData = PassingData(genome_wide_result=genome_wide_result)

    sys.stderr.write("Reading landscape from %s ..."%(inputFname))
    bridges = []
    neighborGraph = nx.Graph()
    reader = HDF5MatrixFile(inputFname, mode='r')
    landscapeTable = reader.getTableObject(tableName=landscapeTableName)

    # expose every table-level attribute on the return object and remember their names
    returnData.HDF5AttributeNameLs = []
    for attributeName, value in landscapeTable.getAttributes().items():
        returnData.HDF5AttributeNameLs.append(attributeName)
        setattr(returnData, attributeName, value)

    for row in landscapeTable:
        if row.start_locus_id == 0:
            # empty data. happens when inputFname contains no valid landscape,
            # but one default null data point.
            continue
        no_of_loci = row.no_of_loci
        deltaX = row.deltaX
        start_obj = genome_wide_result.get_data_obj_by_locus_db_id(row.start_locus_id)
        stop_obj = genome_wide_result.get_data_obj_by_locus_db_id(row.stop_locus_id)
        bridges.append([start_obj, stop_obj, no_of_loci, deltaX])
        neighborGraph.add_edge(start_obj.index, stop_obj.index, weight=None,
            no_of_loci=no_of_loci, deltaX=deltaX)
    del reader

    sys.stderr.write("%s bridges.\n"%(len(bridges)))
    returnData.bridge_ls = bridges
    returnData.locusLandscapeNeighborGraph = neighborGraph
    return returnData
def addPolymorphism(self, name=None, individualName=None, locusName=None, chromosome_copy=None,
        allele_sequence=None, allele_sequence_length=None, allele_type=None, **keywords):
    """
    Write one polymorphism row (when a name is given) and return the lookup result.

    2013.3.10

    Arguments:
        name: name of the polymorphism. If empty/None nothing is written.
        individualName, locusName: resolved through self.getIndividual() and
            self.getLocus(); the .id of each result is stored in the row.
        chromosome_copy, allele_sequence, allele_sequence_length, allele_type:
            stored in the row as given.
        **keywords: extra fields forwarded to PassingData.

    Returns whatever self.checkPolymorphism(name=name) returns.
    """
    if not name:
        return self.checkPolymorphism(name=name)

    individual_id = self.getIndividual(name=individualName).id
    locus_id = self.getLocus(name=locusName).id
    newRow = PassingData(
        name=name,
        individual_id=individual_id,
        locus_id=locus_id,
        chromosome_copy=chromosome_copy,
        allele_sequence=allele_sequence,
        allele_sequence_length=allele_sequence_length,
        allele_type=allele_type,
        **keywords)
    self.polymorphismTable.writeOneCell(newRow, cellType=2)
    # flush before the lookup below (would this work without flush()?)
    self.flush()
    return self.checkPolymorphism(name=name)
def getNoOfFamiliesAndKidsGivenParentSetSize(self, noOfParents2FamilyData=None, parentSetSize=2):
    """
    Summarize the families that have a given number of parents.

    2013.07.19

    Arguments:
        noOfParents2FamilyData: dictionary, key is the number of parents, value is an
            object with parentTupleSet, parentIDSet, childIDSet and individualIDSet.
        parentSetSize: the key to look up.

    Returns a PassingData with noOfFamilies, noOfParents, noOfKids and noOfIndividuals.
    All four are 0 when parentSetSize is absent or its entry is falsy.
    """
    familyData = noOfParents2FamilyData.get(parentSetSize, None)
    if not familyData:
        return PassingData(noOfFamilies=0, noOfParents=0, noOfKids=0, noOfIndividuals=0)
    return PassingData(
        noOfFamilies=len(familyData.parentTupleSet),
        noOfParents=len(familyData.parentIDSet),
        noOfKids=len(familyData.childIDSet),
        noOfIndividuals=len(familyData.individualIDSet))
def parseChrStartStopFromFilename(filename=None, chr2size=None, defaultChromosomeSize=10000000000):
    """
    Derive (chromosome, start, stop) from a file name.

    2013.09.18
        10000000000 is used when filename contains data from a whole chromosome
        and chr2size is not available or does not contain the chromosome.
        It is made very big so that it can be intersected with any interval
        from any chromosome.

    Arguments:
        filename: file name or path to parse.
        chr2size: optional dictionary, chromosome -> size.
        defaultChromosomeSize: stop coordinate used when the size is unknown.

    Returns a PassingData with chromosome, start and stop.
    If no chr_start_stop pattern is found, the chromosome comes from
    getChrFromFname(), start is 1 and stop is the chromosome size (or the default).
    """
    import os
    import re

    # NOTE(review): the regular expression and the group() lookups were missing from
    # this copy of the file. The pattern below ("<chromosome>_<start>_<stop>", searched
    # in the base name only so that directory names cannot match) is a reconstruction.
    # Confirm against the original module-level pattern before relying on it,
    # in particular for chromosome names that themselves contain underscores.
    searchResult = re.search(r'([A-Za-z0-9]+)_(\d+)_(\d+)', os.path.basename(filename))
    if searchResult:
        chromosome = searchResult.group(1)
        start = int(searchResult.group(2))
        stop = int(searchResult.group(3))
    else:
        # no interval in the name: treat the file as covering a whole chromosome
        chromosome = getChrFromFname(filename=filename)
        start = 1
        if chr2size is not None:
            stop = chr2size.get(chromosome, defaultChromosomeSize)
        else:
            stop = defaultChromosomeSize
    return PassingData(chromosome=chromosome, start=start, stop=stop)
def writeChrStartStopTupleList2LocusTable(self, chr_start_stop_list=None, chromosomeLength=None,
        speciesName=None, ploidy=None):
    """
    Write a list of (chr, start, stop) tuples into the locus table.

    2013.3.7
        #. establish _locus_index2id, to be used in writeIndividualName2PolymorphismData()
        #. make sure chr_start_stop_list is in the same order as the haplotype
            in writeIndividualName2PolymorphismData()

    Arguments:
        chr_start_stop_list: list of tuples whose first three elements are
            (chromosomeName, start, stop). It is sorted IN PLACE.
        chromosomeLength, speciesName: forwarded to self.getChromosome().
        ploidy: forwarded to self.getChromosome(); defaults to self.ploidy.

    Returns self._locus_index2id, which maps the index of each tuple in the sorted
    list to self.locusTable.no_of_rows as read right after that locus was written.
    """
    sys.stderr.write("Writing a %s-element list of (chr, start,stop) out ..."%(len(chr_start_stop_list)))
    chr_start_stop_list.sort()  # make sure it's sorted
    if ploidy is None:
        ploidy = self.ploidy
    for i, chr_start_stop in enumerate(chr_start_stop_list):
        chromosomeName, start, stop = chr_start_stop[:3]
        if chromosomeName:
            chromosomeEntry = self.getChromosome(name=chromosomeName, length=chromosomeLength,
                speciesName=speciesName, ploidy=ploidy)
        else:
            chromosomeEntry = None
        name = '%s_%s_%s'%(chromosomeName, start, stop)
        # chromosome_id stays None when there is no chromosome entry
        oneCell = PassingData(name=name, chromosome_id=getattr(chromosomeEntry, 'id', None),
            start=start, stop=stop)
        self.locusTable.writeOneCell(oneCell, cellType=2)
        self._locus_index2id[i] = self.locusTable.no_of_rows
    # the format argument was missing here, so a literal "%s loci" used to be printed
    sys.stderr.write("%s loci \n"%(len(chr_start_stop_list)))
    return self._locus_index2id
def setup(self, **keywords):
    """
    Prepare the per-individual probability mass before anything else is run.

    2012.10.15
        run before anything is run

    Steps:
        #. call AbstractMatrixFileWalker.setup()
        #. read the IBD/kinship adjacency list into self.ibdData
        #. read the alignment coverage file (column 0 -> column 1)
        #. collect all sample IDs from the VCF files in self.inputFnameLs
        #. restrict the pedigree graph to individuals with data and record,
            for each individual, its family context (family size, in/out degree,
            individual coverage, family coverage)
        #. assign each individual an importance score = normalized out-degree
            (no of offspring) + normalized individual coverage

    Sets:
        self.ibdData
        self.individualID2probabilityMass: individual ID -> importance score
        self.individualID2HaplotypeData: individual ID -> None (placeholder)
    """
    AbstractMatrixFileWalker.setup(self, **keywords)

    # read in the IBD check result
    self.ibdData = SNP.readAdjacencyListDataIntoMatrix(
        inputFname=self.pedigreeKinshipFilePath,
        rowIDHeader=None, colIDHeader=None,
        rowIDIndex=0, colIDIndex=1,
        dataHeader=None, dataIndex=2, hasHeader=False)

    #. read in the alignment coverage data
    alignmentCoverageFile = MatrixFile(path=self.individualAlignmentCoverageFname)
    alignmentCoverageFile.constructColName2IndexFromHeader()
    alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(
        keyColumnIndexList=[0], valueColumnIndexList=[1])
    alignmentCoverageFile.close()

    sys.stderr.write(
        "Reading in all samples from %s VCF input files ... \n" % (len(self.inputFnameLs)))
    # only the sample IDs are collected here; haplotypes are not loaded (value stays None)
    individualID2HaplotypeData = {}
    for inputFname in self.inputFnameLs:
        vcfFile = VCFFile(inputFname=inputFname)
        for individualID in vcfFile.getSampleIDList():
            individualID2HaplotypeData[individualID] = None
    sys.stderr.write("%s individuals total.\n" % (len(individualID2HaplotypeData)))

    #. read in the pedigree
    #. construct individualID2familyContext
    sys.stderr.write("Constructing individualID2pedigreeContext ...")
    plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
    pGraph = plinkPedigreeFile.pedigreeGraph
    # shrink the graph to only individuals with data
    pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())
    cc_subgraph_list = nx.connected_component_subgraphs(pGraph.to_undirected())

    individualID2familyContext = {}
    outDegreeContainer = NumberContainer(minValue=0)
    familySizeContainer = NumberContainer(minValue=0)
    individualCoverageContainer = NumberContainer(minValue=0)
    familyCoverageContainer = NumberContainer(minValue=0)
    for cc_subgraph in cc_subgraph_list:
        familySize = len(cc_subgraph)
        familySizeContainer.addOneValue(familySize)

        familyCoverage = 0
        for n in cc_subgraph:
            # assuming each family is a two-generation trio/nuclear family
            individualCoverage = float(self.getIndividualCoverage(
                individualID=n,
                alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs))
            individualCoverageContainer.addOneValue(individualCoverage)
            familyCoverage += individualCoverage

            in_degree = pGraph.in_degree(n)
            out_degree = pGraph.out_degree(n)
            outDegreeContainer.addOneValue(out_degree)
            # familyCoverage is filled in below, once the whole family has been summed
            familyContext = PassingData(familySize=familySize, in_degree=in_degree,
                out_degree=out_degree, individualCoverage=individualCoverage,
                familyCoverage=None)
            if n not in individualID2familyContext:
                individualID2familyContext[n] = familyContext
            else:
                sys.stderr.write(
                    "Node %s already in individualID2familyContext.\n" % (n))
        familyCoverageContainer.addOneValue(familyCoverage)
        # set the family coverage for each member, used in weighing the individual.
        # better covered family => better haplotype
        for n in cc_subgraph:
            individualID2familyContext[n].familyCoverage = familyCoverage
    plinkPedigreeFile.close()
    sys.stderr.write("%s individuals.\n" % (len(individualID2familyContext)))

    # weigh each unique individual based on its sequencing coverage + no of offspring
    # => probability mass for each individual
    sys.stderr.write(
        "Weighing each individual , assigning probability mass ...")
    individualID2probabilityMass = {}
    for individualID, familyContext in individualID2familyContext.items():
        # the out-degree container must be fed an out-degree (no of offspring);
        # familySize used to be passed here by mistake
        outDegreeQuotient = outDegreeContainer.normalizeValue(
            familyContext.out_degree)
        individualCoverageQuotient = individualCoverageContainer.normalizeValue(
            familyContext.individualCoverage)
        individualID2probabilityMass[individualID] = \
            outDegreeQuotient + individualCoverageQuotient
    sys.stderr.write(" %s IDs with probability mass assigned.\n" %
        (len(individualID2probabilityMass)))

    self.individualID2probabilityMass = individualID2probabilityMass
    self.individualID2HaplotypeData = individualID2HaplotypeData
def calLD(cls, locus1_allele_ls, locus2_allele_ls, locus1_id=None, locus2_id=None):
    """
    Calculate D, D' and r2 between two bi-allelic loci.

    2010-9-30 copied from palos/
        locus1_allele_ls, locus2_allele_ls should be bi-allelic.
        If locus1_allele_ls and locus2_allele_ls are of different size,
        the extra elements are discarded.
    2008-09-05 adapted from variation.src.misc's LD.calculate_LD class
        only deal with 2-allele loci
        skip if either is NA, or if both are heterozygous (not phased)

    Arguments:
        locus1_allele_ls, locus2_allele_ls: alleles, one entry per individual/haplotype.
            Each allele is turned into an index by cls.fill_in_snp_allele2index();
            pairs where either index is above 1 (a 3rd allele) are ignored.
        locus1_id, locus2_id: stored in the result as snp_pair_ls.

    Returns:
        a PassingData with D, D_prime, r2, allele_freq (minor allele frequency
        of each locus), snp_pair_ls and no_of_pairs;
        or None when a frequency needed as a denominator is zero
        (no usable pairs, or a monomorphic locus).
    """
    counter_matrix = numpy.zeros([2, 2])    # only 2 alleles
    snp1_allele2index = {}
    snp2_allele2index = {}
    # zip() stops at the shorter list, which discards the extra elements
    for snp1_allele, snp2_allele in zip(locus1_allele_ls, locus2_allele_ls):
        snp1_allele_index = cls.fill_in_snp_allele2index(snp1_allele, snp1_allele2index)
        snp2_allele_index = cls.fill_in_snp_allele2index(snp2_allele, snp2_allele2index)
        if snp1_allele_index > 1 or snp2_allele_index > 1:  # ignore the 3rd allele
            continue
        counter_matrix[snp1_allele_index, snp2_allele_index] += 1

    # Convert the counts to plain Python floats. numpy scalars return nan/inf
    # (with a warning) on division by zero instead of raising ZeroDivisionError,
    # which made the handler below unreachable and let nan results leak out.
    count_A = float(counter_matrix[0, :].sum())
    count_a = float(counter_matrix[1, :].sum())
    count_B = float(counter_matrix[:, 0].sum())
    count_b = float(counter_matrix[:, 1].sum())
    count_AB = float(counter_matrix[0, 0])
    total_num = count_A + count_a
    try:
        PA = count_A / total_num
        Pa = count_a / total_num
        PB = count_B / total_num
        Pb = count_b / total_num
        PAB = count_AB / total_num
        D = PAB - PA * PB
        Dmin = max(-PA * PB, -Pa * Pb)
        Dmax = min(PA * Pb, Pa * PB)
        # Dmin is <= 0, so D_prime is non-negative in both branches
        if D < 0:
            D_prime = D / Dmin
        else:
            D_prime = D / Dmax
        r2 = D * D / (PA * Pa * PB * Pb)
    except ZeroDivisionError:
        # 2008-01-23 Dmin or Dmax could be 0
        sys.stderr.write('Unknown except, ignore: %s\n' % repr(sys.exc_info()[0]))
        return None

    return_data = PassingData()
    return_data.D = D
    return_data.D_prime = D_prime
    return_data.r2 = r2
    return_data.allele_freq = (min(PA, Pa), min(PB, Pb))
    return_data.snp_pair_ls = (locus1_id, locus2_id)
    return_data.no_of_pairs = total_num
    return return_data
def ltsFit(x_ls, y_ls, fractionUsed=0.6, startX=1, stopX=5):
    """
    Use ROOT to do least trimmed square (LTS) fitting of y = a + b*x.

    2010-6-1
        solve the computing node hang-up (I/O stuck) issue by setting
        ROOT.PyConfig.IgnoreCommandLineOptions = True (old version 5.18.0 doesn't
        have it; without it ROOT warns
        "Warning in <TApplication::GetOptions>: file <output file by -o > has size 0, skipping")
        and ROOT.PyConfig.StartGuiThread = 0. Both are best-effort.
    2010-5-30
        return chiSquare as well
    2010-5-21
        fit y=a+bx with trimming fraction = 1-fractionUsed.

    Arguments:
        x_ls, y_ls: sequences or numpy arrays. Anything that is not already a
            float64 numpy array is converted (ROOT is very dtype-sensitive;
            numpy.float32 won't work).
        fractionUsed: fraction of points kept by the robust fit ("rob" option).
        startX, stopX: range passed to the TF1 'pol1' function.

    Returns a PassingData with
        fit_y_ls: fitted value for each x in x_ls
        chiSquare: chi-square reported by the fitted function

    Example:
        import numpy, random
        x_ls = numpy.array(range(100), numpy.float64)
        y_ls = x_ls/2.
        for i in range(len(y_ls)):
            y_ls[i] += random.random()-0.5
        # mess up some portion of y
        for i in range(5):
            new_y_index = random.sample(range(100), 1)
            y_ls[new_y_index[0]] = random.random()

        # numpy.float32 is not supported by ROOT; such input is converted
        x_ls = numpy.array([2.64884758, 3.51235008, 2.83090925, 3.41229248, 3.01451969,
            2.49899888, 3.69988108, 2.74896216, 3.05307841, 3.75705409],
            dtype=numpy.float32)
        y_ls = numpy.array([2.52827311, 3.27265358, 2.36172366, 2.95760489, 2.50920248,
            2.3443923, 3.23502254, 2.35410833, 2.50582743, 2.48501062],
            dtype=numpy.float32)

        fit_y_ls = ltsFit(x_ls, y_ls).fit_y_ls
        import pylab
        pylab.plot(x_ls, y_ls, '.')
        pylab.plot(x_ls, fit_y_ls, '.')
        pylab.legend(['raw data','fitted'])
    """
    import ROOT
    try:
        # 2010-5-31 old version (5.18.0) doesn't have IgnoreCommandLineOptions.
        ROOT.PyConfig.IgnoreCommandLineOptions = True
    except Exception:
        pass
    try:
        # 2010-5-31 disable .StartGuiThread
        ROOT.PyConfig.StartGuiThread = 0
    except Exception:
        pass
    # to avoid interactive mode (drawing canvas and etc.)
    ROOT.gROOT.SetBatch(True)
    from ROOT import TFormula, TF1, TGraph
    import numpy

    # [0]+[1]*x is essentially same as pol1 but option rob in Fit() only works with pol1.
    lm = TF1('lm', 'pol1', startX, stopX)

    # ROOT is very dtype-sensitive. numpy.float32 won't work.
    # numpy.float64 replaces the numpy.float alias, which NumPy has removed.
    if not (hasattr(x_ls, 'dtype') and x_ls.dtype == numpy.float64):
        sys.stderr.write('converting x_ls')
        x_ls = numpy.array(x_ls, dtype=numpy.float64)
        sys.stderr.write(".\n")
    if not (hasattr(y_ls, 'dtype') and y_ls.dtype == numpy.float64):
        sys.stderr.write('converting y_ls')
        y_ls = numpy.array(y_ls, dtype=numpy.float64)
        sys.stderr.write(".\n")

    gr = TGraph(len(x_ls), x_ls, y_ls)
    gr.Fit(lm, "+rob=%s" % fractionUsed)
    fit = gr.GetFunction('lm')
    chiSquare = fit.GetChisquare()
    fit_y_ls = [fit.Eval(x) for x in x_ls]
    from utils import PassingData
    return PassingData(fit_y_ls=fit_y_ls, chiSquare=chiSquare)
def parseOneVCFRow(row, col_name2index, col_index_individual_name_ls, sample_id2index, minDepth=1,
        dataEntryType=1):
    """
    Parse one VCF data line into a complete representation of one locus.

    2014.01.08 fix a bug that skips calls and shortens data_row.
    2012.9.6 turn pos into integer
    2012.5.10 complete representation of one locus
    2012.1.17 common snippet split out of VCFFile & VCFRecord

    Arguments:
        row: a list of input columns from one VCF file line
        col_name2index: dictionary, must contain 'REF', 'ALT' and 'FORMAT'
        col_index_individual_name_ls: list of (column index, individual name)
        sample_id2index: dictionary, individual name -> index in data_row.
            Individuals not yet in it are added (the dictionary is modified).
        minDepth: if >0, calls whose DP is below it are set to 'NA'
        dataEntryType
            1: each cell is base call
            2: each cell is a dictionary {'GT': base-call, 'DP': depth}

    Returns a PassingData with chr/chromosome, pos/position, locus_id, quality,
    info, info_tag2value, refBase, altBase, alleleLs, alleleNumber2Base,
    genotypeCall2Count, data_row, format, filter, vcf_locus_id,
    format_column_name2index and format_column_ls.

    NOTE(review): indentation and a few expressions were lost in this copy of the
    file. The genotype pattern, the group() lookups and the nesting of the depth
    handling / data_row assignment are reconstructions; confirm against the
    original before relying on them.
    """
    import re

    # NOTE(review): reconstructed pattern for a diploid call, e.g. "0/1", "1|1", "./1".
    # Each allele is either a number or "." because those are the keys of
    # alleleNumber2Base below.
    diploidGenotypePattern = re.compile(r'(\d+|\.)[|/](\d+|\.)')

    chromosome = row[0]
    pos = int(row[1])   # 2012.9.6 turn pos into integer
    vcf_locus_id = row[2]
    quality = row[5]
    filter_value = row[6]
    info = row[7]
    format_value = row[8]

    info_tag2value = {}
    for info_entry in info.split(';'):
        try:
            tag, value = info_entry.split('=')
        except ValueError:
            # entry is a flag without "=" (e.g. "DS") or holds more than one "="
            continue
        info_tag2value[tag] = value

    locus_id = (chromosome, pos)
    refBase = row[col_name2index['REF']]
    altBase = row[col_name2index['ALT']]
    altBaseLs = altBase.split(',')  # altBase could be just "C" or "C,G" (multi-nucleotide)
    alleleLs = [refBase] + altBaseLs
    alleleNumber2Base = {'.': 'NA'}
    for alleleNumber, base in enumerate(alleleLs):
        alleleNumber2Base[repr(alleleNumber)] = base

    format_column = row[col_name2index['FORMAT']]
    format_column_ls = format_column.split(':')
    format_column_name2index = getColName2IndexFromHeader(format_column_ls)
    # the FORMAT layout is the same for every individual on this line
    genotype_call_index = format_column_name2index.get('GT')
    depth_index = format_column_name2index.get('DP')

    # extra 1 for the ref
    if dataEntryType == 1:
        data_row = ['NA'] * (len(col_index_individual_name_ls) + 1)
        data_row[0] = refBase
    else:
        data_row = [None] * (len(col_index_individual_name_ls) + 1)
        data_row[0] = {'GT': refBase, 'DP': -1}

    genotypeCall2Count = {}
    for individual_col_index, individual_name in col_index_individual_name_ls:
        if individual_name not in sample_id2index:
            sample_id2index[individual_name] = len(sample_id2index)

        genotype_data_ls = row[individual_col_index].split(':')
        genotypeCallInBase = 'NA'
        callData = {}
        if genotype_call_index is not None:
            if len(genotype_data_ls) > genotype_call_index:
                genotype_call = genotype_data_ls[genotype_call_index]
            else:
                genotype_call = './.'   # missing

            if genotype_call not in ('./.', '.', '.|.'):    # not missing data
                patternSearchResult = diploidGenotypePattern.search(genotype_call)
                if patternSearchResult:
                    allele1 = alleleNumber2Base[patternSearchResult.group(1)]
                    allele2 = alleleNumber2Base[patternSearchResult.group(2)]
                    if allele1 != 'N' and allele2 != 'N':
                        genotypeCallInBase = '%s%s' % (allele1, allele2)

            if depth_index is not None:
                if len(genotype_data_ls) > depth_index:
                    depth = genotype_data_ls[depth_index]
                else:
                    depth = '.'     # missing DP
                if depth == '.':    # this means depth=0
                    depth = 0
                else:
                    depth = int(depth)
                if minDepth > 0 and depth < minDepth:
                    # no read. samtools would still assign ref/ref to this individual
                    genotypeCallInBase = 'NA'   # set it to missing
                callData['DP'] = depth

        # every individual gets a cell, so data_row is never shortened
        col_index = sample_id2index.get(individual_name)
        if dataEntryType == 1:
            data_row[col_index] = genotypeCallInBase
        else:
            callData['GT'] = genotypeCallInBase
            data_row[col_index] = callData
        if genotypeCallInBase != 'NA':
            genotypeCall2Count[genotypeCallInBase] = \
                genotypeCall2Count.get(genotypeCallInBase, 0) + 1

    return PassingData(chr=chromosome, chromosome=chromosome, pos=pos, position=pos,
        locus_id=locus_id, quality=quality, info_tag2value=info_tag2value,
        refBase=refBase, altBase=altBase, alleleLs=alleleLs,
        alleleNumber2Base=alleleNumber2Base, genotypeCall2Count=genotypeCall2Count,
        data_row=data_row, info=info, format=format_value, filter=filter_value,
        vcf_locus_id=vcf_locus_id, format_column_name2index=format_column_name2index,
        format_column_ls=format_column_ls)