def __ssBonds(self, model, cutoff=4.):
    """
    Find pairs of residues whose SG atoms lie closer than a cutoff.

    @param model: model to search
    @type  model: PDBModel
    @param cutoff: maximal S-S distance counted as a bond (default: 4.0)
    @type  cutoff: float

    @return: residue number pairs, one tuple per SG-SG contact
    @rtype: [(int, int)]
    """
    sulfurs = model.compress(model.mask(['SG']))

    ## fewer than two SG atoms cannot form any pair
    if len(sulfurs) < 2:
        return []

    distances = MU.pairwiseDistances(sulfurs.xyz, sulfurs.xyz)
    is_close = N.less(distances, cutoff)
    n_atoms = len(is_close)

    bonds = []
    ## visit only the upper triangle so that each pair is reported once
    for first in range(n_atoms):
        for second in range(first + 1, n_atoms):
            if is_close[first, second]:
                bonds.append((sulfurs.atoms['residue_number'][first],
                              sulfurs.atoms['residue_number'][second]))

    return bonds
def __checkProfileIntegrity(self, profile, upperLimit=1.0, lowerLimit=-1.0):
    """
    Zero out profile values that fall outside [lowerLimit, upperLimit].
    SurfaceRacer sometimes reports incorrect curvature values for
    individual atoms; every such value is reported on stdout and replaced
    by 0. The profile is modified in place.

    @param profile: profile values (indexable and comparable element-wise)
    @type  profile: [float]
    @param upperLimit: largest value considered valid (default: 1.0)
    @type  upperLimit: float
    @param lowerLimit: smallest value considered valid (default: -1.0)
    @type  lowerLimit: float

    @return: the same profile with out-of-range values set to 0
    @rtype: [float]
    """
    too_high = N.greater(profile, upperLimit)
    too_low = N.less(profile, lowerLimit)

    ## a value cannot be both too high and too low, so the sum acts as OR
    invalid = too_high + too_low

    for pos in N.nonzero(invalid):
        print('WARNING! Profile value %.2f set to O\n' % profile[pos])
        profile[pos] = 0

    return profile
def chipdata(self, data):
    """Receive structured chip data, convert it to masked arrays and
    update the two info labels with size / missing-value statistics.

    Input data: [(dirname0, [et0, et1, ...]), ...] or None.

    Side effects visible here: resets self.numRowsMissingChipData and
    self._chipdataMA, stores data in self._chipdata, writes to self.infoc
    and self.infod, calls self.setGuiCommonExpChip() and, when
    self.commitOnChange is set, self.senddata(2).
    """
    self.numRowsMissingChipData = 0
    self._chipdataMA = []
    if data != None:
        self._chipdata = data
        numValsAll = 0
        numValsNonMasked = 0
        numFiles = 0
        numExamplesList = []
        attribDict = {}
        numColMissing = 0
        for (name, etList) in data:
            numFiles += len(etList)
            # one (name, [masked arrays]) entry per input directory
            self._chipdataMA.append((name, []))
            for et in etList:
                # collect the union of attributes (keyed by name) over all tables
                attribDict.update(
                    dict(
                        zip(map(lambda x: x.name, et.domain.attributes),
                            et.domain.attributes)))
                numExamplesList.append(len(et))
                etm = et.toNumpyMA("a")[0]
                colNonMissingInd = Numeric.compress(
                    Numeric.not_equal(MA.count(etm, 0), 0),
                    Numeric.arange(etm.shape[1])
                )  # indices of columns that are not completely missing
                numColMissing += etm.shape[1] - colNonMissingInd.shape[0]
                # NOTE(review): the per-row count is taken over the non-missing
                # columns only but compared against the full column count, so a
                # single completely missing column makes every row count as
                # partially missing -- confirm this is intended.
                self.numRowsMissingChipData += int(
                    Numeric.add.reduce(
                        Numeric.less(
                            MA.count(etm.take(colNonMissingInd, 1), 1),
                            etm.shape[1])))
                numValsAll += int(Numeric.multiply.reduce(etm.shape))
                numValsNonMasked += int(MA.count(etm))
                self._chipdataMA[-1][1].append(etm)
        # info text
        # NOTE(review): numExamplesList[0] raises IndexError when data holds
        # no example tables at all (e.g. data == []).
        self.infoc.setText(
            "Structured Data: %i data files with %i profiles on %i points" %
            (numFiles, numExamplesList[0], len(attribDict)))
        numTotalMissing = numValsAll - numValsNonMasked
        if numTotalMissing > 0:
            # NOTE(review): the two prints below look like leftover debug output
            print numTotalMissing, numColMissing, self.numRowsMissingChipData
            print type(numTotalMissing), type(numColMissing), type(
                self.numRowsMissingChipData)
            # ["", "s"][cond] picks the plural suffix
            self.infod.setText(
                "missing %i values, %i column%s completely, %i row%s partially"
                % (numTotalMissing, numColMissing, [
                    "", "s"
                ][numColMissing != 1], self.numRowsMissingChipData,
                   ["", "s"][self.numRowsMissingChipData != 1]))
        else:
            self.infod.setText("")
    else:
        self._chipdata = None
        self.infoc.setText("No structured data on input")
        self.infod.setText("")
    # refresh the GUI and optionally propagate, regardless of the input
    self.setGuiCommonExpChip()
    if self.commitOnChange:
        self.senddata(2)
def __checkProfileIntegrity( self, profile, upperLimit=1.0, lowerLimit=-1.0):
    """
    Replace out-of-range profile values by 0 (in place).
    In some cases SurfaceRacer generates incorrect curvature values for
    some atoms; each value outside [lowerLimit, upperLimit] is announced
    on stdout and then overwritten.

    @param profile: profile values (indexable and comparable element-wise)
    @type  profile: [float]
    @param upperLimit: upper limit for a valid value (default: 1.0)
    @type  upperLimit: float
    @param lowerLimit: lower limit for a valid value (default: -1.0)
    @type  lowerLimit: float

    @return: profile with inspected values
    @rtype: [float]
    """
    above = N.greater( profile, upperLimit )
    below = N.less( profile, lowerLimit )

    ## the two conditions are mutually exclusive -> addition works as OR
    outside = above + below

    for position in N.nonzero( outside ):
        print( 'WARNING! Profile value %.2f set to O\n' % profile[position] )
        profile[position] = 0

    return profile
def __ssBonds( self, model, cutoff=4. ):
    """
    Identify disulfide bonds from SG-SG distances.

    @param model: model
    @type  model: PDBModel
    @param cutoff: distance cutoff for S-S distance (default: 4.0)
    @type  cutoff: float

    @return: list with numbers of residue pairs forming S-S
    @rtype: [(int, int)]
    """
    sg_atoms = model.compress( model.mask( ['SG'] ) )

    ## a single SG atom (or none) cannot be bonded to anything
    if len( sg_atoms ) < 2:
        return []

    pair_dist = MU.pairwiseDistances( sg_atoms.xyz, sg_atoms.xyz )
    contact = N.less( pair_dist, cutoff )
    size = len( contact )

    result = []
    ## upper triangle only: each pair once, no self-pairs
    for row in range( size ):
        for col in range( row + 1, size ):
            if contact[row, col]:
                pair = ( sg_atoms.atoms['residue_number'][row],
                         sg_atoms.atoms['residue_number'][col] )
                result.append( pair )

    return result
def takeFrames( self, indices ):
    """
    Create a new trajectory holding only the requested frames.
    Indices at or beyond the number of frames are silently dropped.

    @param indices: positions to take
    @type  indices: [int]

    @return: copy of this Trajectory (fewer frames, semi-deep copy of ref)
    @rtype: Trajectory
    """
    ## discard out-of-bound indices
    n_frames = len( self.frames )
    valid = N.compress( N.less( indices, n_frames ), indices )

    result = self.__class__()

    ## copying the coordinates can take a while for large frames
    result.frames = N.take( self.frames, valid, 0 )

    ## semi-deep copy of the reference model (take every atom)
    every_atom = range( self.ref.lenAtoms() )
    result.setRef( self.ref.take( every_atom ) )

    if self.frameNames != None:
        taken_names = N.take( self.frameNames, valid, 0 )
        result.frameNames = map( ''.join, taken_names.tolist() )

    result.pc = self.__takePca( valid )
    result.profiles = self.profiles.take( valid )
    result.resIndex = self.resIndex

    return result
def takeFrames(self, indices):
    """
    Return a reduced copy of the trajectory with the given frames only.
    Positions that do not exist (index >= number of frames) are ignored.

    @param indices: positions to take
    @type  indices: [int]

    @return: copy of this Trajectory (fewer frames, semi-deep copy of ref)
    @rtype: Trajectory
    """
    ## keep only indices that point to an existing frame
    in_range = N.less(indices, len(self.frames))
    kept = N.compress(in_range, indices)

    clone = self.__class__()

    ## this step takes some time for large frames
    clone.frames = N.take(self.frames, kept, 0)

    ## semi-deep copy of reference model
    clone.setRef(self.ref.take(range(self.ref.lenAtoms())))

    if self.frameNames != None:
        name_rows = N.take(self.frameNames, kept, 0).tolist()
        clone.frameNames = map(''.join, name_rows)

    clone.pc = self.__takePca(kept)
    clone.profiles = self.profiles.take(kept)
    clone.resIndex = self.resIndex

    return clone
def addDensity( self, radius=6, minasa=None, profName='density' ):
    """
    Count the heavy atoms found within a sphere around each atom and store
    the counts as an atom profile. If minasa is given, only atoms selected
    by the 'relAS' profile mask are evaluated; all other atoms receive the
    default value -1.

    @param radius: sphere radius in Angstrom
    @type  radius: float
    @param minasa: relative exposed surface - 0 to 100%
    @type  minasa: float
    @param profName: name of the atom profile to create
    @type  profName: str
    """
    heavy_mask = self.m.maskHeavy()
    heavy_xyz = N.compress( heavy_mask, self.m.getXyz(), 0 )

    ## accessible surface is calculated on demand
    if minasa and self.m.profile( 'relAS', 0 ) == 0:
        self.addASA()

    if minasa:
        selected = self.m.profile2mask( 'relAS', minasa )
    else:
        selected = N.ones( self.m.lenAtoms() )

    ## squared distances are compared, which avoids taking square roots
    radius_sq = radius**2

    counts = []
    for atom in N.nonzero( selected ):
        sq_dist = N.sum( ( heavy_xyz - self.m.xyz[atom] )**2, 1 )
        ## -1 removes the zero-distance hit of the atom with itself
        counts.append( N.sum( N.less( sq_dist, radius_sq ) ) - 1 )

    self.m.atoms.set( profName, counts, selected, default=-1,
                      comment='atom density radius %3.1fA' % radius,
                      version= T.dateString() + ' ' + self.version() )
def __atomContacts(self, cutoff, rec_mask, lig_mask, cache):
    """
    Receptor-ligand atom pairs closer than cutoff, after applying both
    masks. A distance matrix cached in self.pw_dist is re-used as long as
    its shape fits the two masks.

    @param cutoff: cutoff for B{atom-atom} contact in Angstrom
    @type  cutoff: float
    @param rec_mask: atom mask
    @type  rec_mask: [1|0]
    @param lig_mask: atom mask
    @type  lig_mask: [1|0]
    @param cache: cache pairwise atom distance matrix
    @type  cache: 1|0

    @return: atom contact matrix, array sum_rec_mask x sum_lig_mask
    @rtype: array
    """
    ## coordinates of all receptor and all ligand atoms
    rec_xyz = self.rec().getXyz()
    lig_xyz = self.lig().getXyz()

    distances = getattr(self, 'pw_dist', None)

    ## the cache is unusable if missing or calculated for other masks
    outdated = distances is None
    if not outdated:
        outdated = N.shape(distances) != (N.sum(rec_mask), N.sum(lig_mask))

    if outdated:
        rec_selected = N.compress(rec_mask, rec_xyz, 0)
        lig_selected = N.compress(lig_mask, lig_xyz, 0)
        distances = self.__pairwiseDistances(rec_selected, lig_selected)

    if cache:
        self.pw_dist = distances

    ## 1 (distance < cutoff) or 0 -> n_atoms_rec x n_atoms_lig
    return N.less(distances, cutoff)
def __call__(self, **params_to_override):
    """
    Draw a random-dot image twice the size of the sheet, cut out a
    sheet-sized window displaced by the requested disparity, apply offset
    and scale, run the output functions on it and return it.
    """
    p = ParamOverrides(self, params_to_override)

    xsize, ysize = SheetCoordinateSystem(p.bounds, p.xdensity, p.ydensity).shape
    xsize = int(round(xsize))
    ysize = int(round(ysize))

    # Disparities and dot size are all expressed relative to xsize
    xdisparity = int(round(xsize * p.xdisparity))
    ydisparity = int(round(xsize * p.ydisparity))
    dotsize = int(round(xsize * p.dotsize))

    bigxsize = 2 * xsize
    bigysize = 2 * ysize
    padded_width = bigxsize + 2 * dotsize
    padded_height = bigysize + 2 * dotsize
    ndots = int(round(p.dotdensity * padded_width * padded_height /
                      min(dotsize, xsize) / min(dotsize, ysize)))
    halfdot = floor(dotsize // 2)

    # Random colour and position of each square dot; the generator is
    # re-seeded before every draw so that each quantity is reproducible
    seed = p.random_seed
    random_array.seed(seed * 12, seed * 99)
    dot_color = where(random_array.random(ndots) >= 0.5, 1.0, -1.0)

    random_array.seed(seed * 122, seed * 799)
    xpos = floor(random_array.random(ndots) * padded_width) - halfdot

    random_array.seed(seed * 1243, seed * 9349)
    ypos = floor(random_array.random(ndots) * padded_height) - halfdot

    # Dot boundaries, cropped to the big image (0,0) .. (bigxsize,bigysize)
    left = xpos.astype(Int)
    left = choose(less(left, 0), (left, 0))
    top = ypos.astype(Int)
    top = choose(less(top, 0), (top, 0))
    right = (xpos + (dotsize - 1)).astype(Int)
    right = choose(greater(right, bigxsize), (right, bigxsize))
    bottom = (ypos + (dotsize - 1)).astype(Int)
    bottom = choose(greater(bottom, bigysize), (bottom, bigysize))

    # Paint the dots onto a blank background; later dots overwrite earlier
    bigimage = zeros((bigysize, bigxsize))
    for dot in range(ndots):
        bigimage[top[dot]:bottom[dot] + 1, left[dot]:right[dot] + 1] = dot_color[dot]

    # Sheet-sized window around the centre, shifted by the disparity
    row_start = (ysize // 2) + ydisparity
    row_stop = (3 * ysize // 2) + ydisparity
    col_start = (xsize // 2) + xdisparity
    col_stop = (3 * xsize // 2) + xdisparity
    result = p.offset + p.scale * bigimage[row_start:row_stop, col_start:col_stop]

    for output_fn in p.output_fns:
        output_fn(result)

    return result
def smooth_rectangle(x, y, rec_w, rec_h, gaussian_width_x, gaussian_width_y):
    """
    Rectangle with a solid central region and Gaussian fall-off outside.

    Inside the rec_w x rec_h rectangle (centred on the origin) the value
    is 1. Outside, the value decays as a Gaussian of the distance to the
    rectangle edge, separately per axis; a zero Gaussian width gives a
    hard edge on that axis. The result is the minimum over both axes.
    """
    # Signed distance to the rectangle edge (negative inside)
    edge_dist_x = abs(x) - rec_w / 2.0
    edge_dist_y = abs(y) - rec_h / 2.0

    inside_x = less(edge_dist_x, 0.0)
    inside_y = less(edge_dist_y, 0.0)

    variance_x = gaussian_width_x * gaussian_width_x
    variance_y = gaussian_width_y * gaussian_width_y

    with float_error_ignore():
        # A zero width means no fall-off at all: use an all-zero array
        if variance_x == 0.0:
            falloff_x = x * 0.0
        else:
            falloff_x = exp(divide(-edge_dist_x * edge_dist_x, 2 * variance_x))

        if variance_y == 0.0:
            falloff_y = y * 0.0
        else:
            falloff_y = exp(divide(-edge_dist_y * edge_dist_y, 2 * variance_y))

    profile_x = maximum(inside_x, falloff_x)
    profile_y = maximum(inside_y, falloff_y)
    return minimum(profile_x, profile_y)
def smooth_rectangle(x, y, rec_w, rec_h, gaussian_width_x, gaussian_width_y):
    """
    Rectangle with a solid central region, then Gaussian fall-off at the edges.

    Each axis is handled independently: the value is 1 inside the
    rectangle and a Gaussian of the distance to the edge outside of it
    (0 if the Gaussian width for that axis is 0). The two per-axis
    profiles are combined by taking their minimum.
    """
    # Distance beyond the rectangle edge; negative values lie inside
    beyond_x = abs(x) - rec_w / 2.0
    beyond_y = abs(y) - rec_h / 2.0

    box_mask_x = less(beyond_x, 0.0)
    box_mask_y = less(beyond_y, 0.0)

    sigma_x_squared = gaussian_width_x * gaussian_width_x
    sigma_y_squared = gaussian_width_y * gaussian_width_y

    with float_error_ignore():
        if sigma_x_squared == 0.0:
            # hard edge: nothing outside the box
            tail_x = x * 0.0
        else:
            tail_x = exp(divide(-beyond_x * beyond_x, 2 * sigma_x_squared))

        if sigma_y_squared == 0.0:
            tail_y = y * 0.0
        else:
            tail_y = exp(divide(-beyond_y * beyond_y, 2 * sigma_y_squared))

    return minimum(maximum(box_mask_x, tail_x),
                   maximum(box_mask_y, tail_y))
def Translate(self, t):
    """Move the [min, max] window along x, y and z by a fixed step of 2.

    Only the sign of each component of t is used: a negative component
    moves the window by -2 on that axis, anything else by +2. A window
    pushed below 0 or beyond the volume size is shifted back so that it
    ends at the boundary. Finally self.update() is called.
    """
    step_x, step_y, step_z = Numeric.where(Numeric.less(t, 0), -2, 2)

    def shift_and_clamp(low, high, step, size):
        # Move the interval, then push it back inside [0, size]
        low = low + step
        high = high + step
        if low < 0:
            high = high - low
            low = 0
        if high > size:
            low = low - (high - size)
            high = size
        return low, high

    x_low, x_high = self.xmin, self.xmax
    y_low, y_high = self.ymin, self.ymax
    z_low, z_high = self.zmin, self.zmax
    size_x, size_y, size_z = self.volSize

    x_low, x_high = shift_and_clamp(x_low, x_high, step_x, size_x)
    y_low, y_high = shift_and_clamp(y_low, y_high, step_y, size_y)
    z_low, z_high = shift_and_clamp(z_low, z_high, step_z, size_z)

    # Write back only after all three axes have been computed
    self.xmin = x_low
    self.xmax = x_high
    self.ymin = y_low
    self.ymax = y_high
    self.zmin = z_low
    self.zmax = z_high
    self.update()
def filter(self, dlg):
    """Count contacts for the docked poses stored in a docking log file.

    The file is split into poses: every line starting with "DOCKED:" is
    collected, a line starting with "DOCKED: MODEL" opens a new pose and a
    line starting with "_" closes it. For each pose self.setup_ligand()
    is called and the squared distances between self.bigC[:self.lenK] and
    self.smallM are compared against the squared sum of
    self.bigRC[:self.lenK] and self.keyRadii; pairs at exactly zero
    distance are not counted.

    NOTE(review): the nesting of the final ``break`` was reconstructed as
    "stop at the first pose that has contacts" -- confirm against the
    upstream source.

    dlg -- path of the docking log file
    Returns the number of contacts of the first pose that has any, or 0
    if no pose has contacts or the log contains no complete pose.
    """
    # try/finally guarantees that the handle is released if reading fails
    fptr = open(dlg)
    try:
        dlg_lines = fptr.readlines()
    finally:
        fptr.close()

    # STEP 1: accumulate the lines of the various poses
    all_models = []
    model_lines = []
    in_model = False
    for ll in dlg_lines:
        if ll.startswith("DOCKED:"):
            # a MODEL record starts a new pose
            if ll.startswith("DOCKED: MODEL"):
                model_lines = []
                in_model = True
            model_lines.append(ll)
        if ll.startswith("_") and in_model:
            all_models.append(model_lines)
            model_lines = []
            in_model = False

    # STEP 2: evaluate pose after pose.
    # Defined up front so that a log without any pose yields 0 instead of
    # failing with UnboundLocalError at the return statement.
    num_contacts = 0
    for model_lines in all_models:
        # initialize this ligand
        self.setup_ligand(model_lines)
        bigR = self.bigRC[:self.lenK]
        bigM = self.bigC[:self.lenK]
        cutoff = bigR + self.keyRadii
        d = bigM - self.smallM
        dSQMAT = Numeric.sum(d * d, 2)
        cutoffSQMAT = cutoff * cutoff
        # in contact: closer than the cutoff but not the very same position
        ansMat = Numeric.logical_and(Numeric.less(dSQMAT, cutoffSQMAT),
                                     Numeric.not_equal(dSQMAT, 0.))
        rowIndices = Numeric.nonzero(Numeric.sum(ansMat, 1))
        num_contacts = 0
        for ind in rowIndices:
            for j in ansMat[ind]:
                if j:
                    num_contacts += 1
        if num_contacts > 0:
            break
    return num_contacts
def filter(self, dlg):
    """Count contacts for the docked poses stored in a docking log file.

    The file is split into poses: every line starting with "DOCKED:" is
    collected, a line starting with "DOCKED: MODEL" opens a new pose and a
    line starting with "_" closes it. For each pose self.setup_ligand()
    is called and the squared distances between self.bigC[:self.lenK] and
    self.smallM are compared against the squared sum of
    self.bigRC[:self.lenK] and self.keyRadii; pairs at exactly zero
    distance are not counted.

    NOTE(review): the nesting of the final ``break`` was reconstructed as
    "stop at the first pose that has contacts" -- confirm against the
    upstream source.

    dlg -- path of the docking log file
    Returns the number of contacts of the first pose that has any, or 0
    if no pose has contacts or the log contains no complete pose.
    """
    # try/finally guarantees that the handle is released if reading fails
    fptr = open(dlg)
    try:
        dlg_lines = fptr.readlines()
    finally:
        fptr.close()

    #STEP 1: accumulate lines of various poses
    all_models = []
    model_lines = []
    in_model = False
    for ll in dlg_lines:
        if ll.startswith("DOCKED:"):
            #check for a new model
            if ll.startswith("DOCKED: MODEL"):
                model_lines = []
                in_model = True
            model_lines.append(ll)
        if ll.startswith("_") and in_model:
            all_models.append(model_lines)
            model_lines = []
            in_model = False

    #STEP 2: loop over the models.
    # Defined up front so that a log without any pose yields 0 instead of
    # failing with UnboundLocalError at the return statement.
    num_contacts = 0
    for model_lines in all_models:
        #initialize this ligand
        self.setup_ligand(model_lines)
        bigR = self.bigRC[:self.lenK]
        bigM = self.bigC[:self.lenK]
        cutoff = bigR + self.keyRadii
        d = bigM - self.smallM
        dSQMAT = Numeric.sum(d*d, 2)
        cutoffSQMAT = cutoff*cutoff
        # in contact: closer than the cutoff but not the very same position
        ansMat = Numeric.logical_and(Numeric.less(dSQMAT, cutoffSQMAT),
                                     Numeric.not_equal(dSQMAT, 0.))
        rowIndices = Numeric.nonzero(Numeric.sum(ansMat, 1))
        num_contacts = 0
        for ind in rowIndices:
            for j in ansMat[ind]:
                if j:
                    num_contacts += 1
        if num_contacts > 0:
            break
    return num_contacts
def updateSelectorInfos(self, selectorIdx=None):
    """updates the number of examples that match individual selectors;
    if selectorIdx is given, updates only the corresponding info.

    selectorIdx -- 0, 1 or 2 (matching alphaA, alphaB, alphaI), or None
                   to refresh all three labels.

    A label shows ' (no examples)' when its alpha cannot be converted to
    float, when no p-values are available or when its selector box is
    disabled; otherwise it shows how many p-values are below alpha.
    """
    # Compare against None explicitly: index 0 is a valid selector and must
    # not be mistaken for "no index given".
    if selectorIdx is None:
        selectorInd = range(3)
    else:
        selectorInd = [selectorIdx]
    alphas = [self.alphaA, self.alphaB, self.alphaI]
    boxSelectors = [self.boxSelectorA, self.boxSelectorB, self.boxSelectorI]
    for si in selectorInd:
        try:
            alpha = float(alphas[si])
            ps = self.ps[si]
        except ValueError:
            # alpha is not a number (e.g. half-typed input)
            alpha = None
            ps = None
        # "ps != None" is kept as written: ps may be an array, for which
        # the original truth-value semantics are preserved this way.
        if ps != None and alpha is not None and boxSelectors[si].isEnabled():
            numSelected = Numeric.add.reduce(Numeric.less(self.ps[si], alpha))
            # ['', 's'][...] picks the plural suffix
            self.lblNumGenes[si].setText('  (%d example%s)' % (numSelected, ['', 's'][int(numSelected != 1)]))
        else:
            self.lblNumGenes[si].setText('  (no examples)')
def process_data(data, referencing=None, n_points=200, exposure_cutoff=0.05, atom_type='H', exclude_entries=(), molType='protein'):
    """Split the statistics returned by process_secondary into classes of
    increasing exposure.

    For every key the (shifts, exposure) pair is sorted by exposure. If
    enough points are available, the values below exposure_cutoff form
    class 0; the remainder is cut into consecutive chunks of roughly
    n_points values. One progress line is printed per class.

    Returns (S, bounds): S maps key + (class_index,) to (shifts, exposure)
    of that class, bounds maps key to the list of exposure values
    recorded as class limits.
    """
    stats = process_secondary(data, referencing, n_points, atom_type,
                              exclude_entries=exclude_entries, molType=molType)

    ## decompose with respect to accessibility
    S = {}
    bounds = {}

    for key, (shifts, exposure) in stats.items():

        order = Numeric.argsort(exposure)
        shifts = Numeric.take(shifts, order)
        exposure = Numeric.take(exposure, order)

        limits = []

        buried = Numeric.nonzero(Numeric.less(exposure, exposure_cutoff))

        # too few points would remain for the exposed classes: take all
        if len(shifts) - len(buried) < n_points:
            buried = range(len(shifts))

        n_buried = len(buried)

        if n_buried >= n_points:
            print('%s %s %s %s' % (key, 0, n_buried, len(shifts)))
            S[key + (0,)] = (Numeric.take(shifts, buried),
                             Numeric.take(exposure, buried))
            limits.append(exposure[n_buried - 1])
            shifts = shifts[n_buried:]
            exposure = exposure[n_buried:]
            i = 1
        else:
            i = 0

        # chunk size that spreads the remaining points evenly over classes
        n_classes = len(exposure) // int(n_points)
        n = int(len(exposure) / float(max(1, n_classes))) + 1

        while len(shifts) > n:
            chunk = shifts[:n]
            print('%s %s %s %s' % (key, i, len(chunk), len(shifts)))
            S[key + (i,)] = chunk, exposure[:n]
            limits.append(exposure[n])
            shifts = shifts[n:]
            exposure = exposure[n:]
            i += 1

        # whatever is left forms the last class
        if len(shifts):
            print('%s %s %s' % (key, i, len(shifts)))
            S[key + (i,)] = shifts, exposure
            limits.append(exposure[-1])

        bounds[key] = limits

    return S, bounds
def senddata(self):
    """computes selectionList, partitions the examples and updates infoc;
    sends out selectionList and selected/other dataStructure or None;

    An example is selected when, for every selector that is switched on
    and whose box is enabled, its p-value is below the matching alpha.
    Without a data structure (or with an empty p-value array) None is
    sent on all three output channels.
    """
    if self.dataStructure and self.ps.shape[1]:
        # set selectionList
        alphas = [self.alphaA, self.alphaB, self.alphaI]
        selectors = [self.selectorA, self.selectorB, self.selectorI]
        # start with everything selected, then AND in each active criterion
        selectionList = Numeric.ones((self.numExamples,))
        boxSelectors = [self.boxSelectorA, self.boxSelectorB, self.boxSelectorI]
        for si in range(3):
            try:
                ## earlier variant of the condition:
                ## if selectors[si] and self.anovaType in [[0,1,3,4],[2,3,4],[4]][si]:
                if selectors[si] and boxSelectors[si].isEnabled():
                    selectionList = Numeric.logical_and(selectionList, Numeric.less(self.ps[si], float(alphas[si])))
            except ValueError:
                # alpha is not a valid float: this criterion is skipped
                print "Warning: cannot convert %s to float" % str(alphas[si])
                pass
        self.infoc.setText('Sending out data...')

        if self.sendProbabilities:
            # create example table with probabilities
            etProb = orange.ExampleTable(orange.Domain([orange.FloatVariable("Factor A p-val"),orange.FloatVariable("Factor B p-val"),orange.FloatVariable("Interaction p-val")]), Numeric.transpose(self.ps))
            # in etProb, convert p-val to meta attribute
            domProb = orange.Domain([])
            domProb.addmetas(dict(zip([orange.newmetaid(),orange.newmetaid(),orange.newmetaid()], etProb.domain.variables)))
            etProb = orange.ExampleTable(domProb, etProb)
        else:
            # create an etProb without attributes/metas, one row per example
            etProb = orange.ExampleTable(orange.Domain([]), Numeric.zeros((selectionList.shape[0],0)))

        # partition dataStructure and send out data
        selectionList = selectionList.tolist()
        self.send("Example Selection", (self.selectorName, selectionList))
        dataStructS = []
        dataStructN = []
        self.progressBarInit()

        # the progress bar is shared between the selected and the
        # not-selected pass when both are sent
        if self.sendNotSelectedData:
            pbStep = 50./len(self.dataStructure)
        else:
            pbStep = 100./len(self.dataStructure)

        for (dsName, etList) in self.dataStructure:
            etListS = [et.select(selectionList) for et in etList]
            for i in range(len(etList)):
                # append probabilities (if etProb not empty)
                etListS[i] = orange.ExampleTable([etListS[i], etProb.select(selectionList)])
                # add name
                etListS[i].name = etList[i].name
            dataStructS.append((dsName, etListS))
            self.progressBarAdvance(pbStep)
        self.send("Selected Structured Data", dataStructS)

        if self.sendNotSelectedData:
            for (dsName, etList) in self.dataStructure:
                etListN = [et.select(selectionList, negate=1) for et in etList]
                for i in range(len(etList)):
                    # append probabilities (if etProb not empty)
                    etListN[i] = orange.ExampleTable([etListN[i], etProb.select(selectionList, negate=1)])
                    # add name
                    etListN[i].name = etList[i].name
                dataStructN.append((dsName, etListN))
                self.progressBarAdvance(pbStep)
            self.send("Other Structured Data", dataStructN)
        else:
            self.send("Other Structured Data", None)

        self.progressBarFinished()
        # report the number of selected examples
        numExamples = Numeric.add.reduce(Numeric.greater(selectionList, 0))
        self.infoc.setText('Total of %d example%s match criteria.' % (numExamples, ['', 's'][int(numExamples!=1)]))
    else:
        # nothing to select from: clear all outputs
        self.send("Example Selection", None)
        self.send("Selected Structured Data", None)
        self.send("Other Structured Data", None)
def deNAN(a, value=0.0):
    """Return a with every NaN entry replaced by value (default 0.0)."""
    # NaN fails every ordered comparison, so it is the only kind of entry
    # that is neither below zero nor greater than or equal to zero.
    below_zero = Numeric.less(a, 0.0)
    zero_or_above = Numeric.greater_equal(a, 0.0)
    is_nan = Numeric.logical_not(below_zero + zero_or_above)
    return Numeric.where(is_nan, value, a)
def rgrd(self, dataIn, missingValueIn, missingMatch, logYes = 'yes', positionIn = None, missingValueOut = None):
    """
    PURPOSE: To perform all the tasks required to regrid the input data,
             dataIn, into the output data, dataOut, along the level
             dimension only.

    DEFINITION:

        def rgrd(self, dataIn, missingValueIn, missingMatch, logYes = 'yes',
                 positionIn = None, missingValueOut = None):

    PASSED:
        dataIn -- data to regrid. Converted to Float32 if its type code
            is not 'f'; must have at least 2 dimensions.

        missingValueIn -- the missing data value to use in setting missing
            in the mask. It is required and there are two choices:
                None     -- there is no missing data
                A number -- the value to use in the search for possible
                            missing data.
            The presence of missing data at a grid point leads to
            recording 0.0 in the mask.

        missingMatch -- the comparison scheme used in searching for
            missing data in dataIn using the value passed in as
            missingValueIn. The accepted values are:
                None      -- used if None is the entry for missingValueIn
                'equal'   -- missingValueIn is the exact value from the file
                'greater' -- the missing data value is equal to or greater
                             than missingValueIn
                'less'    -- the missing data value is equal to or less
                             than missingValueIn
            Any other value raises ValueError.

        logYes -- choose the level regrid as linear in log of level or
            linear in level. Set to 'yes' for log. Anything else is
            linear in level.

        positionIn -- a tuple with the numerical position of the
            dimensions in C or Python order specified in the sequence
            longitude, latitude, level and time. Longitude, latitude and
            level are required. If time is missing submit None in its
            slot in the tuple. Notice that the length of the tuple is
            always four.

            Explicitly, in terms of the shape of dataIn as returned by
            Python's shape function

                positionIn[0] contains the position of longitude in dataIn
                positionIn[1] contains the position of latitude in dataIn
                positionIn[2] contains the position of level in dataIn or None
                positionIn[3] contains the position of time in dataIn or None

            As examples:
                If the C order shape of 4D data is
                    (number of longitudes, number of times,
                     number of levels, number of latitudes)
                submit
                    (0, 3, 2, 1)

                If the C order shape of 3D data is
                    (number of longitudes, number of times,
                     number of latitudes)
                submit
                    (0, 2, 1, None)

            Send in None if the shape is a subset of (time, level,
            latitude, longitude) which is evaluated as follows:
                3D -- code assumes (2,1,0,None)
                4D -- code assumes (3,2,1,0)

        missingValueOut -- the value for the missing data used in writing
            the output data. When given, output values matching the
            missingMatch scheme are replaced by it.
            NOTE(review): earlier documentation promised a fallback to
            missingValueIn or 1.0e20 when this is None; in the code below
            nothing is replaced in that case.

    RETURNED: dataOut -- the regridded data

    RAISES: TypeError  -- dataIn has no length or fewer than 2 dimensions,
                          missingValueIn is neither None nor a number, or
                          positionIn does not have length 4
            ValueError -- invalid missingMatch, or the level size of
                          dataIn differs from self.nlevi

    USAGE:

        Example 1. To regrid dataIn into dataOut using all the defaults
                   where None, None signifies no missing data.

            dataOut = x.rgrd(dataIn, None, None)

        Example 2. To regrid dataIn into dataOut using 1.0e20 and greater
                   as the missing data

            dataOut = x.rgrd(dataIn, 1.e20, 'greater')
    """

    # check the required input -- dataIn, missingValueIn and missingMatch

    # make sure that dataIn is an array
    try:
        z = len(dataIn)
    except TypeError:
        sendmsg('Error in calling the rgrd method -- dataIn must be an array')
        raise TypeError

    # check the missingValueIn pass
    if missingValueIn != None:
        try:
            z = abs(missingValueIn)
        except TypeError:
            sendmsg('Error in calling the rgrd method -- missingvalueIn must be None or a number. Now it is ', missingValueIn)
            raise TypeError

    # check the missingMatch pass
    missingPossibilities = ['greater', 'equal', 'less', None]
    if missingMatch not in missingPossibilities:
        msg = 'Error in missingMatch -- it must be None or the string greater, equal, or less. Now it is '
        sendmsg(msg, missingMatch)
        raise ValueError

    # --- Check data type and change to float if necessary ----
    if dataIn.dtype.char != 'f':
        dataIn = dataIn.astype(Numeric.Float32)

    # shape of the data as passed in, i.e. before any transposition;
    # positionIn indexes into this shape
    dataShape = dataIn.shape
    numberDim = len(dataShape)

    if numberDim < 2:
        msg = 'Error in call to rgrd -- data must have at least 2 dimensions'
        sendmsg(msg)
        raise TypeError

    # --- evaluate positionIn ----

    # --- make standard positionIn as a check----
    positionList =[]
    for n in range(numberDim):                 # insert a sequence of numbers
        positionList.append(n)
    positionList.reverse()

    for n in range(numberDim, 4):              # fill end of list with Nones
        positionList.append(None)

    positionCheck = tuple(positionList)

    standardPosition = 0                       # transpose required

    if positionIn == None:                     # construct the default positionIn tuple
        positionIn = positionCheck
        standardPosition = 1                   # no need for a transpose with this data
    else:
        if positionIn == positionCheck:        # compare to the standard
            standardPosition = 1               # no need for a transpose with this data

    if len(positionIn) != 4:
        msg = 'Error in call to rgrd -- positionIn must be a tuple of length 4'
        sendmsg(msg)
        raise TypeError

    if standardPosition == 0:                  # transpose data to the standard order (t,z,y,x)

        newOrder, inverseOrder = checkorder(positionIn)

        dataIn = Numeric.transpose(dataIn, newOrder)                    # transpose data to standard order (t,z,y,x)
        dataIn = Numeric.array(dataIn.astype(Numeric.Float32), Numeric.Float32)       # make contiguous

    # set dimension sizes and check for consistency

    if positionIn[0] != None:
        self.nlon = (dataShape[ positionIn[0] ])
    else:
        self.nlon = 0
    if positionIn[1] != None:
        self.nlat = (dataShape[ positionIn[1] ])
    else:
        self.nlat = 0
    if positionIn[2] != None:
        if self.nlevi != (dataShape[ positionIn[2] ]):
            msg = 'Level size is inconsistent with input data'
            sendmsg(msg)
            raise ValueError
    if positionIn[3] != None:
        self.ntime = (dataShape[ positionIn[3] ])
    else:
        self.ntime = 0

    # allocate memory for dataOut -- the array with new number of levels
    # (the first dimension whose size equals nlevi is taken as the level axis)
    outList = list(dataIn.shape)

    for i in range(len(outList)):
        if outList[i] == self.nlevi:
            outList[i] = self.nlevo
            break

    dataOut = Numeric.zeros(tuple(outList), Numeric.Float32)                      # memory for aout

    # None cannot be handed to the compiled routine: substitute placeholders
    if missingMatch == None:                                                # if no missing do not pass None
        missingMatch = 'none'

    if missingValueIn == None:                                              # if no missing do not pass None
        missingValueIn = 1.333e33

    if logYes != 'yes':
        logYes = 'no'

    levIn = self.axisIn[:].astype(Numeric.Float64)
    levOut = self.axisOut[:].astype(Numeric.Float64)
    _regrid.rgdpressure(self.nlevi, self.nlevo, self.nlat, self.nlon, self.ntime, missingValueIn, missingMatch, logYes, levIn, levOut, dataIn, dataOut)

    # undo the placeholder substitution made before the call
    if missingMatch == 'none':                                                # if no missing do not pass None
        missingMatch = None
    if missingValueIn == 1.333e33:
        missingValueIn = None

    if standardPosition == 0:
        dataOut = Numeric.transpose(dataOut, inverseOrder)                                   # transpose data to original order
        dataOut = Numeric.array(dataOut.astype(Numeric.Float32), Numeric.Float32)            # make contiguous

    if missingValueOut != None:                # set the missing value in data to missingValueOut

        # for 'greater' and 'less' the threshold is moved 1% towards the
        # valid range before comparing
        if missingMatch == 'greater':
            if missingValueIn > 0.0:
                missing = 0.99*missingValueIn
            else:
                missing = 1.01*missingValueIn

            dataOut = Numeric.where(Numeric.greater(dataOut,missing), missingValueOut, dataOut)

        elif missingMatch == 'equal':
            missing = missingValueIn
            dataOut = Numeric.where(Numeric.equal(dataOut,missing), missingValueOut, dataOut)

        elif missingMatch == 'less':
            if missingValueIn < 0.0:
                missing = 0.99*missingValueIn
            else:
                missing = 1.01*missingValueIn

            dataOut = Numeric.where(Numeric.less(dataOut,missing), missingValueOut, dataOut)

    return dataOut
def match(x, y, n_iterations=1, z=2, eps_rmsd=0.5, eps_stdv=0.05):
    """
    Superimpose array y onto array x while iteratively discarding
    outliers. In every round the transformation is fitted to the rows
    still considered; rows deviating by rmsd + z * stdv or more are
    excluded from the next round.
    Superimposed array y would be C{ N.dot(y, N.transpose(r)) + t }.

    @param n_iterations: number of calculations::
                           1 .. no iteration
                           0 .. until convergence
    @type  n_iterations: 1|0
    @param z: number of standard deviations for outlier definition
              (default: 2)
    @type  z: float
    @param eps_rmsd: tolerance in rmsd (default: 0.5)
    @type  eps_rmsd: float
    @param eps_stdv: tolerance in standard deviations (default: 0.05)
    @type  eps_stdv: float

    @return: (r,t), [ [percent_considered, rmsd_for_it, outliers] ]
    @rtype: (array, array), [float, float, int]
    """
    history = []
    last_rmsd = 0
    last_stdv = 0
    rounds = 0
    keep = N.ones(len(y), N.int32)

    while True:

        ## best transformation for the rows currently considered
        r, t = findTransformation(N.compress(keep, x, 0),
                                  N.compress(keep, y, 0))

        ## apply it to all rows of y
        fitted = N.dot(y, N.transpose(r)) + t

        ## per-row distance; rows not considered are zeroed
        d = N.sqrt(N.sum(N.power(x - fitted, 2), 1)) * keep

        ## rmsd and stdv over the considered rows only
        considered = N.compress(keep, d)
        rmsd = N.sqrt(N.average(considered**2))
        stdv = MU.SD(considered)

        ## convergence: neither rmsd nor stdv changed noticeably
        delta_rmsd = abs(rmsd - last_rmsd)
        delta_stdv = abs(1 - last_stdv / stdv)

        converged = delta_rmsd < eps_rmsd and delta_stdv < eps_stdv
        if not converged:
            last_rmsd = rmsd
            last_stdv = stdv

        ## fraction of rows that went into this round
        perc = round(float(N.sum(keep)) / float(len(keep)), 2)

        ## throw out non-matching rows
        keep = N.logical_and(keep, N.less(d, rmsd + z * stdv))
        outliers = N.nonzero(N.logical_not(keep))
        history.append([perc, round(rmsd, 3), outliers])

        rounds += 1

        if n_iterations and rounds >= n_iterations:
            break
        if converged:
            break

    return (r, t), history
def kNNimputeMA(arr2d, K=20, callback=None):
    """Returns a new 2D MA.array with missing values imputed from K nearest neighbours.
    Find K rows (axis 0) with the most similar values where similarity measure corresponds to weighted Euclidean distance.
    Imputed value = weighted average of the corresponding values of K nearest neighbours,
    where weights equal to tricubic distribution of distances to all rows.
    Impute missing rows by average over all rows.

    arr2d    -- 2D (masked) array; the input is not modified, a copy is returned
    K        -- maximal number of neighbours used for one imputed value
    callback -- optional function without arguments; as reconstructed here
                it is called once per processed row (NOTE(review): confirm
                the nesting against the upstream source)

    Version: 30.8.2005
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # make a copy for imputation
    aImp2 = MA.array(arr2d)
    # leave out columns with 0 known values (columnInd: non-zero columns)
    columnCond = Numeric.greater(MA.count(arr2d, axis=0), 0)
    columnIndAll = Numeric.arange(arr2d.shape[1])
    columnInd = Numeric.compress(columnCond, columnIndAll)
    # impute the rows where 0 < #known_values < #non_zero_columns, i.e. exclude the rows with 0 and all (non-zero-column) values
    countByRows = MA.count(arr2d, axis=1)
    for rowIdx in Numeric.compress(
            Numeric.logical_and(Numeric.greater(countByRows, 0),
                                Numeric.less(countByRows, columnInd.shape[0])),
            Numeric.arange(arr2d.shape[0])):
        # distance of the current row to every row, normalised by the
        # number of columns known in both rows
        rowResized = MA.resize(arr2d[rowIdx], arr2d.shape)
        diff = arr2d - rowResized
        distances = MA.sqrt(
            MA.add.reduce((diff)**2, 1) / MA.count(diff, axis=1))
        # nearest neighbours row indices (without the current row index)
        indSorted = MA.argsort(distances)[1:]
        distSorted = distances.take(indSorted)
        # number of distances different from MA.masked
        numNonMasked = distSorted.shape[0] - Numeric.add.reduce(
            Numeric.asarray(MA.getmaskarray(distSorted), Numeric.Int))
        # number of distances to account for (K or less)
        if numNonMasked > 1:
            weightsSorted = MA.power(
                1 - MA.power(distSorted / distSorted[numNonMasked - 1], 3),
                3)  # tricubic distribution of all weights
        else:
            weightsSorted = Numeric.ones(distSorted.shape[0])
        # compute average for each column separately in order to account for K non-masked values
        # (only columns that are missing in this row but known somewhere)
        colInd4CurrRow = Numeric.compress(
            Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), columnCond),
            columnIndAll)
        for colIdx in colInd4CurrRow:
            # column values sorted by distances
            columnVals = arr2d[:, colIdx].take(indSorted)
            # take only those weights where columnVals does not equal MA.masked
            weightsSortedCompressed = MA.compress(
                1 - MA.getmaskarray(columnVals), weightsSorted)
            # impute from K (or possibly less) values
            aImp2[rowIdx, colIdx] = MA.average(columnVals.compressed()[:K],
                                               weights=weightsSortedCompressed[:K])
        if callback:
            callback()
    # impute the unknown rows with average profile
    avrgRow = MA.average(arr2d, 0)
    for rowIdx in Numeric.compress(Numeric.equal(countByRows, 0),
                                   Numeric.arange(arr2d.shape[0])):
        aImp2[rowIdx] = avrgRow
        if callback:
            callback()
    return aImp2
def kNNimputeMA(arr2d, K=20, callback=None):
    """Impute missing values of a 2D masked array from its K nearest rows.

    Returns a new 2D MA.array; the result is written into an array created
    with MA.array(arr2d).

    Partially known rows: the remaining rows are sorted by distance (root of
    the mean squared difference over the defined positions) and each missing
    value is set to the weighted average of the first K known values of its
    column, using tricubic weights of the sorted distances.  Columns without
    any known value are left untouched.
    Completely unknown rows: set to the average over all rows.
    The optional callback is invoked (no arguments) once per processed row.
    Version: 30.8.2005
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    result = MA.array(arr2d)

    # per-column bookkeeping: which columns contain any known value
    everyCol = Numeric.arange(arr2d.shape[1])
    usableColMask = Numeric.greater(MA.count(arr2d, axis=0), 0)
    usableCols = Numeric.compress(usableColMask, everyCol)

    # per-row bookkeeping: number of known values in each row
    rowCounts = MA.count(arr2d, axis=1)
    hasSome = Numeric.greater(rowCounts, 0)
    lacksSome = Numeric.less(rowCounts, usableCols.shape[0])
    rowsToImpute = Numeric.compress(Numeric.logical_and(hasSome, lacksSome),
                                    Numeric.arange(arr2d.shape[0]))

    for r in rowsToImpute:
        # distance of every row to row r
        delta = arr2d - MA.resize(arr2d[r], arr2d.shape)
        dist = MA.sqrt(MA.add.reduce((delta) ** 2, 1)
                       / MA.count(delta, axis=1))

        # sorted neighbour indices, skipping the first sorted position
        order = MA.argsort(dist)[1:]
        sortedDist = dist.take(order)

        # count the distances that are not masked
        numMasked = Numeric.add.reduce(
            Numeric.asarray(MA.getmaskarray(sortedDist), Numeric.Int))
        numNonMasked = sortedDist.shape[0] - numMasked

        # tricubic weighting, or uniform weights when at most one
        # non-masked distance is available
        if numNonMasked > 1:
            farthest = sortedDist[numNonMasked - 1]
            sortedWeights = MA.power(
                1 - MA.power(sortedDist / farthest, 3), 3)
        else:
            sortedWeights = Numeric.ones(sortedDist.shape[0])

        # each missing column of row r is averaged separately so that up to
        # K known values are used per column
        colsToFill = Numeric.compress(
            Numeric.logical_and(MA.getmaskarray(arr2d[r]), usableColMask),
            everyCol)
        for c in colsToFill:
            valsByDist = arr2d[:, c].take(order)
            weightsOfKnown = MA.compress(1 - MA.getmaskarray(valsByDist),
                                         sortedWeights)
            knownVals = valsByDist.compressed()
            result[r, c] = MA.average(knownVals[:K],
                                      weights=weightsOfKnown[:K])

        if callback:
            callback()

    # rows with no known value at all get the average row
    meanRow = MA.average(arr2d, 0)
    unknownRows = Numeric.compress(Numeric.equal(rowCounts, 0),
                                   Numeric.arange(arr2d.shape[0]))
    for r in unknownRows:
        result[r] = meanRow
        if callback:
            callback()

    return result
def match(x, y, n_iterations=1, z=2, eps_rmsd=0.5, eps_stdv=0.05):
    """
    Fit array y onto array x while iteratively discarding outlier rows.
    The superimposed array y would be C{ N.dot(y, N.transpose(r)) + t }.

    @param x: array that y is fitted onto
    @type  x: array
    @param y: array to be fitted (compared row by row with x)
    @type  y: array
    @param n_iterations: maximal number of fit rounds::
                           1 .. no iteration
                           0 .. until convergence
    @type  n_iterations: 1|0
    @param z: number of standard deviations for outlier definition
              (default: 2)
    @type  z: float
    @param eps_rmsd: tolerance in rmsd (default: 0.5)
    @type  eps_rmsd: float
    @param eps_stdv: tolerance in standard deviations (default: 0.05)
    @type  eps_stdv: float

    @return: (r,t), [ [fraction_considered, rmsd_for_it, outliers] ]
    @rtype: (array, array), [float, float, int]
    """
    history = []
    prev_rmsd = 0
    prev_stdv = 0
    rounds = 0

    ## 1 for every row that still takes part in the fit
    keep = N.ones(len(y), N.int32)

    while True:
        ## best transformation using only the rows still kept
        r, t = findTransformation(N.compress(keep, x, 0),
                                  N.compress(keep, y, 0))

        ## apply it to all rows of y
        fitted = N.dot(y, N.transpose(r)) + t

        ## row-wise distances, zeroed for discarded rows
        dist = N.sqrt(N.sum(N.power(x - fitted, 2), 1)) * keep

        ## rmsd and standard deviation over the kept rows
        kept_dist = N.compress(keep, dist)
        rmsd = N.sqrt(N.average(kept_dist ** 2))
        stdv = MU.SD(kept_dist)

        ## change relative to the previous round
        delta_rmsd = abs(rmsd - prev_rmsd)
        delta_stdv = abs(1 - prev_stdv / stdv)

        done = delta_rmsd < eps_rmsd and delta_stdv < eps_stdv
        if not done:
            prev_rmsd, prev_stdv = rmsd, stdv

        ## fraction of rows used in this round (before removing outliers)
        fraction = round(float(N.sum(keep)) / float(len(keep)), 2)

        ## drop rows further away than rmsd + z standard deviations
        keep = N.logical_and(keep, N.less(dist, rmsd + z * stdv))
        discarded = N.nonzero(N.logical_not(keep))

        history.append([fraction, round(rmsd, 3), discarded])

        rounds += 1
        if done or (n_iterations and rounds >= n_iterations):
            break

    return (r, t), history
def estimate_reference_single(entry, stats, bounds, ref=0.0, verbose=False,
                              exclude=None, entry_name=None, atom_type='H',
                              exclude_outliers=False, molType='protein'):
    """
    Estimate a reference offset for a single entry from per-class shift
    statistics.

    The entry is split into classes with decompose_classes(); every class
    that has an entry in stats contributes the offset of its median shift
    from the class mean, weighted by the class precision (1/sd**2) and the
    number of shifts.

    entry            -- passed on to decompose_classes()
    stats            -- mapping class key -> sequence starting with (mean, sd)
    bounds           -- passed on to decompose_classes()
    ref              -- offset subtracted from the shifts in the score term
    verbose          -- if true, report classes without statistics
    exclude          -- optional container of (entry_name, key) pairs to skip
    entry_name       -- name of the entry; required when exclude is given
    atom_type        -- passed on to decompose_classes()
    exclude_outliers -- False, or a Z-score threshold; shifts whose Z score
                        is not below the threshold are ignored
    molType          -- passed on to decompose_classes()

    Returns (ref_mu, ref_sd, score). ref_mu and ref_sd are None when no
    class contributed. score is the accumulated score term divided by
    (1 + number of shifts used).

    Raises TypeError if exclude is given without entry_name.
    """
    weighted_offset = 0.
    total_precision = 0.
    score = 0.
    # starts at 1 (as in the original code), so the final division is
    # well defined even if no shift is used
    n_used = 1

    ## loop through all atom types
    classes = decompose_classes(entry, bounds, atom_type, molType=molType)

    if exclude and not entry_name:
        raise TypeError('attribute entry_name needs to be set.')

    n_excluded = 0
    n_total = 0

    for key, shifts in classes.items():

        if key not in stats:
            if verbose:
                print('%s no statistics.' % (key,))
            continue

        if exclude and (entry_name, key) in exclude:
            print('%s %s excluded from ref estimation.' % (entry_name, key))
            continue

        ## get statistics for current atom type
        mu, sd = stats[key][:2]
        k = 1. / sd ** 2

        if exclude_outliers is not False:
            ## calculate Z scores and exclude shifts with high Z scores
            ## from analysis
            Z = abs(shifts - mu) / sd
            mask_include = Numeric.less(Z, exclude_outliers)
            shifts = Numeric.compress(mask_include, shifts)

            n_excluded += len(Z) - Numeric.sum(mask_include)
            n_total += len(Z)

        n = len(shifts)
        if not n:
            continue

        weighted_offset += k * n * (median(shifts) - mu)
        total_precision += k * n
        score += (-0.5 * n * Numeric.log(k)
                  + 0.5 * k * sum((Numeric.array(shifts) - mu - ref) ** 2))
        n_used += n

    if total_precision > 0.:
        ref_mu = weighted_offset / total_precision
        ref_sd = 1. / Numeric.sqrt(total_precision)
    else:
        ref_mu = None
        ref_sd = None

    ## reported only when every inspected shift was discarded
    if exclude_outliers is not False and n_excluded == n_total:
        print('%d/%d outliers discarded' % (n_excluded, n_total))

    return ref_mu, ref_sd, score / n_used
def __call__(self, data, data_min=None, data_max=None, val_min=None,
             val_max=None, map_type='linear', powerOf2=0):
    """ (data, data_min=None, data_max=None, val_min=None, val_max=None,
    map_type = 'linear', powerOf2=0)
    Maps an array of floats to integer values.
    data -- 3D numeric array;
    data_min, data_max -- min and max data values (other than actual array
          minimum/maximum) to map to integer values - val_min and val_max -
          in range (0 ... int_limit); values outside the actual array range
          are clamped to the array minimum/maximum;
    map_type -- can be 'linear' or 'log'; any other value raises ValueError;
    powerOf2 -- if set to 1, then if the data array dimensions are not
                power of 2 - the returned array will be padded with zeros
                so that its dims are power of 2.

    The mapped array is transposed, cast to self.int_type, stored in
    self.arr and returned.  self.data_min, self.data_max, self.val_min and
    self.val_max record the limits that were used.
    """
    # if data_min/max and val_min/max specified then the mapping
    # will proceed as follows:
    # [arr_min, data_min] is mapped to [int_min, val_min],
    # [data_min, data_max] is mapped to [val_min, val_max],
    # [data_max, arr_max] is mapped to [val_max, int_limit].
    int_limit = self.int_limit
    int_min = self.int_min
    shape = data.shape
    assert len(shape) == 3
    nx, ny, nz = shape

    # reduce once per axis to obtain the global extremes of the 3D array
    maxif = Numeric.maximum.reduce
    arr_max = maxif(maxif(maxif(data)))
    minif = Numeric.minimum.reduce
    arr_min = minif(minif(minif(data)))

    if val_min is not None:
        assert val_min >= 0 and val_min < int_limit
    else:
        val_min = int_min
    if val_max is not None:
        assert val_max <= int_limit and val_max > 0
    else:
        val_max = int_limit

    # missing or out-of-range data limits fall back to the array extremes
    if data_min is None or data_min < arr_min:
        data_min = arr_min
    if data_max is None or data_max > arr_max:
        data_max = arr_max

    print("mapping data_min %4f to val_min %d, data_max %4f to val_max %d"
          % (data_min, val_min, data_max, val_max))

    if map_type == 'linear':
        k2, c2 = self.ScaleMap((val_min, val_max), (data_min, data_max))

        # The same tolerance test decides both which scale factors are
        # computed and which formula is applied below.  (Comparing with
        # == in the second step left the result undefined whenever a
        # limit was within the tolerance of, but not equal to, the array
        # extreme.)
        min_at_edge = abs(data_min - arr_min) < 0.00001
        max_at_edge = abs(data_max - arr_max) < 0.00001

        if min_at_edge:
            k1, c1 = k2, c2
        else:
            k1, c1 = self.ScaleMap((int_min, val_min), (arr_min, data_min))
        if max_at_edge:
            k3, c3 = k2, c2
        else:
            k3, c3 = self.ScaleMap((val_max, int_limit),
                                   (data_max, arr_max))

        t1 = time()
        # middle interval everywhere, then overwrite the outer intervals
        # that actually exist
        new_arr = k2 * data + c2
        if not min_at_edge:
            new_arr = Numeric.where(Numeric.less(data, data_min),
                                    k1 * data + c1, new_arr)
        if not max_at_edge:
            new_arr = Numeric.where(Numeric.greater_equal(data, data_max),
                                    k3 * data + c3, new_arr)
        arr = Numeric.transpose(new_arr).astype(self.int_type)
        del new_arr
        t2 = time()
        print("time to map :  %s" % (t2 - t1,))

    elif map_type == 'log':
        # shift the data so that the smallest value is at least 1 before
        # taking the logarithm
        if arr_min < 0:
            diff = abs(arr_min) + 1.0
        elif arr_min < 1.0:
            diff = 1.0
        else:
            diff = 0
        # the scale is derived with the same log10 that is applied to the
        # data, so the array extremes map onto int_min and int_limit
        k1, c1 = self.ScaleMap((int_min, int_limit),
                               (Numeric.log10(arr_min + diff),
                                Numeric.log10(arr_max + diff)))
        arr = Numeric.transpose(
            k1 * Numeric.log10(data + diff) + c1).astype(self.int_type)

    else:
        raise ValueError("map_type must be 'linear' or 'log', got %r"
                         % (map_type,))

    self.data_min = data_min
    self.data_max = data_max
    self.val_min = val_min
    self.val_max = val_max

    if powerOf2:
        nx1, ny1, nz1 = nx, ny, nz
        res, power = isPowerOf2(nx)
        if not res:
            nx1 = 2 ** power
        res, power = isPowerOf2(ny)
        if not res:
            ny1 = 2 ** power
        res, power = isPowerOf2(nz)
        if not res:
            nz1 = 2 ** power
        if nx1 != nx or ny1 != ny or nz1 != nz:
            # arr is already transposed, hence the reversed axis order
            narr = Numeric.zeros((nz1, ny1, nx1), self.int_type)
            narr[:nz, :ny, :nx] = arr[:, :, :]
            self.arr = narr
            return narr

    self.arr = arr
    return arr