def castHmmDic(self, hmmDic, repete, hmmGap, key):
    """
    Expand one score entry of hmmDic to the number of repetes of the
    profile and remove the gap positions of every repete.

    For repete i the positions given in hmmGap[i] are dropped from
    hmmDic[key] (along axis 0). The resulting pieces are joined with each
    later repete placed in front of the earlier ones. hmmDic is changed
    in place and returned.

    @param hmmDic: dictionary from L{getHmmProfile}
    @type  hmmDic: dict
    @param repete: repete information from L{align}
    @type  repete: int
    @param hmmGap: information about gaps from L{align}, indexed by repete
    @type  hmmGap: [int]
    @param key: name of scoring method to adjust for gaps and repetes
    @type  key: str

    @return: dictionary with information about the profile
    @rtype: dict
    """
    profile = hmmDic[key]

    for rep in range(repete):
        ## 1 = keep position, 0 = position is a gap in this repete
        keep = N.ones(len(profile))
        N.put(keep, hmmGap[rep], 0)
        piece = N.compress(keep, profile, 0)

        if rep == 0:
            score = piece
        else:
            ## later repetes go in front of what was collected so far
            score = N.concatenate((piece, score))

    hmmDic[key] = score
    return hmmDic
def contactResDistribution(self, cm=None):
    """
    Count occurrence of residues in protein-protein interface.

    @param cm: pre-calculated contact matrix (default: None, the matrix
               is then taken from self.resContacts())
    @type  cm: matrix

    @return: dict {'A':3, 'C':1, .. } with one entry per letter returned
             by molUtils.allAA()
    @rtype: dict
    """
    ## identity test: '== None' on a matrix is evaluated element-wise
    if cm is None:
        cm = self.resContacts()

    ## get mask for residues involved in contacts; the axis is spelled out
    ## so that the column / row sums do not depend on the default axis of
    ## the array package bound to N
    maskLig = N.sum(cm, 0)
    maskRec = N.sum(N.transpose(cm), 0)

    ## get sequence of contact residues only
    seqLig = N.compress(maskLig, self.lig().sequence())
    seqRec = N.compress(maskRec, self.rec().sequence())

    ## convert back to string
    seq = ''.join(seqLig) + ''.join(seqRec)

    ## count occurrence of letters
    result = {}
    for aa in molUtils.allAA():
        result[aa] = seq.count(aa)

    return result
def __atomContacts(self, cutoff, rec_mask, lig_mask, cache):
    """
    Intermolecular distances below cutoff after applying the two masks.

    A distance matrix stored in self.pw_dist is re-used when its shape
    equals (sum of rec_mask, sum of lig_mask); otherwise the distances are
    calculated again from the masked coordinates.

    @param cutoff: cutoff for B{atom-atom} contact in Angstrom
    @type  cutoff: float
    @param rec_mask: atom mask
    @type  rec_mask: [1|0]
    @param lig_mask: atom mask
    @type  lig_mask: [1|0]
    @param cache: cache pairwise atom distance matrix
    @type  cache: 1|0

    @return: atom contact matrix, array sum_rec_mask x sum_lig_mask
    @rtype: array
    """
    ## atom coordinates of both binding partners
    rec_xyz = self.rec().getXyz()
    lig_xyz = self.lig().getXyz()

    ## pair-wise distances -> atoms_rec x atoms_lig
    dist = getattr(self, 'pw_dist', None)

    ## NOTE(review): only the shape is compared; two different masks that
    ## select the same number of atoms would re-use a stale matrix - confirm
    reusable = dist is not None and \
               N.shape(dist) == (N.sum(rec_mask), N.sum(lig_mask))

    if not reusable:
        dist = self.__pairwiseDistances(N.compress(rec_mask, rec_xyz, 0),
                                        N.compress(lig_mask, lig_xyz, 0))

    if cache:
        self.pw_dist = dist

    ## reduce to 1 (distance < cutoff) or 0 -> n_atoms_rec x n_atoms_lig
    return N.less(dist, cutoff)
def castHmmDic(self, hmmDic, repete, hmmGap, key):
    """
    Expand one score entry of hmmDic to the number of repetes of the
    profile and remove the gap positions of every repete.

    For repete i the positions given in hmmGap[i] are dropped from
    hmmDic[key] (along axis 0). The resulting pieces are joined with each
    later repete placed in front of the earlier ones. hmmDic is changed
    in place and returned.

    @param hmmDic: dictionary from L{getHmmProfile}
    @type  hmmDic: dict
    @param repete: repete information from L{align}
    @type  repete: int
    @param hmmGap: information about gaps from L{align}, indexed by repete
    @type  hmmGap: [int]
    @param key: name of scoring method to adjust for gaps and repetes
    @type  key: str

    @return: dictionary with information about the profile
    @rtype: dict
    """
    profile = hmmDic[key]

    for rep in range(repete):
        ## 1 = keep position, 0 = position is a gap in this repete
        keep = N.ones(len(profile))
        N.put(keep, hmmGap[rep], 0)
        piece = N.compress(keep, profile, 0)

        if rep == 0:
            score = piece
        else:
            ## later repetes go in front of what was collected so far
            score = N.concatenate((piece, score))

    hmmDic[key] = score
    return hmmDic
def triangularGet(m2d, upper=1):
    """Returns 1D masked array with elements from the strictly upper
    (upper=1) or strictly lower (upper=0) triangular part of the given
    matrix; the elements are taken in the order of the flattened matrix.
    For a symetric matrix triangularGet(m2d, 0) and triangularGet(m2d, 1)
    return elements in different order.
    """
    assert upper in [0,1], "upper: [0|1] expected"
    m2d = MA.asarray(m2d)
    assert MA.rank(m2d) == 2, "2D (masked) array expected"

    # condition that selects the requested triangle (diagonal excluded)
    if upper:
        inTriangle = lambda i,j: i<j
    else:
        inTriangle = lambda i,j: i>j

    # flat positions of the selected elements
    selected = Numeric.ravel(Numeric.fromfunction(inTriangle, m2d.shape))
    flatInd = Numeric.arange(0, Numeric.multiply.reduce(m2d.shape), typecode=Numeric.Int)
    takeInd = Numeric.compress(selected, flatInd)
    return MA.ravel(m2d).take(takeInd)
def kNNimputeMA(arr2d, K=20, callback=None):
    """Returns a new 2D MA.array with missing values imputed from K nearest neighbours.
    Find K rows (axis 0) with the most similar values where similarity measure corresponds to weighted Euclidean distance.
    Imputed value = weighted average of the corresponding values of K nearest neighbours,
    where weights equal to tricubic distribution of distances to all rows.
    Impute missing rows by average over all rows.
    callback (if given) is called without arguments after each processed row.
    Version: 30.8.2005
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # the copy that receives the imputed values
    imputed = MA.array(arr2d)
    # columns with at least one known value
    columnCond = Numeric.greater(MA.count(arr2d, axis=0), 0)
    columnIndAll = Numeric.arange(arr2d.shape[1])
    columnInd = Numeric.compress(columnCond, columnIndAll)
    # rows with 0 < #known_values < #non_empty_columns; completely empty and
    # completely known rows are not handled here
    countByRows = MA.count(arr2d, axis=1)
    partlyKnown = Numeric.logical_and(Numeric.greater(countByRows, 0), Numeric.less(countByRows, columnInd.shape[0]))
    for rowIdx in Numeric.compress(partlyKnown, Numeric.arange(arr2d.shape[0])):
        # distance of every row to the current row
        diff = arr2d - MA.resize(arr2d[rowIdx], arr2d.shape)
        distances = MA.sqrt(MA.add.reduce((diff)**2, 1) / MA.count(diff, axis=1))
        # neighbour rows ordered by distance; the first sorted index is
        # dropped (presumably the current row itself - verify)
        indSorted = MA.argsort(distances)[1:]
        distSorted = distances.take(indSorted)
        # number of distances different from MA.masked
        numMasked = Numeric.add.reduce(Numeric.asarray(MA.getmaskarray(distSorted), Numeric.Int))
        numNonMasked = distSorted.shape[0] - numMasked
        if numNonMasked > 1:
            # tricubic distribution of all weights, scaled by the largest
            # non-masked distance
            scaled = distSorted/distSorted[numNonMasked-1]
            weightsSorted = MA.power(1-MA.power(scaled,3),3)
        else:
            weightsSorted = Numeric.ones(distSorted.shape[0])
        # columns that are masked in the current row but known elsewhere;
        # each one is averaged separately to account for K non-masked values
        toImpute = Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), columnCond)
        for colIdx in Numeric.compress(toImpute, columnIndAll):
            # column values sorted by distances
            columnVals = arr2d[:,colIdx].take(indSorted)
            # take only those weights where columnVals does not equal MA.masked
            weightsKnown = MA.compress(1-MA.getmaskarray(columnVals), weightsSorted)
            # impute from K (or possibly less) values
            imputed[rowIdx,colIdx] = MA.average(columnVals.compressed()[:K], weights=weightsKnown[:K])
        if callback:
            callback()
    # rows without any known value get the average profile
    avrgRow = MA.average(arr2d, 0)
    for rowIdx in Numeric.compress(Numeric.equal(countByRows, 0), Numeric.arange(arr2d.shape[0])):
        imputed[rowIdx] = avrgRow
        if callback:
            callback()
    return imputed
def residusMaximus(self, atomValues, mask=None):
    """
    Take list of value per atom, return list where all atoms of any
    residue are set to the highest value of any atom in that residue.
    (after applying mask)

    Values of atoms that are not selected by mask are set to 0 before the
    maximum is taken.

    @param atomValues: list 1 x N, values per atom
    @type  atomValues: [ float ]
    @param mask: list 1 x N, 0|1, 'master' atoms of each residue
                 (default: None, all atoms)
    @type  mask: [1|0]

    @return: Numpy array 1 x N of float
    @rtype: array
    """
    if mask is None:
        mask = N.ones(len(self.frames[0]), N.int32)

    ## eliminate all values that do not belong to the selected atoms;
    ## both operands are converted so that plain lists can be multiplied
    masked = N.asarray(atomValues) * N.asarray(mask)

    ## the residue map does not change inside the loop, fetch it once
    resMap = self.resMap()

    result = []

    ## set all atoms of each residue to uniform value
    for res in range(0, resMap[-1] + 1):

        ## get atom entries for this residue
        resAtoms = N.compress(N.equal(resMap, res), masked)

        ## get maximum value
        masterValue = max(resAtoms)

        ## one entry per atom of the residue, all set to the maximum
        result.extend(resAtoms * 0.0 + masterValue)

    return N.array(result)
def takeFrames(self, indices):
    """
    Return a copy of the trajectory containing only the specified frames.

    Indices that are not smaller than the number of frames are dropped
    silently.

    @param indices: positions to take
    @type  indices: [int]

    @return: copy of this Trajectory (fewer frames, semi-deep copy of ref)
    @rtype: Trajectory
    """
    ## remove out-of-bound indices (only the upper bound is checked)
    indices = N.compress(N.less(indices, len(self.frames)), indices)

    r = self.__class__()

    ## this step takes some time for large frames !
    r.frames = N.take(self.frames, indices, 0)

    ## semi-deep copy of reference model
    r.setRef(self.ref.take(range(self.ref.lenAtoms())))

    ## identity test; '!= None' would compare an array element-wise
    if self.frameNames is not None:
        names = N.take(self.frameNames, indices, 0)
        ## a real list, independent of whether map() returns a list
        r.frameNames = [''.join(name) for name in names.tolist()]

    r.pc = self.__takePca(indices)

    r.profiles = self.profiles.take(indices)

    r.resIndex = self.resIndex

    return r
def pairwiseRmsd(self, aMask=None, noFit=0):
    """
    Calculate rmsd between each 2 coordinate frames.

    @param aMask: atom mask; only atoms with a non-zero entry are compared
                  (default: None, all atoms)
    @type  aMask: [1|0]
    @param noFit: 1, compare the coordinates as they are; 0, use the rmsd
                  reported by rmsFit.match for each pair (default: 0)
    @type  noFit: 1|0

    @return: frames x frames array of float
    @rtype: array
    """
    frames = self.frames

    ## identity test; '!= None' on an array mask is evaluated element-wise
    if aMask is not None:
        frames = N.compress(aMask, frames, 1)

    nFrames = len(frames)
    result = N.zeros((nFrames, nFrames), N.Float32)

    for i in range(0, nFrames):

        for j in range(i + 1, nFrames):

            if noFit:
                d = N.sqrt(N.sum(N.power(frames[i] - frames[j], 2), 1))
                rmsd = N.sqrt(N.average(d**2))
            else:
                _rt, rmsdLst = rmsFit.match(frames[i], frames[j], 1)
                rmsd = rmsdLst[0][1]

            ## the matrix is symmetric, the diagonal stays 0
            result[i, j] = result[j, i] = rmsd

    return result
def addDensity(self, radius=6, minasa=None, profName='density'):
    """
    Count the number of heavy atoms within the given radius.
    Values are only collected for atoms with |minasa| accessible surface
    area. The counts are stored as atom profile |profName| of self.m;
    atoms outside the surface mask get the default value -1.

    @param minasa: relative exposed surface - 0 to 100%
    @type  minasa: float
    @param radius: in Angstrom
    @type  radius: float
    @param profName: name of the atom profile that receives the counts
                     (default: 'density')
    @type  profName: str
    """
    ## coordinates of the heavy atoms only
    xyz = N.compress(self.m.maskHeavy(), self.m.getXyz(), 0)

    if minasa:
        ## make sure the 'relAS' profile exists before it is turned into a mask
        if self.m.profile('relAS', 0) == 0:
            self.addASA()
        mSurf = self.m.profile2mask('relAS', minasa)
    else:
        mSurf = N.ones(self.m.lenAtoms())

    ## squared distances are compared against the squared radius
    cutoff2 = radius**2

    ## loop over all surface atoms
    ## NOTE(review): the -1 presumably discounts the atom itself; that only
    ## holds if the surface atom is a heavy atom - confirm
    contacts = [N.sum(N.less(N.sum((xyz - self.m.xyz[i])**2, 1), cutoff2)) - 1
                for i in N.nonzero(mSurf)]

    self.m.atoms.set(profName, contacts, mSurf, default=-1,
                     comment='atom density radius %3.1fA' % radius,
                     version=T.dateString() + ' ' + self.version())
def centerSurfDist(model, surf_mask, mask=None):
    """
    Calculate the longest and shortest distance from the center of the
    molecule to the surface.

    Only atoms selected by both mask and surf_mask enter the calculation.

    @param mask: atoms to be considered (default: None, then
                 model.maskHeavy() is used)
    @type  mask: [1|0]
    @param surf_mask: atom surface mask, needed for minimum surface distance
    @type  surf_mask: [1|0]

    @return: max distance, min distance
    @rtype: float, float
    """
    if mask is None:
        mask = model.maskHeavy()

    ## center of mass of the whole model
    center = model.centerOfMass()

    ## coordinates of the atoms that pass both masks
    selected = mask * surf_mask
    surf_xyz = N.compress(selected, model.getXyz(), 0)

    ## distance of each selected atom from the center
    offset = surf_xyz - center
    dist = N.sqrt(N.sum(offset**2, 1))

    ## furthest first, closest second
    return max(dist), min(dist)
def logConfidence(x, R, clip=0):
    """
    Estimate the probability of x NOT beeing a random observation from a
    lognormal distribution that is described by a set of random values.

    Zero values in R are removed unless they were clipped before.

    @param x: observed value
    @type  x: float
    @param R: sample of random values
    @type  R: [float]
    @param clip: clip zeros at this value 0->don't clip (default: 0)
    @type  clip: float

    @return: confidence that x is not random, median of random distr.;
             (0, 0) if x is 0 and was not clipped
    @rtype: (float, float)
    """
    if clip:
        if 0 in R:
            R = N.clip(R, clip, max(R))
        if x == 0:
            x = clip

    ## remove 0 instead of clipping
    R = N.compress(R, R)

    if x == 0:
        return 0, 0

    ## mean and stdv (n-1 normalisation) of the log-transformed sample
    logR = N.log(R)
    alpha = N.average(logR)

    n = len(R)
    beta = N.sqrt(N.sum(N.power(logR - alpha, 2)) / (n - 1.))

    return logArea(x, alpha, beta), logMedian(alpha)
def stable_sd(x, n_sd=3., min_length=20):
    """Returns a standard deviation of x that is computed after outliers
    have been left out.

    Samples shorter than min_length are not filtered: 0. is returned for a
    single value, standardDeviation(x) otherwise. For longer samples, values
    further than n_sd * sd from the median of the currently kept values are
    marked as outliers and excluded; this is repeated (at most 10 rounds)
    until no outlier is found or the outlier set no longer changes.
    """
    if len(x) < min_length:
        if len(x) == 1:
            return 0.
        return standardDeviation(x)

    x = Numeric.array(x)
    kept = x
    previous = 0.

    for _round in range(10):
        mu = median(kept)
        sd = standardDeviation(kept, mu)
        # outliers are always determined on the complete sample
        outliers = Numeric.greater(abs(x-mu), n_sd*sd)
        if not Numeric.sum(outliers):
            break
        # same outliers as in the previous round -> converged
        if Numeric.sum(outliers==previous) == len(x):
            break
        kept = Numeric.compress(Numeric.logical_not(outliers), x)
        previous = outliers

    return sd
def centerSurfDist(model, surf_mask, mask=None):
    """
    Calculate the longest and shortest distance from the center of the
    molecule to the surface.

    Only atoms selected by both mask and surf_mask enter the calculation.

    @param mask: atoms to be considered (default: None, then
                 model.maskHeavy() is used)
    @type  mask: [1|0]
    @param surf_mask: atom surface mask, needed for minimum surface distance
    @type  surf_mask: [1|0]

    @return: max distance, min distance
    @rtype: float, float
    """
    if mask is None:
        mask = model.maskHeavy()

    ## center of mass of the whole model
    center = model.centerOfMass()

    ## coordinates of the atoms that pass both masks
    selected = mask * surf_mask
    surf_xyz = N.compress(selected, model.getXyz(), 0)

    ## distance of each selected atom from the center
    offset = surf_xyz - center
    dist = N.sqrt(N.sum(offset**2, 1))

    ## furthest first, closest second
    return max(dist), min(dist)
def takeFrames(self, indices):
    """
    Return a copy of the trajectory containing only the specified frames.

    Indices that are not smaller than the number of frames are dropped
    silently.

    @param indices: positions to take
    @type  indices: [int]

    @return: copy of this Trajectory (fewer frames, semi-deep copy of ref)
    @rtype: Trajectory
    """
    ## remove out-of-bound indices (only the upper bound is checked)
    indices = N.compress(N.less(indices, len(self.frames)), indices)

    r = self.__class__()

    ## this step takes some time for large frames !
    r.frames = N.take(self.frames, indices, 0)

    ## semi-deep copy of reference model
    r.setRef(self.ref.take(range(self.ref.lenAtoms())))

    ## identity test; '!= None' would compare an array element-wise
    if self.frameNames is not None:
        names = N.take(self.frameNames, indices, 0)
        ## a real list, independent of whether map() returns a list
        r.frameNames = [''.join(name) for name in names.tolist()]

    r.pc = self.__takePca(indices)

    r.profiles = self.profiles.take(indices)

    r.resIndex = self.resIndex

    return r
def loessMA(m, windowSize, axis=0, approxMasked=True, verbose=False, callback=None):
    """Returns a new array with values at the given axis smoothed by loess;
    if approxMasked==True: the masked values are approximated by loess;
    assumes equidistant spacing of points on the given axis.
    Profiles with fewer than 2 known values are left untouched; a profile
    for which loess fails is left untouched as well (a warning is printed
    if verbose). callback (if given) is called after each processed profile.
    """
    # convert first so that plain sequences are accepted as well
    m = MA.asarray(m)
    assert 0 < windowSize <= m.shape[axis]+0.1, "0 < windowSize[%s] <= 1 OR windowSize in range(1.1,m.shape[axis]+1) expected, got %f" % ("%", windowSize)
    if m.dtype.char != Numeric.Float:
        m = m.astype(Numeric.Float)
    nDim = len(m.shape)
    shp_other = list(m.shape)
    shp_other.pop(axis)
    # permutation that moves the given axis to the front, and its inverse;
    # the two only coincide for axis 0 and axis 1
    toFront = [axis] + list(range(0, axis)) + list(range(axis+1, nDim))
    toOrig = list(range(1, axis+1)) + [0] + list(range(axis+1, nDim))
    flatShape = (m.shape[axis], Numeric.multiply.reduce(shp_other))
    # get a transposed and reshaped mask and data from m; getmaskarray returns an array of zeros if m has no mask
    mask = Numeric.reshape(Numeric.transpose(MA.getmaskarray(m), toFront), flatShape)
    data = MA.reshape(MA.transpose(m, toFront), flatShape)
    # 1 where the value is known, 0 where it is masked
    maskInv = -1*(mask-1)
    xall = Numeric.arange(data.shape[0])
    xallList = xall.tolist()
    # run loess if the profile contains at least 2 values
    for ii in Numeric.compress(Numeric.add.reduce(maskInv,0) > 1, Numeric.arange(data.shape[1])):
        try:
            points = list(zip(MA.compress(maskInv[:,ii], xall).tolist(), MA.compress(maskInv[:,ii], data[:,ii]).tolist()))
            data[:,ii] = MA.array(statc.loess(points, xallList, windowSize))[:,1]
        except Exception:
            # best effort: keep the unsmoothed profile, but do not swallow
            # KeyboardInterrupt / SystemExit
            if verbose:
                print("Warning: loessMA: could not loess axis %i index %i" % (axis, ii))
        if callback:
            callback()
    if not approxMasked:
        data = MA.array(data, mask=mask)
    # undo the reshape, then move the smoothed axis back to its original place
    return MA.transpose(MA.reshape(data, [m.shape[axis]] + shp_other), toOrig)
def logConfidence(x, R, clip=0):
    """
    Estimate the probability of x NOT beeing a random observation from a
    lognormal distribution that is described by a set of random values.

    Zero values in R are removed unless they were clipped before.

    @param x: observed value
    @type  x: float
    @param R: sample of random values
    @type  R: [float]
    @param clip: clip zeros at this value 0->don't clip (default: 0)
    @type  clip: float

    @return: confidence that x is not random, median of random distr.;
             (0, 0) if x is 0 and was not clipped
    @rtype: (float, float)
    """
    if clip:
        if 0 in R:
            R = N.clip(R, clip, max(R))
        if x == 0:
            x = clip

    ## remove 0 instead of clipping
    R = N.compress(R, R)

    if x == 0:
        return 0, 0

    ## mean and stdv (n-1 normalisation) of the log-transformed sample
    logR = N.log(R)
    alpha = N.average(logR)

    n = len(R)
    beta = N.sqrt(N.sum(N.power(logR - alpha, 2)) / (n - 1.))

    return logArea(x, alpha, beta), logMedian(alpha)
def residusMaximus(self, atomValues, mask=None):
    """
    Take list of value per atom, return list where all atoms of any
    residue are set to the highest value of any atom in that residue.
    (after applying mask)

    Values of atoms that are not selected by mask are set to 0 before the
    maximum is taken.

    @param atomValues: list 1 x N, values per atom
    @type  atomValues: [ float ]
    @param mask: list 1 x N, 0|1, 'master' atoms of each residue
                 (default: None, all atoms)
    @type  mask: [1|0]

    @return: Numpy array 1 x N of float
    @rtype: array
    """
    if mask is None:
        mask = N.ones(len(self.frames[0]), N.int32)

    ## eliminate all values that do not belong to the selected atoms;
    ## both operands are converted so that plain lists can be multiplied
    masked = N.asarray(atomValues) * N.asarray(mask)

    ## the residue map does not change inside the loop, fetch it once
    resMap = self.resMap()

    result = []

    ## set all atoms of each residue to uniform value
    for res in range(0, resMap[-1] + 1):

        ## get atom entries for this residue
        resAtoms = N.compress(N.equal(resMap, res), masked)

        ## get maximum value
        masterValue = max(resAtoms)

        ## one entry per atom of the residue, all set to the maximum
        result.extend(resAtoms * 0.0 + masterValue)

    return N.array(result)
def triangularPut(m1d, upper=1, lower=0):
    """Returns 2D masked array with elements of the given 1D array in the strictly upper (lower) triangle.
    Elements of the 1D array should be ordered according to the upper triangular part of the 2D matrix.
    The lower triangular part (if requested) equals to the transposed upper triangular part.
    If upper == lower == 1 a symetric matrix is returned.
    The diagonal and every triangle that was not requested are masked.
    """
    assert upper in [0,1] and lower in [0,1], "[0|1] expected for upper / lower"
    m1d = MA.asarray(m1d)
    assert MA.rank(m1d) == 1, "1D masked array expected"
    # side length n of the square matrix with n(n-1)/2 == len(m1d);
    # math.ceil may return a float, array dimensions have to be integers
    m2dShape0 = int(math.ceil(math.sqrt(2*m1d.shape[0])))
    assert m1d.shape[0] == m2dShape0*(m2dShape0-1)//2, "the length of m1d does not correspond to n(n-1)/2"
    shape2d = (m2dShape0, m2dShape0)
    # mask: 1 where the result has to be masked
    if upper:
        if lower:
            mask = Numeric.fromfunction(lambda i,j: i==j, shape2d)
        else:
            mask = Numeric.fromfunction(lambda i,j: i>=j, shape2d)
    else:
        if lower:
            mask = Numeric.fromfunction(lambda i,j: i<=j, shape2d)
        else:
            mask = Numeric.ones(shape2d)
    # write m1d into the strictly upper triangle of a flat zero matrix
    m2d = MA.ravel(MA.zeros(shape2d, m1d.dtype.char))
    condUpperTriang = Numeric.fromfunction(lambda i,j: i<j, shape2d)
    putIndices = Numeric.compress(Numeric.ravel(condUpperTriang), Numeric.arange(0, m2dShape0**2, typecode=Numeric.Int))
    MA.put(m2d, putIndices, m1d)
    m2d = MA.reshape(m2d, shape2d)
    # mirror the upper triangle into the lower one
    m2d = MA.where(condUpperTriang, m2d, MA.transpose(m2d))
    return MA.array(m2d, mask=Numeric.logical_or(mask, MA.getmaskarray(m2d)))
def pairwiseRmsd(self, aMask=None, noFit=0):
    """
    Calculate rmsd between each 2 coordinate frames.

    @param aMask: atom mask; only atoms with a non-zero entry are compared
                  (default: None, all atoms)
    @type  aMask: [1|0]
    @param noFit: 1, compare the coordinates as they are; 0, use the rmsd
                  reported by rmsFit.match for each pair (default: 0)
    @type  noFit: 1|0

    @return: frames x frames array of float
    @rtype: array
    """
    frames = self.frames

    ## identity test; '!= None' on an array mask is evaluated element-wise
    if aMask is not None:
        frames = N.compress(aMask, frames, 1)

    nFrames = len(frames)
    result = N.zeros((nFrames, nFrames), N.Float32)

    for i in range(0, nFrames):

        for j in range(i + 1, nFrames):

            if noFit:
                d = N.sqrt(N.sum(N.power(frames[i] - frames[j], 2), 1))
                rmsd = N.sqrt(N.average(d**2))
            else:
                _rt, rmsdLst = rmsFit.match(frames[i], frames[j], 1)
                rmsd = rmsdLst[0][1]

            ## the matrix is symmetric, the diagonal stays 0
            result[i, j] = result[j, i] = rmsd

    return result
def chipdata(self, data):
    """Input data: [(dirname0, [et0, et1, ...]), ...]

    Stores the input, converts every example table to a masked array
    (kept in self._chipdataMA with the same nesting as the input), updates
    the two info labels with the number of files / profiles / points and
    with statistics about missing values, refreshes the GUI and sends the
    data if commitOnChange is set. With data == None the stored data and
    the labels are reset.
    """
    self.numRowsMissingChipData = 0
    self._chipdataMA = []
    if data is not None:
        self._chipdata = data
        numValsAll = 0
        numValsNonMasked = 0
        numFiles = 0
        numExamplesList = []
        attribDict = {}
        numColMissing = 0
        for (name, etList) in data:
            numFiles += len(etList)
            self._chipdataMA.append((name, []))
            for et in etList:
                # collect the attributes of all tables by name
                for attr in et.domain.attributes:
                    attribDict[attr.name] = attr
                numExamplesList.append(len(et))
                etm = et.toNumpyMA("a")[0]
                # indices of columns that are not completely missing
                colNonMissingInd = Numeric.compress(Numeric.not_equal(MA.count(etm, 0), 0), Numeric.arange(etm.shape[1]))
                numColMissing += etm.shape[1] - colNonMissingInd.shape[0]
                # NOTE(review): the known values of the non-missing columns are
                # compared against the total number of columns, so every row
                # is counted once a column is completely missing - confirm
                # whether colNonMissingInd.shape[0] was intended
                self.numRowsMissingChipData += int(Numeric.add.reduce(Numeric.less(MA.count(etm.take(colNonMissingInd, 1), 1), etm.shape[1])))
                numValsAll += int(Numeric.multiply.reduce(etm.shape))
                numValsNonMasked += int(MA.count(etm))
                self._chipdataMA[-1][1].append(etm)
        # info text; the profile count is taken from the first table only
        if numExamplesList:
            numProfiles = numExamplesList[0]
        else:
            numProfiles = 0
        self.infoc.setText("Structured Data: %i data files with %i profiles on %i points" % (numFiles, numProfiles, len(attribDict)))
        numTotalMissing = numValsAll - numValsNonMasked
        if numTotalMissing > 0:
            self.infod.setText("missing %i values, %i column%s completely, %i row%s partially" % (numTotalMissing, numColMissing, ["", "s"][numColMissing != 1], self.numRowsMissingChipData, ["", "s"][self.numRowsMissingChipData != 1]))
        else:
            self.infod.setText("")
    else:
        self._chipdata = None
        self.infoc.setText("No structured data on input")
        self.infod.setText("")
    self.setGuiCommonExpChip()
    if self.commitOnChange:
        self.senddata(2)
def compressIndices(ma):
    """Returns 1D compressed Numeric array and the indices of the non-masked places.
    The indices refer to the flattened array.
    usage:  nu,ind = compressIndices(ma)
            nu = Numeric.elementwise_function(nu)
            MA.put(ma, ind, nu)
    """
    ma = MA.asarray(ma)
    # flat positions of all elements, reduced to those that are not masked
    known = Numeric.logical_not(Numeric.ravel(MA.getmaskarray(ma)))
    flatInd = Numeric.arange(Numeric.multiply.reduce(ma.shape))
    nonMaskedInd = Numeric.compress(known, flatInd)
    values = MA.filled(ma.compressed())
    return values, nonMaskedInd
def __extractLigandMatrix(self, fcomplex):
    """
    Compare structure from hex complex with original ligand pdb
    and return the transformation between the two, determined on the
    CA atoms. (The result is only returned; this method does not itself
    assign self.ligandMatrix.)

    @param fcomplex: pdb file with hex complex
    @type  fcomplex: complex

    @return: rotation matrix and translation matrix as tuple
    @rtype: (array, array)
    """
    docked_pdb = self._extractLigandStructure(fcomplex)

    ## select the CA rows of the coordinates; the axis is given
    ## explicitly so that whole atoms (rows) are selected
    xyz_docked = N.compress(docked_pdb.maskCA(), docked_pdb.xyz, 0)
    xyz_template = N.compress(self.lig_model.maskCA(),
                              self.lig_model.xyz, 0)

    (r, t) = self._findTransformation(xyz_docked, xyz_template)
    return (r, t)
def phi_and_psi(self, model):
    """
    Calculate phi and psi torsion angles for all residues in model and
    append them, chain by chain, to self.phi and self.psi::

      phi - rotation about the N-CA bond
          - last position in a chain = None
      psi - rotation about CA-C
          - first position in a chain = None

    NOTE(review): the atom quadruples used below (N, CA, C, N+1 for phi;
    C-1, N, CA, C for psi) look swapped with respect to the usual
    definition of the two angles - confirm before relying on the names.

    @param model: PDBModel
    @type  model: PDBModel
    """
    for chain in range(model.lenChains(breaks=1)):
        cModel = model.takeChains([chain], breaks=1)

        ## backbone atom coordinates of this chain
        coords = cModel.xyz
        ca = N.compress(cModel.maskCA(), coords, 0)
        n = N.compress(cModel.mask(['N']), coords, 0)
        c = N.compress(cModel.mask(['C']), coords, 0)
        nRes = len(n)

        ## phi: N - CA - C - N of next residue; undefined for last residue
        for i in range(nRes - 1):
            self.phi.append(self.dihedral(n[i], ca[i], c[i], n[i + 1]))
        self.phi.append(None)

        ## psi: C of previous residue - N - CA - C; undefined for first one
        self.psi.append(None)
        for i in range(1, nRes):
            self.psi.append(self.dihedral(c[i - 1], n[i], ca[i], c[i]))
def diagonalPut(m1d, m2d):
    """Puts the given 1D masked array into the diagonal of the given 2D masked array and returns the result
    as a 2D array of the original shape.
    NOTE(review): whether the input m2d stays untouched depends on MA.asarray / MA.ravel returning copies - confirm.
    """
    m1d = MA.asarray(m1d)
    m2d = MA.asarray(m2d)
    assert MA.rank(m1d) == 1 and MA.rank(m2d) == 2, "1D and 2D masked array expected"
    assert m1d.shape[0] == m2d.shape[0] == m2d.shape[1], "the shape of the given arrays does not match"
    origShape = m2d.shape
    # flat positions of the diagonal elements
    onDiagonal = Numeric.ravel(Numeric.fromfunction(lambda i,j: i==j, origShape))
    flatInd = Numeric.arange(0, Numeric.multiply.reduce(origShape), typecode=Numeric.Int)
    putIndices = Numeric.compress(onDiagonal, flatInd)
    # write into the flattened matrix, then restore the 2D shape
    flat = MA.ravel(m2d)
    MA.put(flat, putIndices, m1d)
    return MA.reshape(flat, origShape)
def plotContactDensity(self, step=1, cutoff=4.5):
    """
    Example. plot histogramm of contact density. Somehing wrong??

    The non-zero entries of self.averageContacts( step, cutoff ) are
    binned with hist.density( .., 10 ) and handed to gnuplot.

    @param step: passed on to averageContacts (default: 1)
    @type  step: int
    @param cutoff: passed on to averageContacts (default: 4.5)
    @type  cutoff: float

    @raise ComplexTrajError: if gnuplot program is not installed
    """
    if not gnuplot.installed:
        ## call form of raise, valid in Python 2 and 3
        raise ComplexTrajError('gnuplot program is not installed')

    r = self.averageContacts(step, cutoff)
    r = N.ravel(r)

    ## keep non-zero values only
    r = N.compress(r, r)

    gnuplot.plot(hist.density(r, 10))
def compareSequences(seqAA_1, seqAA_2):
    """
    Compare two sequences and return one mask per sequence.

    Both sequences are first reduced by the positions reported via
    getSkipLists / expandRepeats / del2mask, the remainders are compared
    once more and the positions reported as equal (getEqualLists /
    getEqual) are set to 1 in the masks. All other positions stay 0.

    @param seqAA_1: first sequence
    @type  seqAA_1: sequence of single letters (str or list)
    @param seqAA_2: second sequence
    @type  seqAA_2: sequence of single letters (str or list)

    @return: mask for sequence 1, mask for sequence 2 (original lengths)
    @rtype: array, array
    """
    aa_1 = list(seqAA_1)
    aa_2 = list(seqAA_2)

    ## original position of every letter
    nr_1 = range(len(aa_1))
    nr_2 = range(len(aa_2))

    ## result masks, nothing selected yet
    mask_1 = N.zeros(len(nr_1))
    mask_2 = N.zeros(len(nr_2))

    ## first comparison: regions to skip in either sequence
    skip_1, skip_2 = getSkipLists(getOpCodes(aa_1, aa_2))

    skip_1 = [expandRepeats(aa_1, *pos) for pos in skip_1]
    skip_2 = [expandRepeats(aa_2, *pos) for pos in skip_2]

    keep_1 = del2mask(aa_1, *skip_1)
    keep_2 = del2mask(aa_2, *skip_2)

    ## letters and their original positions are reduced in parallel
    aa_1 = N.compress(keep_1, aa_1).tolist()
    nr_1 = N.compress(keep_1, nr_1).tolist()
    aa_2 = N.compress(keep_2, aa_2).tolist()
    nr_2 = N.compress(keep_2, nr_2).tolist()

    ## second comparison: equal parts of the reduced sequences
    equal_1, equal_2 = getEqualLists(getOpCodes(aa_1, aa_2))
    aa_1, nr_1 = getEqual(aa_1, nr_1, equal_1)
    aa_2, nr_2 = getEqual(aa_2, nr_2, equal_2)

    ## mark the surviving original positions
    N.put(mask_1, nr_1, 1)
    N.put(mask_2, nr_2, 1)

    return mask_1, mask_2
def compareSequences(seqAA_1, seqAA_2):
    """
    Compare two sequences and return one mask per sequence.

    Both sequences are first reduced by the positions reported via
    getSkipLists / expandRepeats / del2mask, the remainders are compared
    once more and the positions reported as equal (getEqualLists /
    getEqual) are set to 1 in the masks. All other positions stay 0.

    @param seqAA_1: first sequence
    @type  seqAA_1: sequence of single letters (str or list)
    @param seqAA_2: second sequence
    @type  seqAA_2: sequence of single letters (str or list)

    @return: mask for sequence 1, mask for sequence 2 (original lengths)
    @rtype: array, array
    """
    aa_1 = list(seqAA_1)
    aa_2 = list(seqAA_2)

    ## original position of every letter
    nr_1 = range(len(aa_1))
    nr_2 = range(len(aa_2))

    ## result masks, nothing selected yet
    mask_1 = N.zeros(len(nr_1))
    mask_2 = N.zeros(len(nr_2))

    ## first comparison: regions to skip in either sequence
    skip_1, skip_2 = getSkipLists(getOpCodes(aa_1, aa_2))

    skip_1 = [expandRepeats(aa_1, *pos) for pos in skip_1]
    skip_2 = [expandRepeats(aa_2, *pos) for pos in skip_2]

    keep_1 = del2mask(aa_1, *skip_1)
    keep_2 = del2mask(aa_2, *skip_2)

    ## letters and their original positions are reduced in parallel
    aa_1 = N.compress(keep_1, aa_1).tolist()
    nr_1 = N.compress(keep_1, nr_1).tolist()
    aa_2 = N.compress(keep_2, aa_2).tolist()
    nr_2 = N.compress(keep_2, nr_2).tolist()

    ## second comparison: equal parts of the reduced sequences
    equal_1, equal_2 = getEqualLists(getOpCodes(aa_1, aa_2))
    aa_1, nr_1 = getEqual(aa_1, nr_1, equal_1)
    aa_2, nr_2 = getEqual(aa_2, nr_2, equal_2)

    ## mark the surviving original positions
    N.put(mask_1, nr_1, 1)
    N.put(mask_2, nr_2, 1)

    return mask_1, mask_2
def __init__(self, crv1, crv2):
    """Build the surface spanned between the two curves crv1 and crv2.

    Both curves are raised to a common degree and refined to a common
    knot vector; their control points then form the two columns of the
    surface coefficients. NOTE: crv1 and crv2 are modified in place
    (degelev / kntins).

    Raises NURBSError if one of the parameters is not a Crv.Crv instance.
    """
    if not isinstance(crv1, Crv.Crv):
        raise NURBSError('Parameter crv1 not derived from Crv class!')
    if not isinstance(crv2, Crv.Crv):
        raise NURBSError('Parameter crv2 not derived from Crv class!')
    # ensure both curves have a common degree
    d = max(crv1.degree, crv2.degree)
    crv1.degelev(d - crv1.degree)
    crv2.degelev(d - crv2.degree)
    # merge the knot vectors, to obtain a common knot vector
    k1 = crv1.uknots
    k2 = crv2.uknots
    # every distinct knot value of either curve; knots shared by both
    # curves are included so that a differing multiplicity is equalised
    # as well (equal multiplicities contribute no insertion below)
    ku = []
    for item in list(k1) + list(k2):
        if item not in ku:
            ku.append(item)
    ku = numerix.sort(numerix.asarray(ku, numerix.Float))
    n = ku.shape[0]
    # knots to insert into crv1 (ka) and crv2 (kb)
    ka = numerix.array([], numerix.Float)
    kb = numerix.array([], numerix.Float)
    for i in range(0, n):
        # multiplicity of the knot in either vector
        i1 = numerix.compress(numerix.equal(k1, ku[i]), k1).shape[0]
        i2 = numerix.compress(numerix.equal(k2, ku[i]), k2).shape[0]
        m = max(i1, i2)
        ka = numerix.concatenate((ka, ku[i] * numerix.ones((m - i1,), numerix.Float)))
        kb = numerix.concatenate((kb, ku[i] * numerix.ones((m - i2,), numerix.Float)))
    crv1.kntins(ka)
    crv2.kntins(kb)
    coefs = numerix.zeros((4, crv1.cntrl.shape[1], 2), numerix.Float)
    coefs[:, :, 0] = crv1.cntrl
    coefs[:, :, 1] = crv2.cntrl
    Srf.__init__(self, coefs, crv1.uknots, [0., 0., 1., 1.])
def loessMA(m, windowSize, axis=0, approxMasked=True, verbose=False, callback=None):
    """Returns a new array with values at the given axis smoothed by loess;
    if approxMasked==True: the masked values are approximated by loess;
    assumes equidistant spacing of points on the given axis.
    Profiles with fewer than 2 known values are left untouched; a profile
    for which loess fails is left untouched as well (a warning is printed
    if verbose). callback (if given) is called after each processed profile.
    """
    # convert first so that plain sequences are accepted as well
    m = MA.asarray(m)
    assert 0 < windowSize <= m.shape[axis]+0.1, "0 < windowSize[%s] <= 1 OR windowSize in range(1.1,m.shape[axis]+1) expected, got %f" % ("%", windowSize)
    if m.dtype.char != Numeric.Float:
        m = m.astype(Numeric.Float)
    nDim = len(m.shape)
    shp_other = list(m.shape)
    shp_other.pop(axis)
    # permutation that moves the given axis to the front, and its inverse;
    # the two only coincide for axis 0 and axis 1
    toFront = [axis] + list(range(0, axis)) + list(range(axis+1, nDim))
    toOrig = list(range(1, axis+1)) + [0] + list(range(axis+1, nDim))
    flatShape = (m.shape[axis], Numeric.multiply.reduce(shp_other))
    # get a transposed and reshaped mask and data from m; getmaskarray returns an array of zeros if m has no mask
    mask = Numeric.reshape(Numeric.transpose(MA.getmaskarray(m), toFront), flatShape)
    data = MA.reshape(MA.transpose(m, toFront), flatShape)
    # 1 where the value is known, 0 where it is masked
    maskInv = -1*(mask-1)
    xall = Numeric.arange(data.shape[0])
    xallList = xall.tolist()
    # run loess if the profile contains at least 2 values
    for ii in Numeric.compress(Numeric.add.reduce(maskInv,0) > 1, Numeric.arange(data.shape[1])):
        try:
            points = list(zip(MA.compress(maskInv[:,ii], xall).tolist(), MA.compress(maskInv[:,ii], data[:,ii]).tolist()))
            data[:,ii] = MA.array(statc.loess(points, xallList, windowSize))[:,1]
        except Exception:
            # best effort: keep the unsmoothed profile, but do not swallow
            # KeyboardInterrupt / SystemExit
            if verbose:
                print("Warning: loessMA: could not loess axis %i index %i" % (axis, ii))
        if callback:
            callback()
    if not approxMasked:
        data = MA.array(data, mask=mask)
    # undo the reshape, then move the smoothed axis back to its original place
    return MA.transpose(MA.reshape(data, [m.shape[axis]] + shp_other), toOrig)
def random2DArray(matrix, ranNr=1, mask=None):
    """
    Create randomized 2D array containing ones and zeros.

    The number of ones is taken from the (masked) input matrix. With a
    mask, only the selected positions are randomized; all other positions
    of the result are 0.

    @param matrix: matrix to randomize
    @type  matrix: 2D array
    @param mask: mask OR None (default: None); one entry per element of
                 the flattened matrix
    @type  mask: list(1|0)
    @param ranNr: number of matricies to add up (default: 1)
    @type  ranNr: integer

    @return: 2D array or |ranNr| added contact matricies
    @rtype: 2D array

    @raise MathUtilError: if mask does not fit matrix
    """
    ## get shape of matrix
    a, b = N.shape(matrix)
    flat = N.ravel(matrix)

    ## get array from matrix that is to be randomized; 'mask is None' is
    ## tested instead of the truth value, which is ambiguous for array
    ## masks and would mistake an all-zero mask for 'no mask'
    if mask is None:
        array = flat
    else:
        if len(mask) != len(flat):
            raise MathUtilError(
                'MatUtils.random2DArray - mask of incorrect length' +
                '\tMatrix length: %i Mask length: %i'\
                %(len( N.ravel(matrix) ), len(mask)))
        array = N.compress(mask, flat)

    ## number of ones and length of array
    nOnes = int(N.sum(array))
    lenArray = len(array)
    ranArray = N.zeros(lenArray)

    ## create random array
    for n in range(ranNr):
        ranArray += randomMask(nOnes, lenArray)

    if mask is None:
        return N.reshape(ranArray, (a, b))

    ## blow up to size of original matix
    r = N.zeros(a * b)
    N.put(r, N.nonzero(mask), ranArray)
    return N.reshape(r, (a, b))
def random2DArray(matrix, ranNr=1, mask=None):
    """
    Create randomized 2D array containing ones and zeros.

    The number of ones is taken from the (masked) input matrix. With a
    mask, only the selected positions are randomized; all other positions
    of the result are 0.

    @param matrix: matrix to randomize
    @type  matrix: 2D array
    @param mask: mask OR None (default: None); one entry per element of
                 the flattened matrix
    @type  mask: list(1|0)
    @param ranNr: number of matricies to add up (default: 1)
    @type  ranNr: integer

    @return: 2D array or |ranNr| added contact matricies
    @rtype: 2D array

    @raise MathUtilError: if mask does not fit matrix
    """
    ## get shape of matrix
    a, b = N.shape(matrix)
    flat = N.ravel(matrix)

    ## get array from matrix that is to be randomized; 'mask is None' is
    ## tested instead of the truth value, which is ambiguous for array
    ## masks and would mistake an all-zero mask for 'no mask'
    if mask is None:
        array = flat
    else:
        if len(mask) != len(flat):
            raise MathUtilError(
                'MatUtils.random2DArray - mask of incorrect length' +
                '\tMatrix length: %i Mask length: %i'\
                %(len( N.ravel(matrix) ), len(mask)))
        array = N.compress(mask, flat)

    ## number of ones and length of array
    nOnes = int(N.sum(array))
    lenArray = len(array)
    ranArray = N.zeros(lenArray)

    ## create random array
    for n in range(ranNr):
        ranArray += randomMask(nOnes, lenArray)

    if mask is None:
        return N.reshape(ranArray, (a, b))

    ## blow up to size of original matix
    r = N.zeros(a * b)
    N.put(r, N.nonzero(mask), ranArray)
    return N.reshape(r, (a, b))
def __init__(self, crv1, crv2):
    """Build the surface spanned between the two curves crv1 and crv2.

    Both curves are raised to a common degree and refined to a common
    knot vector; their control points then form the two columns of the
    surface coefficients. NOTE: crv1 and crv2 are modified in place
    (degelev / kntins).

    Raises NURBSError if one of the parameters is not a Crv.Crv instance.
    """
    if not isinstance(crv1, Crv.Crv):
        raise NURBSError('Parameter crv1 not derived from Crv class!')
    if not isinstance(crv2, Crv.Crv):
        raise NURBSError('Parameter crv2 not derived from Crv class!')
    # ensure both curves have a common degree
    d = max(crv1.degree, crv2.degree)
    crv1.degelev(d - crv1.degree)
    crv2.degelev(d - crv2.degree)
    # merge the knot vectors, to obtain a common knot vector
    k1 = crv1.uknots
    k2 = crv2.uknots
    # every distinct knot value of either curve; knots shared by both
    # curves are included so that a differing multiplicity is equalised
    # as well (equal multiplicities contribute no insertion below)
    ku = []
    for item in list(k1) + list(k2):
        if item not in ku:
            ku.append(item)
    ku = numerix.sort(numerix.asarray(ku, numerix.Float))
    n = ku.shape[0]
    # knots to insert into crv1 (ka) and crv2 (kb)
    ka = numerix.array([], numerix.Float)
    kb = numerix.array([], numerix.Float)
    for i in range(0, n):
        # multiplicity of the knot in either vector
        i1 = numerix.compress(numerix.equal(k1, ku[i]), k1).shape[0]
        i2 = numerix.compress(numerix.equal(k2, ku[i]), k2).shape[0]
        m = max(i1, i2)
        ka = numerix.concatenate((ka, ku[i] * numerix.ones((m - i1,), numerix.Float)))
        kb = numerix.concatenate((kb, ku[i] * numerix.ones((m - i2,), numerix.Float)))
    crv1.kntins(ka)
    crv2.kntins(kb)
    coefs = numerix.zeros((4, crv1.cntrl.shape[1], 2), numerix.Float)
    coefs[:, :, 0] = crv1.cntrl
    coefs[:, :, 1] = crv2.cntrl
    Srf.__init__(self, coefs, crv1.uknots, [0., 0., 1., 1.])
def pca( self, atomMask=None, frameMask=None, fit=1 ):
    """
    Singular value decomposition of the (mean-centred) trajectory frames.

    @param atomMask: 1 x N_atoms, [111001110..] atoms to consider
                     (default: all)
    @type  atomMask: [1|0]
    @param frameMask: 1 x N_frames, [001111..] frames to consider
                      (default all )
    @type  frameMask: [1|0]
    @param fit: if true, self.fit( atomMask ) is called first, which
                presumably superimposes the frames in place (default: 1)
    @type  fit: 1|0

    @return: U, V*L and L**2 of the decomposition V, L, U (described by
             the original author as: (N_frames x N_frames),
             (1 x N_frames), projection of each frame in PC space,
             eigenvalue of each PC)
    @rtype: array, array, array
    """
    if frameMask is None:
        frameMask = N.ones( len( self.frames ), N.int32 )

    if atomMask is None:
        atomMask = N.ones( self.getRef().lenAtoms(), N.int32 )

    if fit:
        self.fit( atomMask )

    ## the mean structure is taken over ALL frames, also unselected ones
    mean_xyz = N.average( self.frames, 0 )

    selected = N.compress( frameMask, self.frames, 0 ) - mean_xyz
    selected = N.compress( atomMask, selected, 1 )

    ## reduce to 2D array: one flat coordinate vector per frame
    flat = N.array( [ N.ravel( frame ) for frame in selected ] )

    V, L, U = LA.singular_value_decomposition( flat )

    projection = V * L
    eigenvalues = N.power( L, 2 )
    return U, projection, eigenvalues
def pca(self, atomMask=None, frameMask=None, fit=1):
    """
    Singular value decomposition of the (mean-centred) trajectory frames.

    @param atomMask: 1 x N_atoms, [111001110..] atoms to consider
                     (default: all)
    @type  atomMask: [1|0]
    @param frameMask: 1 x N_frames, [001111..] frames to consider
                      (default all )
    @type  frameMask: [1|0]
    @param fit: if true, self.fit(atomMask) is called first, which
                presumably superimposes the frames in place (default: 1)
    @type  fit: 1|0

    @return: U, V*L and L**2 of the decomposition V, L, U (described by
             the original author as: (N_frames x N_frames),
             (1 x N_frames), projection of each frame in PC space,
             eigenvalue of each PC)
    @rtype: array, array, array
    """
    if frameMask is None:
        frameMask = N.ones(len(self.frames), N.int32)

    if atomMask is None:
        atomMask = N.ones(self.getRef().lenAtoms(), N.int32)

    if fit:
        self.fit(atomMask)

    ## the mean structure is taken over ALL frames, also unselected ones
    mean_xyz = N.average(self.frames, 0)

    selected = N.compress(frameMask, self.frames, 0) - mean_xyz
    selected = N.compress(atomMask, selected, 1)

    ## reduce to 2D array: one flat coordinate vector per frame
    flat = N.array([N.ravel(frame) for frame in selected])

    V, L, U = LA.singular_value_decomposition(flat)

    projection = V * L
    eigenvalues = N.power(L, 2)
    return U, projection, eigenvalues
def area(curve, start=0.0, stop=1.0):
    """
    Numerically add up the area under the given curve (trapezoid rule).
    The curve is a 2-D array or list of tupples.
    The x-axis is the first column of this array (curve[:,0]), the
    y-axis the second one (curve[:,1]).

    (originally taken from Biskit.Statistics.ROCalyzer)

    @param curve: a list of x,y coordinates
    @type  curve: [ (x,y), ] or N.array
    @param start: lower boundary (in x) (default: 0.0)
    @type  start: float
    @param stop: upper boundary (in x) (default: 1.0)
    @type  stop: float

    @return: the area underneath the curve between start and stop.
    @rtype: float
    """
    ## convert and swap axes: pts[:,0] holds y, pts[:,1] holds x
    curve = N.array(curve)
    pts = N.zeros(N.shape(curve), curve.dtype)
    pts[:, 0] = curve[:, 1]
    pts[:, 1] = curve[:, 0]

    assert len(N.shape(pts)) == 2

    ## keep only points with start <= x <= stop
    ## here we have a problem with flat curves
    inside = N.greater_equal(pts[:, 1], start)
    inside *= N.less_equal(pts[:, 1], stop)
    pts = N.compress(inside, pts, axis=0)

    ## fill to boundaries -- not absolutely accurate: we actually should
    ## interpolate to the neighboring points instead
    first = N.array([[pts[0, 0], start], ])
    last = N.array([[pts[-1, 0], stop], ])
    pts = N.concatenate((first, pts, last))

    xs = pts[:, 1]
    ys = pts[:, 0]

    dx = xs[1:] - xs[:-1]  # distance on x between points
    dy = ys[1:] - ys[:-1]  # distance on y between points

    rectangles = ys[:-1] * dx   # the rectangles between all points
    triangles = dx * dy / 2.0   # the triangles between all points

    return N.sum(rectangles) + N.sum(triangles)
def anova2(self, ma3d, groupLens, addInteraction, repMeasuresOnA, callback): """Conducts two-way ANOVA on individual examples; returns a Numeric array of p-values in shape (2, numExamples) or (3, numExamples), depending whether we test for interaction; Note: levels of factors A and B that cause empty cells are removed prior to conducting ANOVA. """ groupLens = Numeric.asarray(groupLens) # arrays to store p-vals if addInteraction: ps = Numeric.ones((3, ma3d.shape[0]), Numeric.Float) else: ps = Numeric.ones((2, ma3d.shape[0]), Numeric.Float) # decide between non-repeated / repeated measures ANOVA for factor time if repMeasuresOnA: fAnova = Anova.AnovaRM12LR else: fAnova = Anova.Anova2wayLR # check for empty cells for all genes at once and remove them tInd2rem = [] ax2Ind = Numeric.concatenate(([0], Numeric.add.accumulate(groupLens))) for aIdx in range(ma3d.shape[1]): for rIdx in range(groupLens.shape[0]): if Numeric.add.reduce(MA.count(ma3d[:,aIdx,ax2Ind[rIdx]:ax2Ind[rIdx+1]],1)) == 0: tInd2rem.append(aIdx) break if len(tInd2rem) > 0: print "Warning: removing time indices %s for all genes" % (str(tInd2rem)) tInd2keep = range(ma3d.shape[1]) for aIdx in tInd2rem: tInd2keep.remove(aIdx) ma3d = ma3d.take(tInd2keep, 1) # for each gene... 
for eIdx in range(ma3d.shape[0]): # faster check for empty cells for that gene -> remove time indices with empty cells ma2d = ma3d[eIdx] cellCount = MA.zeros((ma2d.shape[0], groupLens.shape[0]), Numeric.Int) for g,(i0,i1) in enumerate(zip(ax2Ind[:-1], ax2Ind[1:])): cellCount[:,g] = MA.count(ma2d[:,i0:i1], 1) ma2dTakeInd = Numeric.logical_not(Numeric.add.reduce(Numeric.equal(cellCount,0),1)) # 1 where to take, 0 where not to take if Numeric.add.reduce(ma2dTakeInd) != ma2dTakeInd.shape[0]: print "Warning: removing time indices %s for gene %i" % (str(Numeric.compress(ma2dTakeInd == 0, Numeric.arange(ma2dTakeInd.shape[0]))), eIdx) ma2d = MA.compress(ma2dTakeInd, ma2d, 0) an = fAnova(ma2d, groupLens, addInteraction, allowReductA=True, allowReductB=True) ps[:,eIdx] = an.ps callback() return ps
def calc_rmsd(self, fitted_model_if, fitted_model_wo_if, reference, model):
    """
    Takes the two fitted structures (with and without iterative fitting),
    the known structure (reference), and the associated model inside the
    pdb_list. Calculates the different RMSD and stores them, together
    with the fraction of outlier atoms, in model.info.

    @param fitted_model_if: itteratively fitted model; must carry the
                            atom profile "rms_outliers"
    @type  fitted_model_if: PDBModel
    @param fitted_model_wo_if: normaly fitted model
    @type  fitted_model_wo_if: PDBModel
    @param reference: reference model
    @type  reference: PDBModel
    @param model: model receiving the "rmsd2ref_*" info entries
    @type  model: PDBModel
    """
    ## first calculate rmsd for heavy atoms and CA without
    ## removing any residues from the model
    mask_CA = fitted_model_wo_if.maskCA()

    rmsd_aa = fitted_model_wo_if.rms( reference, fit=0 )
    rmsd_ca = fitted_model_wo_if.rms( reference, mask=mask_CA, fit=1 )

    model.info["rmsd2ref_aa_wo_if"] = rmsd_aa
    model.info["rmsd2ref_ca_wo_if"] = rmsd_ca

    ## despite its name this mask is 1 for atoms that are NOT outliers
    outliers_mask = N.logical_not(fitted_model_if.profile("rms_outliers"))

    ## CA mask of the still complete model; it has the same length as
    ## outliers_mask and is needed for the CA outlier fraction below
    mask_CA_all = fitted_model_if.maskCA()

    ## Now remove the residues that were outliers in the iterative fit
    ## and calculate the rmsd again
    fitted_model_if = fitted_model_if.compress( outliers_mask )
    reference = reference.compress( outliers_mask )

    mask_CA = fitted_model_if.maskCA()

    rmsd_aa_if = fitted_model_if.rms( reference, fit=0 )
    rmsd_ca_if = fitted_model_if.rms( reference, mask=mask_CA, fit=1 )

    model.info["rmsd2ref_aa_if"] = rmsd_aa_if
    model.info["rmsd2ref_ca_if"] = rmsd_ca_if

    ## fraction of all atoms / of CA atoms flagged as outliers
    model.info["rmsd2ref_aa_outliers"] = 1.*(len(outliers_mask) \
                                          - N.sum(outliers_mask)) / len(outliers_mask)

    n_ca = N.sum(mask_CA_all)
    model.info["rmsd2ref_ca_outliers"] = 1.*(n_ca \
                                          - N.sum(N.compress(mask_CA_all, outliers_mask))) \
                                          / n_ca
def calc_rmsd(self, fitted_model_if, fitted_model_wo_if, reference, model):
    """
    Takes the two fitted structures (with and without iterative fitting),
    the known structure (reference), and the associated model inside the
    pdb_list. Calculates the different RMSD and stores them, together
    with the fraction of outlier atoms, in model.info.

    @param fitted_model_if: itteratively fitted model; must carry the
                            atom profile "rms_outliers"
    @type  fitted_model_if: PDBModel
    @param fitted_model_wo_if: normaly fitted model
    @type  fitted_model_wo_if: PDBModel
    @param reference: reference model
    @type  reference: PDBModel
    @param model: model receiving the "rmsd2ref_*" info entries
    @type  model: PDBModel
    """
    ## first calculate rmsd for heavy atoms and CA without
    ## removing any residues from the model
    mask_CA = fitted_model_wo_if.maskCA()

    rmsd_aa = fitted_model_wo_if.rms(reference, fit=0)
    rmsd_ca = fitted_model_wo_if.rms(reference, mask=mask_CA, fit=1)

    model.info["rmsd2ref_aa_wo_if"] = rmsd_aa
    model.info["rmsd2ref_ca_wo_if"] = rmsd_ca

    ## despite its name this mask is 1 for atoms that are NOT outliers
    outliers_mask = N.logical_not(fitted_model_if.profile("rms_outliers"))

    ## CA mask of the still complete model; it has the same length as
    ## outliers_mask and is needed for the CA outlier fraction below
    mask_CA_all = fitted_model_if.maskCA()

    ## Now remove the residues that were outliers in the iterative fit
    ## and calculate the rmsd again
    fitted_model_if = fitted_model_if.compress(outliers_mask)
    reference = reference.compress(outliers_mask)

    mask_CA = fitted_model_if.maskCA()

    rmsd_aa_if = fitted_model_if.rms(reference, fit=0)
    rmsd_ca_if = fitted_model_if.rms(reference, mask=mask_CA, fit=1)

    model.info["rmsd2ref_aa_if"] = rmsd_aa_if
    model.info["rmsd2ref_ca_if"] = rmsd_ca_if

    ## fraction of all atoms / of CA atoms flagged as outliers
    model.info["rmsd2ref_aa_outliers"] = 1.*(len(outliers_mask) \
                                          - N.sum(outliers_mask)) / len(outliers_mask)

    n_ca = N.sum(mask_CA_all)
    model.info["rmsd2ref_ca_outliers"] = 1.*(n_ca \
                                          - N.sum(N.compress(mask_CA_all, outliers_mask))) \
                                          / n_ca
def area(curve, start=0.0, stop=1.0 ):
    """
    Numerically add up the area under the given curve (trapezoid rule).
    The curve is a 2-D array or list of tupples.
    The x-axis is the first column of this array (curve[:,0]), the
    y-axis the second one (curve[:,1]).

    (originally taken from Biskit.Statistics.ROCalyzer)

    @param curve: a list of x,y coordinates
    @type  curve: [ (x,y), ] or N.array
    @param start: lower boundary (in x) (default: 0.0)
    @type  start: float
    @param stop: upper boundary (in x) (default: 1.0)
    @type  stop: float

    @return: the area underneath the curve between start and stop.
    @rtype: float
    """
    ## convert and swap axes: pts[:,0] holds y, pts[:,1] holds x
    curve = N.array( curve )
    pts = N.zeros( N.shape(curve), curve.dtype )
    pts[:,0] = curve[:,1]
    pts[:,1] = curve[:,0]

    assert len( N.shape( pts ) ) == 2

    ## keep only points with start <= x <= stop
    ## here we have a problem with flat curves
    inside = N.greater_equal( pts[:,1], start )
    inside *= N.less_equal( pts[:,1], stop )
    pts = N.compress( inside, pts, axis=0 )

    ## fill to boundaries -- not absolutely accurate: we actually should
    ## interpolate to the neighboring points instead
    first = N.array( [[pts[0,0], start],] )
    last = N.array( [[pts[-1,0], stop],] )
    pts = N.concatenate( (first, pts, last) )

    xs = pts[:,1]
    ys = pts[:,0]

    dx = xs[1:] - xs[:-1]  # distance on x between points
    dy = ys[1:] - ys[:-1]  # distance on y between points

    rectangles = ys[:-1] * dx   # the rectangles between all points
    triangles = dx * dy / 2.0   # the triangles between all points

    return N.sum( rectangles ) + N.sum( triangles )
def getFluct_global(self, mask=None):
    """
    Get the fluctuation of each atom around its average position in the
    trajectory: the distance of the atom from its mean position,
    averaged over all frames.
    The frames should be superimposed (fit() ) to a reference.

    @param mask: N x 1 list/Numpy array of 0|1, (N=atoms),
                 atoms to be considered.
    @type  mask: [1|0]

    @return: Numpy array ( N_unmasked x 1 ) of float.
    @rtype: array
    """
    frames = self.frames
    if mask is not None:
        frames = N.compress(mask, frames, 1)

    ## mean position of each atom in all frames; the axis is given
    ## explicitly because numpy's average flattens the array by default
    avg = N.average(frames, 0)

    ## distance of each atom from its mean position, frame by frame
    dist = N.sqrt(N.sum(N.power(frames - avg, 2), 2))

    return N.average(dist, 0)
def getFluct_global( self, mask=None ):
    """
    Get the fluctuation of each atom around its average position in the
    trajectory: the distance of the atom from its mean position,
    averaged over all frames.
    The frames should be superimposed (fit() ) to a reference.

    @param mask: N x 1 list/Numpy array of 0|1, (N=atoms),
                 atoms to be considered.
    @type  mask: [1|0]

    @return: Numpy array ( N_unmasked x 1 ) of float.
    @rtype: array
    """
    frames = self.frames
    if mask is not None:
        frames = N.compress( mask, frames, 1 )

    ## mean position of each atom in all frames; the axis is given
    ## explicitly because numpy's average flattens the array by default
    avg = N.average( frames, 0 )

    ## distance of each atom from its mean position, frame by frame
    dist = N.sqrt( N.sum( N.power( frames - avg, 2 ), 2 ) )

    return N.average( dist, 0 )
def data(self, data):
    """Input handler: store the new example table and refresh the info
    labels.

    Keeps the table in self._data and its masked-array form (first item
    of data.toNumpyMA("a")) in self._dataMA, reports the number of
    profiles / points and a summary of missing values, records the
    number of rows with missing values in self.numRowsMissing, then
    updates the GUI and, if self.commitOnChange is set, sends the data.
    Passing None clears the stored data.
    """
    if data is not None:
        self._data = data
        self._dataMA = data.toNumpyMA("a")[0]
        # info text
        self.infoa.setText("Examples: %i profiles on %i points" %
                           (self._dataMA.shape[0], self._dataMA.shape[1]))
        numTotalMissing = int(
            Numeric.multiply.reduce(self._dataMA.shape) - MA.count(self._dataMA))
        if numTotalMissing > 0:
            # columns without a single value are counted separately and
            # ignored when looking for partially missing rows
            numValsByCol = MA.count(self._dataMA, 0)
            numEmptyCol = Numeric.add.reduce(
                Numeric.where(numValsByCol == 0, 1, 0))
            colNonEmpty = Numeric.compress(
                numValsByCol != 0, Numeric.arange(self._dataMA.shape[1]))
            dataRemEmptyCol = self._dataMA.take(colNonEmpty, 1)
            self.numRowsMissing = Numeric.add.reduce(
                Numeric.where(
                    MA.count(dataRemEmptyCol, 1) < dataRemEmptyCol.shape[1],
                    1, 0))
            s1 = ""
            s2 = ""
            if numEmptyCol > 0:
                s1 = "s"
            if self.numRowsMissing > 0:
                s2 = "s"
            self.infob.setText(
                "missing %i values, %i column%s completely, %i row%s partially"
                % (numTotalMissing, numEmptyCol, s1, self.numRowsMissing, s2))
        else:
            # nothing missing: do not keep a count from a previous input
            self.numRowsMissing = 0
            self.infob.setText("")
    else:
        self._data = None
        self._dataMA = None
        self.infoa.setText("No examples on input")
        self.infob.setText("")
        self.numRowsMissing = 0
    self.setGuiCommonExpChip()
    if self.commitOnChange:
        self.senddata(1)
def test_Ramachandran(self):
    """Ramachandran test"""
    traj_file = T.testRoot() + '/lig_pcr_00/traj.dat'
    self.traj = T.load(traj_file)

    ## the plot is built with the atom profile 'mass'
    self.traj.ref.atoms.set('mass', self.traj.ref.masses())

    ## two frames of the trajectory, reduced to protein atoms
    self.mdl = [self.traj[0], self.traj[11]]
    self.mdl = [md.compress(md.maskProtein()) for md in self.mdl]

    self.rama = Ramachandran(self.mdl, name='test', profileName='mass',
                             verbose=self.local)

    self.psi = N.array(self.rama.psi)

    if self.local:
        self.rama.show()

    ## sum over all psi entries that are not None
    defined = N.logical_not(N.equal(self.psi, None))
    r = N.sum(N.compress(defined, self.psi))
    self.assertAlmostEqual(r, -11717.909796797909, 2)
def outliers( a, z=5, it=5 ):
    """
    Iterative detection of outliers in a set of numeric values.
    Requirement: len(a) > 0; outlier detection is only performed if
    len(a)>2. Median and SD are re-calculated in each round without the
    values flagged in the previous round.

    @param a: array or list of values
    @type  a: [ float ]
    @param z: z-score threshold for iterative refinement of median and SD
    @type  z: float
    @param it: maximum number of iterations; with it < 1 no detection is
               performed and median and SD of all values are returned
    @type  it: int

    @return: outlier mask, median and standard deviation of last iteration
    @rtype: N.array( int ), float, float
    """
    assert( len(a) > 0 )
    out = N.zeros( len(a) )

    ## statistics of the complete set -- result if no iteration is run
    me = N.median( a )
    sd = N.std( a )

    if len(a) < 3:
        return out, me, sd

    for i in range( it ):

        ## median and SD of everything not flagged so far
        b = N.compress( N.logical_not(out), a )
        me = N.median( b )
        sd = N.std( b )

        bz = N.absolute((N.array( a ) - me) / sd)  # pseudo z-score of each value
        o = bz > z

        ## stop if converged or reached bottom
        if (N.sum(o) == N.sum(out)) or (N.sum(o) > len(a) - 3):
            return o, me, sd

        out = o

    return out, me, sd
def outliers(a, z=5, it=5):
    """
    Iterative detection of outliers in a set of numeric values.
    Requirement: len(a) > 0; outlier detection is only performed if
    len(a)>2. Median and SD are re-calculated in each round without the
    values flagged in the previous round.

    @param a: array or list of values
    @type  a: [ float ]
    @param z: z-score threshold for iterative refinement of median and SD
    @type  z: float
    @param it: maximum number of iterations; with it < 1 no detection is
               performed and median and SD of all values are returned
    @type  it: int

    @return: outlier mask, median and standard deviation of last iteration
    @rtype: N.array( int ), float, float
    """
    assert (len(a) > 0)
    out = N.zeros(len(a))

    ## statistics of the complete set -- result if no iteration is run
    me = N.median(a)
    sd = N.std(a)

    if len(a) < 3:
        return out, me, sd

    for i in range(it):

        ## median and SD of everything not flagged so far
        b = N.compress(N.logical_not(out), a)
        me = N.median(b)
        sd = N.std(b)

        bz = N.absolute((N.array(a) - me) / sd)  # pseudo z-score of each value
        o = bz > z

        ## stop if converged or reached bottom
        if (N.sum(o) == N.sum(out)) or (N.sum(o) > len(a) - 3):
            return o, me, sd

        out = o

    return out, me, sd
def condition2indices(condition):
    """Return the positions at which a 1-D condition is true.

    Input: condition=[1,0,0,1]; output: indices=[0,3]
    An AssertionError is raised if condition is not one-dimensional.
    """
    flags = Numeric.asarray(condition)
    assert len(flags.shape) == 1
    positions = Numeric.arange(flags.shape[0])
    return Numeric.compress(flags, positions)
def estimate_reference_single(entry, stats, bounds, ref=0.0, verbose=False, exclude=None, entry_name=None, atom_type='H', exclude_outliers=False,molType='protein'): A = 0. B = 0. S = 0. N = 1 ## loop through all atom types classes = decompose_classes(entry, bounds, atom_type,molType=molType) if exclude and not entry_name: raise TypeError, 'attribute entry_name needs to be set.' n_excluded = 0 n_total = 0 for key, shifts in classes.items(): ## print entry_name, key if not key in stats: if verbose: print key,'no statistics.' continue if exclude and (entry_name, key) in exclude: print entry_name, key, 'excluded from ref estimation.' continue ## get statistics for current atom type mu, sd = stats[key][:2] k = 1./sd**2 if exclude_outliers is not False: ## calculate Z scores and exclude shifts with high Z scores from analysis Z = abs(shifts-mu)/sd mask_include = Numeric.less(Z, exclude_outliers) shifts = Numeric.compress(mask_include, shifts) n_excluded += len(Z)-Numeric.sum(mask_include) n_total += len(Z) n = len(shifts) if not n: continue A += k*n*(median(shifts)-mu) B += k*n S += -0.5*len(shifts)*Numeric.log(k)+0.5*k*sum((Numeric.array(shifts)-mu-ref)**2) N += n if B > 0.: ref_mu = A/B ref_sd = 1./Numeric.sqrt(B) else: ref_mu = None ref_sd = None if exclude_outliers is not False and n_excluded == n_total: print '%d/%d outliers discarded' % (n_excluded, n_total) return ref_mu, ref_sd, S/N
#-------------------- Limiting Magnitude Section --------------------# # N.B. This section merely determines the limiting magnitude, if it can. # The limMag value is currently not used. Hence the try/except clause. #To define the typical 1-sigma limiting magnitude (which now changes from object to object), #we take the median of the 5% faintest 1-sigma fluxerror_auto fluxes. This is obviously #biased, but shows which is the depth in the deepest part of the images with the current #SExtractor parameters, etc. # one_sigma_mags = Numeric.compress(Numeric.less_equal(abs(em[i,:]-0.7526),0.02),m[i,:]) # n_one = len(one_sigma_mags) # test snippet: dm = 0.02 n_one = 0 while n_one < 21: one_sigma_mags = Numeric.compress( Numeric.less_equal(abs(em[i, :] - 0.7526), dm), m[i, :]) n_one = len(one_sigma_mags) dm += 0.01 if dm > 0.03 and dm < 1.1: message = "Warning: not enough 1-sigma objects in the catalog. Using dm=+-%.2f" % dm self.logfile.write(message) if dm >= 1.1: message = "Warning: Stopped searching for 1-sigma objects at dm=+-%.2f" % dm self.logfile.write(message) break try: limMag = MLab.median(Numeric.sort(one_sigma_mags)[-20:]) self.logfile.write("Limiting Mag " + basefits + ":" + str(limMag)) print "Limiting Mag " + basefits + ":" + str(limMag)
def go(self, model_list = None, reference = None):
    """
    Run benchmarking.

    Loads the models and the reference, reduces both to the atoms they
    have in common and that belong to residues with 1 to 1000 templates
    (profile "n_templates"), fits every model to the reference, stores
    the rmsd values and writes the result files.

    @param model_list: list of models
                       (default: None S{->} outFolder/L{F_PDBModels})
    @type  model_list: ModelList
    @param reference: reference model
                    (default: None S{->} outFolder/L{F_INPUT_REFERENCE})
    @type  reference: PDBModel
    """
    model_list = model_list or self.outFolder + self.F_PDBModels
    reference = reference or self.outFolder + self.F_INPUT_REFERENCE

    pdb_list = T.load('%s'%model_list)
    reference = PDBModel(reference)

    # check with python 2.4
    iref, imodel = reference.compareAtoms(pdb_list[0])

    ## 1 for every model atom that also exists in the reference
    mask_casting = N.zeros(len(pdb_list[0]))
    N.put(mask_casting, imodel, 1)

    reference = reference.take(iref)

    ## atoms of residues that are covered by templates
    rmask = pdb_list[0].profile2mask("n_templates", 1,1000)
    amask = pdb_list[0].res2atomMask(rmask)

    ## the same selection, once in reference and once in model numbering
    mask_final_ref = N.compress(mask_casting, amask)
    mask_final = mask_casting * amask

    reference = reference.compress(mask_final_ref)

    for i in range(len(pdb_list)):

        pdb_list[i], pdb_wo_if = self.output_fittedStructures(\
            pdb_list[i], reference, i, mask_final)

        fitted_model_if = pdb_list[i].compress(mask_final)
        fitted_model_wo_if = pdb_wo_if.compress(mask_final)

        coord1 = reference.getXyz()
        coord2 = fitted_model_if.getXyz()

        aprofile = self.rmsd_res(coord1,coord2)

        self.calc_rmsd(fitted_model_if, fitted_model_wo_if,
                       reference, pdb_list[i])

        pdb_list[i].atoms.set('rmsd2ref_if', aprofile,
                              mask=mask_final, default = -1,
                              comment="rmsd to known reference structure")

    self.output_rmsd_aa(pdb_list)
    self.output_rmsd_ca(pdb_list)
    self.output_rmsd_res(pdb_list)

    self.write_PDBModels(pdb_list)
def makeMap( self, maxPerCenter=4 ):
    """
    Calculate mapping between complete and reduced atom list.
    Stores the result in self.groups (list of lists of atom indices into
    the original model) and self.atoms (the new center atoms, one per
    group). Each residue yields one 'BB' center and, unless it is GLY or
    ALA, side chain centers 'SC0', 'SC1', ..

    @param maxPerCenter: max number of atoms per side chain center atom
                         (default: 4)
    @type  maxPerCenter: int
    """
    m = self.m_sorted
    resIndex = self.m_sorted.resIndex()
    resModels= self.m_sorted.resModels()

    self.currentAtom = 0

    groups = []
    atoms = DictList()

    n_res = len( resIndex )

    for i in range( n_res ):

        ## a residue ends where the next one starts (or at the last atom)
        first_atom = resIndex[ i ]
        if i < n_res - 1:
            last_atom = resIndex[ i+1 ] - 1
        else:
            last_atom = len( self.a_indices ) - 1

        a = m.atoms[ first_atom ]

        ## position of this residue's atoms in original PDBModel (unsorted)
        a_indices = self.a_indices[ first_atom : last_atom+1 ]

        ## for each center create list of atom indices and a center atom
        if a['residue_name'] in ('GLY', 'ALA'):
            ## everything goes into the backbone center
            bb_a_indices = a_indices
            sc_groups = []
        else:
            bb_a_indices = N.compress( resModels[i].maskBB(), a_indices )
            sc_a_indices = N.compress(
                N.logical_not( resModels[i].maskBB() ), a_indices )
            sc_groups = self.group( sc_a_indices, maxPerCenter )

        groups.append( bb_a_indices )
        atoms += [ self.nextAtom( a, 'BB' ) ]

        for n, g in enumerate( sc_groups ):
            groups.append( g )
            atoms += [ self.nextAtom( a, 'SC%i'%n ) ]

    self.groups = groups
    self.atoms = atoms
def getPositions(m, val):
    """Input: arbitrary (masked) array and a value from that array;
    Output: array of positions of the given value in a flat m;
    """
    m = MA.asarray(m)
    # total number of elements = product of all dimensions
    numElements = Numeric.multiply.reduce(m.shape)
    hits = MA.equal(MA.ravel(m), val)
    return Numeric.compress(hits, Numeric.arange(numElements))
def histogram2d_2(data, bins, xrange = None, yrange = None):
    """
    Two-dimensional histogram, built as one x-histogram per y-slice.

    The y-range is cut into bins[1] slices; the points of each slice are
    binned along x with histogram() (bins[0] bins, range = xrange).
    Slices without points get bins[0] zero counts.

    @param data: points, array of shape (n,2) with columns x, y
    @type  data: list or array
    @param bins: number of bins, either one number for both dimensions
                 or (bins_x, bins_y)
    @type  bins: int OR list OR tuple
    @param xrange: passed on as range to histogram() (default: None)
    @param yrange: (min, max) of the y-slices
                   (default: None, min and max of the y values)

    @return: X, Y, N -- for each slice the x values and (constant) y
             value of the bins and the number of points per bin
    @rtype: array, array, array

    @raise TypeError: if data cannot be converted into a float array or
                      bins is neither int, list nor tuple
    @raise ValueError: if data is not two-dimensional
    """
    try:
        data = Numeric.array(data, Float)
    except (TypeError, ValueError):
        raise TypeError('data: list or array expected, %s given'
                        % str(type(data)))

    if not len(shape(data)) == 2:
        raise ValueError('shape of data array must be (n,2)')

    if type(bins) == type(0):
        bins = (bins, bins)
    elif not type(bins) in (type([]), type(())):
        raise TypeError('bins: int, list or tuple expected. %s given'
                        % str(type(bins)))

    if yrange is None:
        yrange = (min(data[:,1]), max(data[:,1]))

    x_min = min(data[:,0])
    x_max = max(data[:,0])
    x_spacing = (x_max - x_min) / bins[0]

    ystep = abs(yrange[1] - yrange[0]) / float(bins[1])

    X = []
    Y = []
    counts = []

    ## y runs over the upper border of each slice
    for y in arange(yrange[0] + ystep , yrange[1] + ystep, ystep):

        ## collect the remaining values up to y; lower slices have
        ## already been removed from data
        mask = less_equal(data[:,1], y)
        rows = compress(mask, data, 0)

        ## create histogram for x-dimension
        if shape(rows[:,0])[0]:
            x_histogram = histogram(rows[:,0], bins[0], range = xrange)
        else:
            ## empty slice: bin centers with zero counts
            x_bins = arange(x_min + x_spacing / 2.,
                            x_max + x_spacing / 2., x_spacing)

            ## no. of x_bins might be larger as it should be
            ## (due to numerical errors).
            if shape(x_bins)[0] - 1 == bins[0]:
                x_bins = x_bins[:-1]

            x_histogram = Numeric.concatenate((x_bins[:,NewAxis],
                                               zeros((bins[0],1))), 1)

        ## append #point per cell (x_i, y_i, n_i)
        X.append(x_histogram[:,0])
        counts.append(x_histogram[:,1])
        s = ones(shape(x_histogram)[0]) * (y - ystep / 2.)
        Y.append(s)

        ## discard processed data
        data = Numeric.compress(Numeric.logical_not(mask), data, 0)

    return Numeric.array(X), Numeric.array(Y), Numeric.array(counts)
def _removeDuplicateChains(self, chainMask=None):
    """
    Get rid of identical chains by comparing all chains with Blast2seq.

    Without a chain mask, the chains are cut off at the first offset at
    which the chain-against-chain identities (a diagonal of the identity
    matrix) average to at least self.threshold.

    @param chainMask: chain mask for overriding the
                      chain identity checking (default: None)
    @type  chainMask: [int]

    @return: number of chains removed
    @rtype: int
    """
    chainCount = len(self.chains)
    matrix = 1.0 * N.zeros((chainCount,chainCount))

    # collect for log file
    chain_ids = [chain.chain_id for chain in self.chains]

    ## convert 3-letter-code res list into 1-letter-code String,
    ## once per chain instead of once per compared pair
    seqs = [singleAA( chain.sequence() ) for chain in self.chains]

    ## create identity matrix for all chains against all chains
    ## (upper triangle only)
    for i in range(chainCount):
        for j in range(i, chainCount):
            matrix[i,j] = self._compareSequences( seqs[i], seqs[j] )

    ## report activity
    self.log.add("\n  Chain ID's of compared chains: "+str(chain_ids))
    self.log.add("  Cross-Identity between chains:\n"+str(matrix))
    self.log.add("  Identity threshold used: "+str(self.threshold))

    ## an empty mask counts as "no mask"; testing the length keeps this
    ## working for array masks, whose truth value is ambiguous
    useMask = chainMask is not None and len(chainMask) > 0

    if useMask:
        ## override the automatic chain deletion by supplying a
        ## chain mask to this function
        if len(chainMask) == chainCount:
            self.chains = N.compress(chainMask, self.chains)
            self.log.add("NOTE: chain mask %s used for removing chains.\n"%chainMask)
        else:
            self.log.add("########## ERROR ###############")
            self.log.add("# Chain mask is only %i chains long"%len(chainMask))
            self.log.add("# when a mask of length %i is needed"%chainCount)
            self.log.add("# No cleaning will be performed.\n")
    else:
        ## look at diagonals in "identity matrix"
        ## (each chain against each)
        duplicate = len(self.chains)
        for offset in range(1,chainCount):
            diag = N.diagonal(matrix, offset ,0,1)
            # diagonal of 1's mark begin of duplicate
            avg = 1.0 * N.sum(diag)/len(diag)
            if (avg >= self.threshold):
                duplicate = offset
                break
        self.chains = self.chains[:duplicate]
        self.log.add("NOTE: Identity matrix will be used for removing identical chains.")

    ## report activit
    self.log.add(str(chainCount - len(self.chains))+\
                 " chains have been removed.\n")

    # how many chains have been removed?
    return (chainCount - len(self.chains))