def arc_by_radian(x, y, height, radian_range, thickness, gaussian_width):
    """
    Segment of a ring with Gaussian fall-off on both sides of the solid band.

    The ring has diameter `height` and a solid band `thickness` wide;
    outside the band the value decays as a Gaussian with standard
    deviation `gaussian_width` (no fall-off at all when it is 0).  Only
    the part of the ring whose angle lies within the (start, end)
    `radian_range` is kept, everything else is set to 0.0.  Angles run
    from 0 to 2*pi, starting at the positive x axis and increasing
    towards the negative y axis; if start > end the range wraps around
    through 2*pi.
    """
    # Ring construction (same approach as the ring function)
    radius = height/2.0
    half_thickness = thickness/2.0
    dist = sqrt(x**2+y**2)

    beyond_outer = dist - radius - half_thickness
    within_inner = radius - half_thickness - dist

    # 1.0 inside the solid band, 0.0 elsewhere
    in_hole = greater_equal(within_inner, 0.0)
    past_rim = greater_equal(beyond_outer, 0.0)
    ring = 1.0-bitwise_xor(in_hole, past_rim)

    sigmasq = gaussian_width*gaussian_width

    if sigmasq != 0.0:
        with float_error_ignore():
            inner_falloff = exp(divide(-within_inner*within_inner, 2.0*sigmasq))
            outer_falloff = exp(divide(-beyond_outer*beyond_outer, 2.0*sigmasq))
    else:
        inner_falloff = x*0.0
        outer_falloff = x*0.0

    output_ring = maximum(inner_falloff, maximum(outer_falloff, ring))

    # Angle of every point, assembled quadrant by quadrant.
    #
    # HACK: float_error_ignore() does not suppress the error raised for
    # 0.0/0.0, and x/0.0 yields Inf rather than 0.0.  The distance can
    # only be 0.0 at the circle centre, where x and y are 0.0 as well,
    # so that single value is nudged to 1e-5 before dividing.
    dist += where(dist == 0.0, 1e-5, 0)

    with float_error_ignore():
        sines = divide(y, dist)
        cosines = divide(x, dist)
        arcsines = arcsin(sines)

    sin_pos = sines >= 0
    sin_neg = sines < 0
    cos_pos = cosines >= 0
    cos_neg = cosines < 0

    angles = (where(logical_and(sin_pos, cos_pos), 2*pi-arcsines, 0)
              + where(logical_and(sin_pos, cos_neg), pi+arcsines, 0)
              + where(logical_and(sin_neg, cos_neg), pi+arcsines, 0)
              + where(logical_and(sin_neg, cos_pos), -arcsines, 0))

    start, end = radian_range[0], radian_range[1]
    if start <= end:
        selected = logical_and(angles >= start, angles <= end)
    else:
        # wrap-around range
        selected = logical_or(angles >= start, angles <= end)
    return where(selected, output_ring, 0.0)
def kNNimputeMA(arr2d, K=20, callback=None):
    """Return a new 2D MA.array with missing values imputed from K nearest neighbours.

    For every row that is only partially known, the other rows are ranked
    by a root-mean-square difference over the jointly known columns.  Each
    missing value becomes the weighted average of that column over the K
    nearest rows that have a value there; weights follow a tricubic
    function of the distances to all rows.
    Rows without any known value are replaced by the average over all rows.
    Columns without any known value are left untouched.
    `callback`, if given, is called once per processed row.
    Version: 30.8.2005
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # result array; the input is only read
    imputed = MA.array(arr2d)

    # columns with at least one known value
    columnCond = Numeric.greater(MA.count(arr2d, axis=0), 0)
    columnIndAll = Numeric.arange(arr2d.shape[1])
    numKnownColumns = Numeric.compress(columnCond, columnIndAll).shape[0]

    countByRows = MA.count(arr2d, axis=1)

    # rows with 0 < #known_values < #known_columns
    partialCond = Numeric.logical_and(Numeric.greater(countByRows, 0),
                                      Numeric.less(countByRows, numKnownColumns))
    for rowIdx in Numeric.compress(partialCond, Numeric.arange(arr2d.shape[0])):
        diff = arr2d - MA.resize(arr2d[rowIdx], arr2d.shape)
        sumSquares = MA.add.reduce((diff)**2, 1)
        distances = MA.sqrt(sumSquares / MA.count(diff, axis=1))

        # neighbours ordered by distance; the first one is dropped as it
        # is taken to be the current row itself
        indSorted = MA.argsort(distances)[1:]
        distSorted = distances.take(indSorted)

        # number of distances different from MA.masked
        numMasked = Numeric.add.reduce(
            Numeric.asarray(MA.getmaskarray(distSorted), Numeric.Int))
        numNonMasked = distSorted.shape[0] - numMasked

        if numNonMasked > 1:
            # tricubic weights, scaled by the largest non-masked distance
            relDist = distSorted / distSorted[numNonMasked-1]
            weightsSorted = MA.power(1 - MA.power(relDist, 3), 3)
        else:
            weightsSorted = Numeric.ones(distSorted.shape[0])

        # every missing column is averaged separately so that K values
        # that are actually known can be used
        missingCond = Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), columnCond)
        for colIdx in Numeric.compress(missingCond, columnIndAll):
            columnVals = arr2d[:, colIdx].take(indSorted)
            knownCond = 1 - MA.getmaskarray(columnVals)
            weightsKnown = MA.compress(knownCond, weightsSorted)
            imputed[rowIdx, colIdx] = MA.average(columnVals.compressed()[:K],
                                                 weights=weightsKnown[:K])
        if callback:
            callback()

    # completely unknown rows get the average profile
    avrgRow = MA.average(arr2d, 0)
    for rowIdx in Numeric.compress(Numeric.equal(countByRows, 0),
                                   Numeric.arange(arr2d.shape[0])):
        imputed[rowIdx] = avrgRow
        if callback:
            callback()
    return imputed
def histogram(data, nbins, range=None):
    """
    Create a histogram.
    Comes from Konrad Hinsen: Scientific Python

    @param data: data list or array
    @type  data: [any]
    @param nbins: number of bins
    @type  nbins: int
    @param range: data range to create histogram from (min val, max val);
                  values outside this range are discarded
    @type  range: (float, float) OR None

    @return: array (nbins x 2) with the center of each bin and the
             number of values that fall into it
    @rtype: array
    """
    data = Numeric.array(data, Numeric.Float)

    if range is None:
        lower = Numeric.minimum.reduce(data)
        upper = Numeric.maximum.reduce(data)
    else:
        lower, upper = range
        in_range = Numeric.logical_and(Numeric.less_equal(data, upper),
                                       Numeric.greater_equal(data, lower))
        data = Numeric.repeat(data, in_range)

    # float(nbins) forces true division: with an integer range, Python 2
    # would otherwise truncate the bin width and values would be lost
    bin_width = (upper - lower) / float(nbins)

    data = Numeric.floor((data - lower) / bin_width).astype(Numeric.Int)
    histo = Numeric.add.reduce(
        Numeric.equal(Numeric.arange(nbins)[:, Numeric.NewAxis], data), -1)

    # values equal to the upper limit get index nbins; count them in the
    # last bin
    histo[-1] = histo[-1] + Numeric.add.reduce(Numeric.equal(nbins, data))

    bins = lower + bin_width * (Numeric.arange(nbins) + 0.5)
    return Numeric.transpose(Numeric.array([bins, histo]))
def histogram(data, nbins, range = None):
    """
    Create a histogram of data with nbins equally wide bins.
    Comes from Konrad Hinsen: Scientific Python

    data  - list or array of numbers
    nbins - number of bins
    range - optional (min, max) tuple; values outside are discarded.
            Without it the smallest and largest value of data are used.

    Returns an array (nbins x 2) holding, for each bin, its center and
    the number of values that fall into it.
    """
    data = Numeric.array(data, Numeric.Float)
    if range is None:
        low = Numeric.minimum.reduce(data)
        high = Numeric.maximum.reduce(data)
    else:
        low, high = range
        keep = Numeric.logical_and(Numeric.less_equal(data, high),
                                   Numeric.greater_equal(data, low))
        data = Numeric.repeat(data, keep)
    # end if

    # divide by float(nbins): an integer range would otherwise give a
    # truncated (integer) bin width under Python 2
    bin_width = (high-low)/float(nbins)

    data = Numeric.floor((data - low)/bin_width).astype(Numeric.Int)
    histo = Numeric.add.reduce(Numeric.equal(
        Numeric.arange(nbins)[:,Numeric.NewAxis], data), -1)
    # values equal to the upper limit land on index nbins; add them to
    # the last bin
    histo[-1] = histo[-1] + Numeric.add.reduce(Numeric.equal(nbins, data))
    bins = low + bin_width*(Numeric.arange(nbins)+0.5)
    return Numeric.transpose(Numeric.array([bins, histo]))
def histogram(data, nbins, range=None):
    """
    Create a histogram of data with nbins equally wide bins.
    Comes from Konrad Hinsen: Scientific Python

    data  - list or array of numbers
    nbins - number of bins
    range - optional (min, max) tuple; values outside are discarded.
            Without it the smallest and largest value of data are used.

    Returns an array (nbins x 2) holding, for each bin, its center and
    the number of values that fall into it.
    """
    data = Numeric.array(data, Numeric.Float)

    if range is None:
        range_min = Numeric.minimum.reduce(data)
        range_max = Numeric.maximum.reduce(data)
    else:
        range_min, range_max = range
        inside = Numeric.logical_and(Numeric.less_equal(data, range_max),
                                     Numeric.greater_equal(data, range_min))
        data = Numeric.repeat(data, inside)
    # end if

    # float(nbins) keeps the bin width from being truncated by Python 2
    # integer division when range and nbins are all integers
    bin_width = (range_max - range_min) / float(nbins)

    data = Numeric.floor((data - range_min) / bin_width).astype(Numeric.Int)
    histo = Numeric.add.reduce(
        Numeric.equal(Numeric.arange(nbins)[:, Numeric.NewAxis], data), -1)

    # index nbins is only reached by values equal to the upper limit;
    # they are counted in the last bin
    histo[-1] = histo[-1] + Numeric.add.reduce(Numeric.equal(nbins, data))

    bins = range_min + bin_width * (Numeric.arange(nbins) + 0.5)
    return Numeric.transpose(Numeric.array([bins, histo]))
def histogram(data, nbins, range = None):
    """
    Create a histogram.
    Comes from Konrad Hinsen: Scientific Python

    @param data: data list or array
    @type  data: [any]
    @param nbins: number of bins
    @type  nbins: int
    @param range: data range to create histogram from (min val, max val);
                  values outside this range are discarded
    @type  range: (float, float) OR None

    @return: array (nbins x 2) with the center of each bin and the
             number of values that fall into it
    @rtype: array
    """
    data = Numeric.array(data, Numeric.Float)

    if range is None:
        vmin = Numeric.minimum.reduce(data)
        vmax = Numeric.maximum.reduce(data)
    else:
        vmin, vmax = range
        mask = Numeric.logical_and(Numeric.less_equal(data, vmax),
                                   Numeric.greater_equal(data, vmin))
        data = Numeric.repeat(data, mask)

    # true division even for an integer range (Python 2 would truncate
    # the bin width otherwise)
    bin_width = (vmax-vmin)/float(nbins)

    data = Numeric.floor((data - vmin)/bin_width).astype(Numeric.Int)
    histo = Numeric.add.reduce(Numeric.equal(
        Numeric.arange(nbins)[:,Numeric.NewAxis], data), -1)

    # values equal to the upper limit get index nbins; count them in the
    # last bin
    histo[-1] = histo[-1] + Numeric.add.reduce(Numeric.equal(nbins, data))

    bins = vmin + bin_width*(Numeric.arange(nbins)+0.5)
    return Numeric.transpose(Numeric.array([bins, histo]))
def divide_unary(a, b):
    """Returns a/b with masked values only in places where both a and b are masked.

    A value masked in just one operand is treated as 1, so the result at
    that position is the other operand (or its reciprocal).
    """
    numerator = MA.asarray(a)
    denominator = MA.asarray(b)
    # masked entries are replaced by 1 before dividing
    quotient = MA.divide(numerator.filled(1), denominator.filled(1))
    bothMasked = Numeric.logical_and(MA.getmaskarray(numerator),
                                     MA.getmaskarray(denominator))
    return MA.array(quotient, mask=bothMasked)
def subtract_unary(a, b):
    """Returns a-b with masked values only in places where both a and b are masked.

    A value masked in just one operand is treated as 0.
    """
    minuend = MA.asarray(a)
    subtrahend = MA.asarray(b)
    # masked entries are replaced by 0 before subtracting
    difference = MA.subtract(minuend.filled(0), subtrahend.filled(0))
    bothMasked = Numeric.logical_and(MA.getmaskarray(minuend),
                                     MA.getmaskarray(subtrahend))
    return MA.array(difference, mask=bothMasked)
def fractionNativeSurface(self, cont, contRef):
    """
    fraction of atoms/residues that are involved in B{any} contacts
    in both complexes.

    @param cont: contact matrix
    @type  cont: matrix
    @param contRef: reference contact matrix
    @type  contRef: matrix

    @return: (fractRec, fractLig), fraction of atoms/residues that
              are involved in any contacts in both complexes,
              relative to the number involved in the reference
    @rtype: (float, float)

    @note: a reference matrix without any contact leads to a division
           by zero, which is not handled here.
    """
    # The reduction axis is always given explicitly so that the result
    # does not depend on the default axis of the array package's sum().
    # Columns (axis 0 summed away) give the ligand profile, rows the
    # receptor profile; clip turns contact counts into 0/1 flags.
    lig = N.clip(N.sum(cont, 0), 0, 1)
    ligRef = N.clip(N.sum(contRef, 0), 0, 1)

    rec = N.clip(N.sum(cont, 1), 0, 1)
    recRef = N.clip(N.sum(contRef, 1), 0, 1)

    fLig = N.sum(N.logical_and(lig, ligRef), 0) * 1. / N.sum(ligRef, 0)
    fRec = N.sum(N.logical_and(rec, recRef), 0) * 1. / N.sum(recRef, 0)

    return (fRec, fLig)
def contactsShared(self, reference, cutoff=None):
    """
    Number of equal B{residue-residue} contacts in this and
    reference complex.

    @param reference: reference complex
    @type  reference: Complex
    @param cutoff: cutoff for atom-atom contact to be counted
    @type  cutoff: float

    @return: the number of residue-residue contacts that are common to
             both this and reference, i.e. the number of positions set
             in both residue contact matrices
    @rtype: int
    """
    own = self.resContacts(cutoff=cutoff)
    other = reference.resContacts(cutoff=cutoff)

    ## positions that are a contact in both matrices
    shared = N.logical_and(own, other)

    return abs(N.sum(N.sum(shared)))
def contactsOverlap(self, ref, cutoff=None):
    """
    Fraction of overlapping B{residue-residue} contacts between this and
    reference complex.

    @param ref: reference complex
    @type  ref: Complex
    @param cutoff: maximal atom-atom distance, None .. previous setting
    @type  cutoff: float

    @return: fraction of contacts shared between this and ref
             (normalized to number of all contacts)
    @rtype: float

    @note: if neither complex has any contact, the normalization divides
           by zero; this case is not handled here.
    """
    ## fetch each contact matrix only once and use it for both the
    ## intersection and the union
    own = self.resContacts(cutoff=cutoff)
    other = ref.resContacts(cutoff=cutoff)

    equal = N.logical_and(own, other)
    total = N.logical_or(own, other)

    return N.sum(N.sum(equal)) * 1.0 / N.sum(N.sum(total))
def filter(self, dlg):
    """
    Count contacts for the docked poses stored in a docking log file.

    The file is split into models: consecutive lines starting with
    "DOCKED:" are collected until a line starting with "_" closes the
    model.  For each model, setup_ligand() is called and the squared
    distances between self.bigC[:self.lenK] and self.smallM are compared
    with the squared sum of self.bigRC[:self.lenK] and self.keyRadii;
    pairs at distance zero are not counted.

    dlg: path of the docking log file

    returns the number of contacts of the first model that has any;
    0 if no model has a contact or the file contains no complete model.
    """
    fptr = open(dlg)
    try:
        dlg_lines = fptr.readlines()
    finally:
        # release the handle even if reading fails
        fptr.close()

    #STEP 1:accumulate lines of various poses
    model_lines = []
    #keep all of them
    all_models = []
    in_model = False
    for ll in dlg_lines:
        if ll.startswith("DOCKED:"):
            #check for a new model
            if ll.startswith("DOCKED: MODEL"):
                model_lines = []
            in_model = True
            model_lines.append(ll)
        if ll.startswith("_") and in_model:
            all_models.append(model_lines)
            model_lines = []
            in_model = False

    # defined up front so that a log without any model yields 0 instead
    # of failing on an unbound name at the return statement
    num_contacts = 0
    # loop over the models:
    for model_lines in all_models:
        #initialize this ligand
        self.setup_ligand(model_lines)
        bigR = self.bigRC[:self.lenK]
        bigM = self.bigC[:self.lenK]
        cutoff = bigR + self.keyRadii
        d = bigM - self.smallM
        dSQ = d * d
        dSQMAT = Numeric.sum(dSQ, 2)
        cutoffSQMAT = cutoff * cutoff
        # 1 where the squared distance is below the squared cutoff,
        # excluding pairs at distance zero
        ansMat = Numeric.logical_and(Numeric.less(dSQMAT, cutoffSQMAT),
                                     Numeric.not_equal(dSQMAT, 0.))
        rowIndices = Numeric.nonzero(Numeric.sum(ansMat, 1))
        num_contacts = 0
        for ind in rowIndices:
            for j in ansMat[ind]:
                if j:
                    num_contacts += 1
        # the first pose with a contact decides the result
        if num_contacts > 0:
            break
    return num_contacts
def filter(self, dlg):
    """
    Count contacts for the docked poses stored in a docking log file.

    Lines starting with "DOCKED:" are gathered into models; a line
    starting with "_" ends the current model.  Every model is handed to
    setup_ligand() and then the squared distances between
    self.bigC[:self.lenK] and self.smallM are tested against the squared
    sum of self.bigRC[:self.lenK] and self.keyRadii (pairs at distance
    zero are ignored).

    dlg: path of the docking log file

    returns the contact count of the first model with at least one
    contact; 0 if there is no such model or no complete model at all.
    """
    fptr = open(dlg)
    try:
        dlg_lines = fptr.readlines()
    finally:
        # close the file also when reading raises
        fptr.close()

    #STEP 1:accumulate lines of various poses
    model_lines = []
    #keep all of them
    all_models = []
    in_model = False
    for ll in dlg_lines:
        if ll.startswith("DOCKED:"):
            #check for a new model
            if ll.startswith("DOCKED: MODEL"):
                model_lines = []
            in_model = True
            model_lines.append(ll)
        if ll.startswith("_") and in_model:
            all_models.append(model_lines)
            model_lines = []
            in_model = False

    # initialised here: without any model the loop below never runs and
    # the return statement would otherwise hit an unbound name
    num_contacts = 0
    # loop over the models:
    for model_lines in all_models:
        #initialize this ligand
        self.setup_ligand(model_lines)
        bigR = self.bigRC[:self.lenK]
        bigM = self.bigC[:self.lenK]
        cutoff = bigR + self.keyRadii
        d = bigM - self.smallM
        dSQ = d*d
        dSQMAT = Numeric.sum(dSQ,2)
        cutoffSQMAT = cutoff*cutoff
        # 1 where squared distance < squared cutoff and distance != 0
        ansMat = Numeric.logical_and(Numeric.less(dSQMAT, cutoffSQMAT),
                                     Numeric.not_equal(dSQMAT, 0.))
        rowIndices = Numeric.nonzero(Numeric.sum(ansMat,1))
        num_contacts = 0
        for ind in rowIndices:
            for j in ansMat[ind]:
                if j:
                    num_contacts += 1
        # stop at the first pose that has a contact
        if num_contacts > 0:
            break
    return num_contacts
Y = Numeric.ones(N).astype('f') for i in range(numproc): Y = Y*Numeric.array(range(N))*(i+1) #print X_float #print Y assert Numeric.allclose(X_float, Y) print "Raw reduce using pypar.PROD OK" else: if myid == 0: print "Skipping product-reduce - try again with numproc < 20" pypar.raw_reduce(testArray, X, pypar.LAND, 0, 0) if myid == 0: Y = Numeric.ones(N) for i in range(numproc): Y = Numeric.logical_and(Y, Numeric.array(range(N))*(i+1)) assert Numeric.allclose(X, Y) print "Raw reduce using pypar.LAND OK" pypar.raw_reduce(testArray, X, pypar.BAND, 0, 0) if myid == 0: Y = Numeric.ones(N)*255 #Neutral element for & for i in range(numproc): Y = Numeric.bitwise_and(Y, Numeric.array(range(N))*(i+1)) assert Numeric.allclose(X, Y) print "Raw reduce using pypar.BAND OK" pypar.raw_reduce(testArray, X, pypar.LOR, 0, 0) if myid == 0: Y = Numeric.zeros(N) for i in range(numproc):
def logical_unary_and(m1, m2):
    """Returns logical_and(m1, m2) with masked values only in places
    where both m1 and m2 are masked.

    A value masked in just one operand is treated as 1 (true), so the
    result at that position follows the other operand.
    """
    bothMasked = Numeric.logical_and(MA.getmaskarray(m1), MA.getmaskarray(m2))
    # masked entries are replaced by 1 before combining
    conjunction = Numeric.logical_and(m1.filled(1), m2.filled(1))
    return MA.array(conjunction, mask=bothMasked)
def match(x, y, n_iterations=1, z=2, eps_rmsd=0.5, eps_stdv=0.05):
    """
    Matches two arrays onto each other, while iteratively removing outliers.
    Superimposed array y would be C{ N.dot(y, N.transpose(r)) + t }.

    @param n_iterations: number of calculations::
                           1 .. no iteration
                           0 .. until convergence
                           n .. at most n calculations
    @type  n_iterations: int
    @param z: number of standard deviations for outlier definition
              (default: 2)
    @type  z: float
    @param eps_rmsd: tolerance in rmsd (default: 0.5)
    @type  eps_rmsd: float
    @param eps_stdv: tolerance in standard deviations (default: 0.05)
    @type  eps_stdv: float

    @return: (r,t), [ [fraction_considered, rmsd_for_it, outliers] ]
    @rtype: (array, array), [float, float, int]
    """
    iter_trace = []

    rmsd_old = 0
    stdv_old = 0

    n = 0
    converged = 0

    mask = N.ones(len(y), N.int32)

    while not converged:

        ## find transformation for best match
        r, t = findTransformation(N.compress(mask, x, 0),
                                  N.compress(mask, y, 0))

        ## transform coordinates
        xt = N.dot(y, N.transpose(r)) + t

        ## calculate row distances (0 for rows that are masked out)
        d = N.sqrt(N.sum(N.power(x - xt, 2), 1)) * mask

        ## calculate rmsd and stdv over the rows still considered
        rmsd = N.sqrt(N.average(N.compress(mask, d)**2))
        stdv = MU.SD(N.compress(mask, d))

        ## check conditions for convergence
        d_rmsd = abs(rmsd - rmsd_old)

        if stdv == 0:
            ## all distances identical (e.g. perfect fit): the relative
            ## change is undefined, so avoid dividing by zero and call
            ## it stable only if the spread was already zero before
            stdv_stable = (stdv_old == 0)
        else:
            stdv_stable = abs(1 - stdv_old / stdv) < eps_stdv

        if d_rmsd < eps_rmsd and stdv_stable:
            converged = 1
        else:
            rmsd_old = rmsd
            stdv_old = stdv

        ## store result
        perc = round(float(N.sum(mask)) / float(len(mask)), 2)

        ## throw out non-matching rows
        mask = N.logical_and(mask, N.less(d, rmsd + z * stdv))
        outliers = N.nonzero(N.logical_not(mask))
        iter_trace.append([perc, round(rmsd, 3), outliers])

        n += 1

        if n_iterations and n >= n_iterations:
            break

    return (r, t), iter_trace
def senddata(self):
    """Compute the example selection and send the partitioned data.

    When there is a data structure and self.ps has at least one column:
      - selectionList starts as all ones and is AND-ed with
        (p-value < alpha) for each of the three factors (A, B,
        interaction) whose selector is set and whose selector box is
        enabled;
      - "Example Selection" is sent as (selectorName, selectionList);
      - "Selected Structured Data" is sent with the selected examples of
        every example table, each joined with the table of p-values
        (an attribute-less table if probabilities are not to be sent);
      - "Other Structured Data" is sent with the remaining examples, or
        None if self.sendNotSelectedData is not set;
      - infoc is updated with the number of selected examples.
    Otherwise None is sent on all three channels.
    """
    if self.dataStructure and self.ps.shape[1]:
        # set selectionList
        alphas = [self.alphaA, self.alphaB, self.alphaI]
        selectors = [self.selectorA, self.selectorB, self.selectorI]
        selectionList = Numeric.ones((self.numExamples,))
        boxSelectors = [self.boxSelectorA, self.boxSelectorB, self.boxSelectorI]
        for si in range(3):
            try:
                ## previous condition, kept for reference:
                ## if selectors[si] and self.anovaType in [[0,1,3,4],[2,3,4],[4]][si]:
                if selectors[si] and boxSelectors[si].isEnabled():
                    selectionList = Numeric.logical_and(selectionList, Numeric.less(self.ps[si], float(alphas[si])))
            except ValueError:
                # alpha could not be converted to float; this criterion
                # is skipped and the selection stays as it was
                print "Warning: cannot convert %s to float" % str(alphas[si])
                pass
        self.infoc.setText('Sending out data...')

        if self.sendProbabilities:
            # create example table with probabilities
            ## print self.ps
            ## print Numeric.transpose(self.ps).shape
            etProb = orange.ExampleTable(orange.Domain([orange.FloatVariable("Factor A p-val"),orange.FloatVariable("Factor B p-val"),orange.FloatVariable("Interaction p-val")]), Numeric.transpose(self.ps))
            # in etProb, convert p-val to meta attribute
            domProb = orange.Domain([])
            domProb.addmetas(dict(zip([orange.newmetaid(),orange.newmetaid(),orange.newmetaid()], etProb.domain.variables)))
            etProb = orange.ExampleTable(domProb, etProb)
        else:
            # create new etProb without attributes/metas and with as many
            # rows as selectionList has entries
            etProb = orange.ExampleTable(orange.Domain([]), Numeric.zeros((selectionList.shape[0],0)))

        # partition dataStructure and send out data
        selectionList = selectionList.tolist()
        self.send("Example Selection", (self.selectorName, selectionList))
        dataStructS = []
        dataStructN = []
        self.progressBarInit()
        # the progress bar is shared by the two passes below when the
        # not-selected data is sent as well
        if self.sendNotSelectedData:
            pbStep = 50./len(self.dataStructure)
        else:
            pbStep = 100./len(self.dataStructure)

        for (dsName, etList) in self.dataStructure:
            etListS = [et.select(selectionList) for et in etList]
            for i in range(len(etList)):
                # append probabilities (if etProb not empty)
                etListS[i] = orange.ExampleTable([etListS[i], etProb.select(selectionList)])
                # add name
                etListS[i].name = etList[i].name
            dataStructS.append((dsName, etListS))
            self.progressBarAdvance(pbStep)
        self.send("Selected Structured Data", dataStructS)

        if self.sendNotSelectedData:
            for (dsName, etList) in self.dataStructure:
                etListN = [et.select(selectionList, negate=1) for et in etList]
                for i in range(len(etList)):
                    # append probabilities (if etProb not empty)
                    etListN[i] = orange.ExampleTable([etListN[i], etProb.select(selectionList, negate=1)])
                    # add name
                    etListN[i].name = etList[i].name
                dataStructN.append((dsName, etListN))
                self.progressBarAdvance(pbStep)
            self.send("Other Structured Data", dataStructN)
        else:
            self.send("Other Structured Data", None)

        self.progressBarFinished()
        # report the number of selected examples
        numExamples = Numeric.add.reduce(Numeric.greater(selectionList, 0))
        self.infoc.setText('Total of %d example%s match criteria.' % (numExamples, ['', 's'][int(numExamples!=1)]))
    else:
        # nothing to select from: clear all three outputs
        self.send("Example Selection", None)
        self.send("Selected Structured Data", None)
        self.send("Other Structured Data", None)
def select(self, keyAts, checkAts, cutoff=3.0, percentCutoff=1.0, keyMat=None, checkMat=None):
    """Find, for each atom in keyAts, the atoms of checkAts within cutoff.

    keyAts: first set of atoms
    checkAts: a second set of atoms which is checked vs. keyAts
    cutoff: either a single float, by default 3.0, or a matrix with
        shape:
            (max(len(keyAts),len(checkAts)),
             min(len(keyAts),len(checkAts)))
    percentCutoff: by default 1.0 (cutoff is multiplied by this value)
    keyMat: transformation of keyAts, applied with self.mul if given
    checkMat: transformation of checkAts, applied with self.mul if given

    returns 'pairDict' whose keys are atoms used as reference points
    and whose values are atoms within cutoff distance of corresponding
    key.  Only key atoms with at least one match are included, and pairs
    at distance exactly zero are never reported.

    If self.return_dist is set, 'distDict' is returned also, whose keys
    are the same atoms which are used as reference points and whose
    values are lists of distances to atoms within cutoff distance of
    corresponding key.
    """
    lenK = len(keyAts)
    lenC = len(checkAts)

    #data arrays are used to find atoms with given indices quickly
    atar = Numeric.array(checkAts.data)
    keyAtar = Numeric.array(keyAts.data)

    #basic arrays of coords used to build others
    # NOTE(review): the transformations are tested by truth value, not
    # against None -- confirm this is intended if array-valued matrices
    # are passed in
    c = Numeric.array(checkAts.coords, 'f')
    if checkMat:
        c = self.mul(c, checkMat)
    k = Numeric.array(keyAts.coords, 'f')
    if keyMat:
        k = self.mul(k, keyMat)

    # first build matrix of distances between all pairs of ats
    # rows correspond to ats in larger set, columns to those in smaller
    # first build square matrix
    if lenC >= lenK:
        bigC = Numeric.resize(c, (lenC, lenC, 3))
        k.shape = (lenK,1,3)
        bigM = bigC[:lenK]
        smallM = k
        # setupCutoff is defined elsewhere; its result is reshaped to
        # one row per key atom
        cutoff = self.setupCutoff(checkAts, keyAts, cutoff)
        #print "0a:cutoff[0][0]=", cutoff[0][0]
        cutoff.shape = (lenK, -1)
    else:
        bigK = Numeric.resize(k, (lenK, lenK, 3))
        c.shape = (lenC,1,3)
        bigM = bigK[:lenC]
        smallM = c
        cutoff = self.setupCutoff(keyAts, checkAts, cutoff)
        #print "0b:cutoff[0][0]=", cutoff[0][0]
        cutoff.shape = (lenC, -1)

    # distance matrix
    d = bigM - smallM
    # distance squared matrix
    dSQ = d * d
    # next step sums deltaX**2, deltaY**2, deltaZ**2
    dSQMAT = Numeric.sum(dSQ,2)

    #percentCutoff lets user relax sum of radii
    #the smaller the percentCutoff the smaller the distance has to be
    #for a pair to be accepted
    cutoff = cutoff * percentCutoff
    cutoffSQMAT = cutoff * cutoff
    #cutoffSQMAT = cutoffSQMAT * percentCutoff

    # ansMat has 1 where self.func accepts the squared distance given
    # the squared cutoff (presumably a less-than test -- verify against
    # the definition of self.func) and the distance is not zero
    ansMat = Numeric.logical_and(self.func(dSQMAT, cutoffSQMAT) , \
                Numeric.not_equal(dSQMAT, 0.))
    if lenK > lenC:
        # in this case need to rearrange matrix
        # which got shuffled in if-else above
        ansMat = Numeric.swapaxes(ansMat, 0, 1)
        dSQMAT = Numeric.swapaxes(dSQMAT, 0, 1)

    # finally, build result dictionaries which have atom keys:
    #   pairDict has values which are lists of close atoms
    #   distDict has values which are lists of distances
    pairDict = {}
    distDict = {}
    # get a list of rows which have non-zero entries
    # to loop over in next section
    rowIndices = Numeric.nonzero(Numeric.sum(ansMat,1))
    # rows correspond to ats in keyAts
    # columns correspond to ats in checkAts
    for i in rowIndices:
        # atindex is a list [7 8 9] indexing into checkAts
        atindex = Numeric.nonzero(ansMat[i])
        # keyAtar[i] is ith atom in keyAts
        keyAt = keyAtar[i]
        pairDict[keyAt] = Numeric.take(atar, atindex)
        if self.return_dist:
            distDict[keyAt] = []
            for ind in atindex:
                distDict[keyAt].append(math.sqrt(dSQMAT[i][ind]))

    #getting distDict back is optional
    if self.return_dist:
        return pairDict, distDict
    else:
        return pairDict
def kNNimputeMA(arr2d, K=20, callback=None):
    """Return a new 2D MA.array with missing values imputed from K nearest neighbours.

    Rows (axis 0) are compared by a root-mean-square difference over the
    columns known in both rows.  A missing value is replaced by the
    weighted average of that column over the K nearest rows having a
    value there, with weights given by a tricubic function of the
    distances to all rows.
    Rows that are missing entirely are replaced by the average over all
    rows; columns without a single known value are not imputed.
    `callback`, if given, is called once per processed row.
    Version: 30.8.2005
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # copy that receives the imputed values
    result = MA.array(arr2d)

    # columns with at least one known value
    columnCond = Numeric.greater(MA.count(arr2d, axis=0), 0)
    columnIndAll = Numeric.arange(arr2d.shape[1])
    columnInd = Numeric.compress(columnCond, columnIndAll)
    nUsableColumns = columnInd.shape[0]

    countByRows = MA.count(arr2d, axis=1)

    # rows that are partially known: 0 < #known_values < #usable_columns
    hasSome = Numeric.greater(countByRows, 0)
    lacksSome = Numeric.less(countByRows, nUsableColumns)
    partialRows = Numeric.compress(Numeric.logical_and(hasSome, lacksSome),
                                   Numeric.arange(arr2d.shape[0]))

    for rowIdx in partialRows:
        rowResized = MA.resize(arr2d[rowIdx], arr2d.shape)
        diff = arr2d - rowResized
        meanSquare = MA.add.reduce((diff)**2, 1) / MA.count(diff, axis=1)
        distances = MA.sqrt(meanSquare)

        # neighbour rows ordered by distance; the first index is dropped
        # as it is taken to be the current row
        indSorted = MA.argsort(distances)[1:]
        distSorted = distances.take(indSorted)

        # how many of the sorted distances are not masked
        maskAsInt = Numeric.asarray(MA.getmaskarray(distSorted), Numeric.Int)
        numNonMasked = distSorted.shape[0] - Numeric.add.reduce(maskAsInt)

        if numNonMasked > 1:
            # tricubic weights relative to the largest known distance
            farthest = distSorted[numNonMasked - 1]
            weightsSorted = MA.power(1 - MA.power(distSorted / farthest, 3), 3)
        else:
            weightsSorted = Numeric.ones(distSorted.shape[0])

        # columns missing in this row that can be imputed at all
        toImpute = Numeric.compress(
            Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), columnCond),
            columnIndAll)
        for colIdx in toImpute:
            # column values in neighbour order
            columnVals = arr2d[:, colIdx].take(indSorted)
            # weights of those neighbours that have a value here
            weightsKnown = MA.compress(1 - MA.getmaskarray(columnVals),
                                       weightsSorted)
            # average over K (or possibly fewer) values
            result[rowIdx, colIdx] = MA.average(columnVals.compressed()[:K],
                                                weights=weightsKnown[:K])
        if callback:
            callback()

    # rows without any known value receive the average profile
    avrgRow = MA.average(arr2d, 0)
    emptyRows = Numeric.compress(Numeric.equal(countByRows, 0),
                                 Numeric.arange(arr2d.shape[0]))
    for rowIdx in emptyRows:
        result[rowIdx] = avrgRow
        if callback:
            callback()
    return result
def match(x, y, n_iterations=1, z=2, eps_rmsd=0.5, eps_stdv=0.05):
    """
    Matches two arrays onto each other, while iteratively removing outliers.
    Superimposed array y would be C{ N.dot(y, N.transpose(r)) + t }.

    @param n_iterations: number of calculations::
                           1 .. no iteration
                           0 .. until convergence
                           n .. at most n calculations
    @type  n_iterations: int
    @param z: number of standard deviations for outlier definition
              (default: 2)
    @type  z: float
    @param eps_rmsd: tolerance in rmsd (default: 0.5)
    @type  eps_rmsd: float
    @param eps_stdv: tolerance in standard deviations (default: 0.05)
    @type  eps_stdv: float

    @return: (r,t), [ [fraction_considered, rmsd_for_it, outliers] ]
    @rtype: (array, array), [float, float, int]
    """
    iter_trace = []

    rmsd_old = 0
    stdv_old = 0

    n = 0
    converged = 0

    mask = N.ones(len(y), N.int32 )

    while not converged:

        ## find transformation for best match
        r, t = findTransformation(N.compress(mask, x, 0),
                                  N.compress(mask, y, 0))

        ## transform coordinates
        xt = N.dot(y, N.transpose(r)) + t

        ## calculate row distances (0 for rows already thrown out)
        d = N.sqrt(N.sum(N.power(x - xt, 2), 1)) * mask

        ## calculate rmsd and stdv over the remaining rows
        rmsd = N.sqrt(N.average(N.compress(mask, d)**2))
        stdv = MU.SD(N.compress(mask, d))

        ## check conditions for convergence
        d_rmsd = abs(rmsd - rmsd_old)

        if stdv == 0:
            ## zero spread (e.g. identical coordinates): the relative
            ## change cannot be computed without dividing by zero; it
            ## counts as stable only if the previous spread was zero too
            stdv_stable = (stdv_old == 0)
        else:
            stdv_stable = abs(1 - stdv_old / stdv) < eps_stdv

        if d_rmsd < eps_rmsd and stdv_stable:
            converged = 1
        else:
            rmsd_old = rmsd
            stdv_old = stdv

        ## store result
        perc = round(float(N.sum(mask)) / float(len(mask)), 2)

        ## throw out non-matching rows
        mask = N.logical_and(mask, N.less(d, rmsd + z * stdv))
        outliers = N.nonzero( N.logical_not( mask ) )
        iter_trace.append([perc, round(rmsd, 3), outliers])

        n += 1

        if n_iterations and n >= n_iterations:
            break

    return (r, t), iter_trace