def madMA(m, axis=0):
    """Median Absolute Deviation (MAD) of a masked array along one axis.

    m    -- array-like; converted with MA.asarray
    axis -- axis along which the median of m is taken (default 0)

    The median of m along `axis` is subtracted from m, and medianMA (called
    with its default axis) is applied to the absolute differences after
    `axis` has been moved to the front.
    """
    arr = MA.asarray(m)
    centre = MA.asarray(medianMA(arr, axis), Numeric.Float)
    # Move `axis` to the front while keeping the relative order of the
    # remaining axes, so that `centre` lines up with every slice along it.
    # swapaxes is not suitable: (0,1,2) -swap-> (2,1,0) whereas
    # (0,1,2) -transpose-> (2,0,1).
    before = list(range(axis))
    after = list(range(axis + 1, MA.rank(arr)))
    fronted = MA.transpose(arr, [axis] + before + after)
    deviations = MA.absolute(fronted - centre)
    return medianMA(deviations)
def subtract_unary(a, b):
    """Return a - b, masked only where both a and b are masked.

    Masked entries of either operand are replaced by 0 before subtracting,
    so a position masked in just one operand still yields a value.
    """
    lhs = MA.asarray(a)
    rhs = MA.asarray(b)
    # the result is masked only where neither operand carries a value
    both_masked = Numeric.logical_and(MA.getmaskarray(lhs),
                                      MA.getmaskarray(rhs))
    difference = MA.subtract(lhs.filled(0), rhs.filled(0))
    return MA.array(difference, mask=both_masked)
def divide_unary(a, b):
    """Return a / b, masked only where both a and b are masked.

    Masked entries of either operand are replaced by 1 before dividing,
    so a position masked in just one operand still yields a value.
    """
    numerator = MA.asarray(a)
    denominator = MA.asarray(b)
    # the result is masked only where neither operand carries a value
    both_masked = Numeric.logical_and(MA.getmaskarray(numerator),
                                      MA.getmaskarray(denominator))
    quotient = MA.divide(numerator.filled(1), denominator.filled(1))
    return MA.array(quotient, mask=both_masked)
def dotMA(a, b):
    """Dot product of two masked arrays with corrected masking.

    Elements of the product to which no pair of non-masked values
    contributes are set to MA.masked.
    """
    a = MA.asarray(a)
    b = MA.asarray(b)
    product = MA.dot(a, b)
    # MA.dot reports 0 instead of MA.masked, so count the non-masked pairs
    # behind every element of the product and mask those with no pair at all.
    valid_a = 1 - MA.getmaskarray(a).astype(Numeric.Int)
    valid_b = 1 - MA.getmaskarray(b).astype(Numeric.Int)
    pair_count = Numeric.dot(valid_a, valid_b)
    return MA.where(Numeric.equal(pair_count, 0), MA.masked, product)
def distEuclidean(x, y):
    """Normalized Euclidean distance between two 1D (masked) arrays.

    The summed squared differences are divided by the number of positions
    that are non-masked in both x and y before the square root is taken.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    assert MA.rank(x) == MA.rank(y) == 1
    either_masked = MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y))
    # 1.0 where both values are known, 0.0 elsewhere
    known = MA.logical_not(either_masked).astype(Numeric.Float)
    n_known = MA.add.reduce(known)
    squared_sum = MA.add.reduce((x - y)**2)
    return MA.sqrt(squared_sum / n_known)
def distManhattan(x, y):
    """Normalized Manhattan distance between two 1D (masked) arrays.

    The summed absolute differences are divided by the number of positions
    that are non-masked in both x and y.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    assert MA.rank(x) == MA.rank(y) == 1
    either_masked = MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y))
    # 1.0 where both values are known, 0.0 elsewhere
    known = MA.logical_not(either_masked).astype(Numeric.Float)
    n_known = MA.add.reduce(known)
    abs_sum = MA.add.reduce(MA.absolute(x - y))
    return abs_sum / n_known
def distSpearman(x, y):
    """Distance derived from Spearman's correlation: 1 - spearman_r.

    Only positions that are non-masked in both 1D arrays x and y are passed
    to statc.spearmanr; the first element of its result is used as r.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    assert MA.rank(x) == MA.rank(y) == 1
    either_masked = MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y))
    keep = MA.logical_not(either_masked)
    # compress before tolist() so that no masked entries reach statc
    x_known = MA.compress(keep, x).tolist()
    y_known = MA.compress(keep, y).tolist()
    spearman_r = statc.spearmanr(x_known, y_known)[0]
    return 1 - spearman_r
def diagonalPut(m1d, m2d):
    """Return a copy of the square 2D array m2d with m1d on its diagonal.

    m1d -- 1D (masked) array of length n
    m2d -- 2D (masked) array of shape (n, n)

    Raises AssertionError if the ranks or shapes do not match.
    The array passed in as m2d is left untouched.
    """
    m1d = MA.asarray(m1d)
    m2d = MA.asarray(m2d)
    assert MA.rank(m1d) == 1 and MA.rank(m2d) == 2, "1D and 2D masked array expected"
    assert m1d.shape[0] == m2d.shape[0] == m2d.shape[1], "the shape of the given arrays does not match"
    shape2d = m2d.shape
    size = shape2d[0]
    # element (i, i) of a row-major size x size matrix sits at flat index
    # i * (size + 1)
    diagIndices = Numeric.arange(size) * (size + 1)
    # Work on an explicit copy: MA.asarray and MA.ravel may both hand back
    # storage shared with the caller's array, which MA.put would then
    # overwrite in place, contradicting the promise of returning a new copy.
    flat = MA.ravel(MA.array(m2d, copy=True))
    MA.put(flat, diagIndices, m1d)
    return MA.reshape(flat, shape2d)
def _distSpearmanW_MA(x, y, w):
    """Weighted Spearman distance for inputs that may contain masked values.

    Positions masked in any of x, y or w are dropped; the remaining x and y
    values are ranked with statc.rankdata and the ranks, together with the
    remaining weights, are passed to distPearsonW.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    w = MA.asarray(w)
    assert MA.rank(x) == MA.rank(y) == MA.rank(w) == 1
    xy_masked = MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y))
    any_masked = MA.logical_or(xy_masked, MA.getmaskarray(w))
    keep = MA.logical_not(any_masked)
    # with MA arrays, always compress before calling tolist()
    x_known = MA.compress(keep, x).tolist()
    y_known = MA.compress(keep, y).tolist()
    x_ranks = Numeric.array(statc.rankdata(x_known))
    y_ranks = Numeric.array(statc.rankdata(y_known))
    return distPearsonW(x_ranks, y_ranks, MA.compress(keep, w))
def triangularPut(m1d, upper=1, lower=0):
    """Build a square 2D masked array from the strictly upper triangle m1d.

    m1d   -- 1D (masked) array of length n(n-1)/2, ordered as the strictly
             upper triangular part of an n x n matrix read row by row
    upper -- 1 to leave the strictly upper triangle non-masked, 0 to mask it
    lower -- 1 to leave the strictly lower triangle (the transpose of the
             upper one) non-masked, 0 to mask it

    The diagonal is always masked. With upper == lower == 1 a symmetric
    matrix is returned; with upper == lower == 0 everything is masked.
    Raises AssertionError on invalid flags, rank or length.
    """
    assert upper in [0,1] and lower in [0,1], "[0|1] expected for upper / lower"
    m1d = MA.asarray(m1d)
    assert MA.rank(m1d) == 1, "1D masked array expected"
    # math.ceil returns a float under Python 2; array dimensions must be
    # integers, so convert explicitly.
    size = int(math.ceil(math.sqrt(2*m1d.shape[0])))
    assert m1d.shape[0] == size*(size-1)//2, "the length of m1d does not correspond to n(n-1)/2"
    shape2d = (size, size)
    # positions that have to be masked in the result
    if upper and lower:
        hidden = Numeric.fromfunction(lambda i,j: i==j, shape2d)
    elif upper:
        hidden = Numeric.fromfunction(lambda i,j: i>=j, shape2d)
    elif lower:
        hidden = Numeric.fromfunction(lambda i,j: i<=j, shape2d)
    else:
        hidden = Numeric.ones(shape2d)
    # fill the strictly upper triangle of a flat zero matrix with m1d
    flat = MA.ravel(MA.zeros(shape2d, m1d.dtype.char))
    condUpperTriang = Numeric.fromfunction(lambda i,j: i<j, shape2d)
    putIndices = Numeric.compress(Numeric.ravel(condUpperTriang),
                                  Numeric.arange(0, size**2, typecode=Numeric.Int))
    MA.put(flat, putIndices, m1d)
    m2d = MA.reshape(flat, shape2d)
    # mirror the upper triangle into the lower one
    m2d = MA.where(condUpperTriang, m2d, MA.transpose(m2d))
    return MA.array(m2d, mask=Numeric.logical_or(hidden, MA.getmaskarray(m2d)))
def loessMA(m, windowSize, axis=0, approxMasked=True, verbose=False, callback=None):
    """Return an array with the values along `axis` smoothed by loess.

    m            -- (masked) array-like; converted to a float masked array
    windowSize   -- loess window passed on to statc.loess; must satisfy
                    0 < windowSize <= m.shape[axis] + 0.1
    axis         -- axis along which the profiles are smoothed
    approxMasked -- if True, masked values are replaced by their loess
                    approximation; if False, the original mask is re-applied
    verbose      -- if True, print a warning for every profile that could
                    not be smoothed
    callback     -- optional callable invoked once per processed profile

    Assumes equidistant spacing of the points along `axis`.
    The result has the same axis order as m.
    Raises AssertionError if windowSize is out of range.
    """
    # convert first so that plain sequences (which have no .shape) are accepted
    m = MA.asarray(m)
    assert 0 < windowSize <= m.shape[axis]+0.1, "0 < windowSize[%s] <= 1 OR windowSize in range(1.1,m.shape[axis]+1) expected, got %f" % ("%", windowSize)
    if m.dtype.char != Numeric.Float:
        m = m.astype(Numeric.Float)
    nDims = len(m.shape)
    shp_other = list(m.shape)
    shp_other.pop(axis)
    # permutation moving `axis` to the front, and the permutation undoing it
    toFront = [axis] + list(range(0, axis)) + list(range(axis+1, nDims))
    toOriginal = list(range(1, axis+1)) + [0] + list(range(axis+1, nDims))
    # 2D views (points along axis) x (all remaining positions) of mask and
    # data; getmaskarray yields an all-zero mask when m has no mask
    shape2d = (m.shape[axis], Numeric.multiply.reduce(shp_other))
    mask = Numeric.reshape(Numeric.transpose(MA.getmaskarray(m), toFront), shape2d)
    data = MA.reshape(MA.transpose(m, toFront), shape2d)
    maskInv = -1*(mask-1)   # 1 where a value is known, 0 where masked
    xall = Numeric.arange(data.shape[0])
    xallList = xall.tolist()
    # smooth only the profiles with at least 2 known values
    enoughValues = Numeric.add.reduce(maskInv,0) > 1
    for ii in Numeric.compress(enoughValues, range(data.shape[1])):
        try:
            known = maskInv[:,ii]
            points = list(zip(MA.compress(known, xall).tolist(),
                              MA.compress(known, data[:,ii]).tolist()))
            data[:,ii] = MA.array(statc.loess(points, xallList, windowSize))[:,1]
        except Exception:
            # best effort: leave the profile as it is, but do not swallow
            # KeyboardInterrupt / SystemExit
            if verbose:
                print("Warning: loessMA: could not loess axis %i index %i" % (axis, ii))
        if callback:
            callback()
    if not approxMasked:
        data = MA.array(data, mask=mask)
    # Restore the original axis order. The forward permutation is its own
    # inverse only for axis 0 and 1, so the inverse permutation is required.
    return MA.transpose(MA.reshape(data, [m.shape[axis]] + shp_other), toOriginal)
def compressIndices(ma):
    """Return the non-masked values of ma and their flat positions.

    Returns a tuple (values, indices): the compressed (1D) values with any
    remaining mask filled, and the indices of the non-masked places in the
    flattened array.

    usage:
        nu, ind = compressIndices(ma)
        nu = Numeric.elementwise_function(nu)
        MA.put(ma, ind, nu)
    """
    ma = MA.asarray(ma)
    # 1 for every non-masked place of the flattened array
    flatKnown = 1 - Numeric.ravel(MA.getmaskarray(ma))
    flatPositions = Numeric.arange(Numeric.multiply.reduce(ma.shape))
    nonMaskedInd = Numeric.compress(flatKnown, flatPositions)
    values = MA.filled(ma.compressed())
    return values, nonMaskedInd
def triangularGet(m2d, upper=1):
    """Return the strictly upper (or lower) triangle of m2d as a 1D array.

    m2d   -- 2D (masked) array
    upper -- 1 for the elements with row < column, 0 for row > column

    Elements are taken in the order of the flattened matrix, so for a
    symmetric matrix triangularGet(m2d, 0) and triangularGet(m2d, 1) return
    the same elements in a different order.
    """
    assert upper in [0,1], "upper: [0|1] expected"
    m2d = MA.asarray(m2d)
    assert MA.rank(m2d) == 2, "2D (masked) array expected"

    def aboveDiagonal(i, j):
        return i < j

    def belowDiagonal(i, j):
        return i > j

    if upper:
        inTriangle = aboveDiagonal
    else:
        inTriangle = belowDiagonal
    selector = Numeric.ravel(Numeric.fromfunction(inTriangle, m2d.shape))
    flatPositions = Numeric.arange(0, Numeric.multiply.reduce(m2d.shape),
                                   typecode=Numeric.Int)
    takeInd = Numeric.compress(selector, flatPositions)
    return MA.ravel(m2d).take(takeInd)
def kNNimputeMA(arr2d, K=20, callback=None):
    """Return a copy of the 2D array with masked values imputed from neighbours.

    arr2d    -- 2D (masked) array; rows (axis 0) are the profiles
    K        -- maximal number of neighbouring values used per imputed value
    callback -- optional callable invoked once per processed row

    For every row with some, but not all, usable values the distance to each
    row is the root of the mean squared difference. A masked value is
    replaced by the weighted average of the first K known values of its
    column, taken in order of increasing distance; the weights follow a
    tricubic function of the distances. Rows without any known value are
    replaced by the average over all rows. Columns without any known value
    are left masked.
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # the copy that receives the imputed values
    imputed = MA.array(arr2d)
    # columns with at least one known value
    colHasData = Numeric.greater(MA.count(arr2d, axis=0), 0)
    allColumns = Numeric.arange(arr2d.shape[1])
    usableColumns = Numeric.compress(colHasData, allColumns)
    # rows with 0 < #known values < #usable columns need neighbour imputation
    knownPerRow = MA.count(arr2d, axis=1)
    partialRows = Numeric.logical_and(
        Numeric.greater(knownPerRow, 0),
        Numeric.less(knownPerRow, usableColumns.shape[0]))
    for rowIdx in Numeric.compress(partialRows, Numeric.arange(arr2d.shape[0])):
        delta = arr2d - MA.resize(arr2d[rowIdx], arr2d.shape)
        dist = MA.sqrt(MA.add.reduce(delta**2, 1) / MA.count(delta, axis=1))
        # Neighbour rows ordered by distance; the first sorted index is
        # dropped as it is expected to be the current row itself.
        # NOTE(review): if another row is at distance 0 as well, that row may
        # be the one dropped here - confirm this is acceptable.
        order = MA.argsort(dist)[1:]
        distSorted = dist.take(order)
        # number of neighbour distances that are not masked
        maskedCount = Numeric.add.reduce(
            Numeric.asarray(MA.getmaskarray(distSorted), Numeric.Int))
        numNonMasked = distSorted.shape[0] - maskedCount
        if numNonMasked > 1:
            # tricubic weights, distances scaled by the last non-masked one
            scaled = distSorted / distSorted[numNonMasked-1]
            weights = MA.power(1 - MA.power(scaled, 3), 3)
        else:
            weights = Numeric.ones(distSorted.shape[0])
        # every column is averaged separately so that up to K known values
        # are available for each of them
        missingHere = Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), colHasData)
        for colIdx in Numeric.compress(missingHere, allColumns):
            # column values ordered by distance of their rows
            colVals = arr2d[:,colIdx].take(order)
            # keep the weights of the known column values only
            colWeights = MA.compress(1 - MA.getmaskarray(colVals), weights)
            # impute from K (or possibly fewer) values
            imputed[rowIdx,colIdx] = MA.average(colVals.compressed()[:K],
                                                weights=colWeights[:K])
        if callback:
            callback()
    # rows without any known value receive the average profile
    avrgRow = MA.average(arr2d, 0)
    emptyRows = Numeric.equal(knownPerRow, 0)
    for rowIdx in Numeric.compress(emptyRows, Numeric.arange(arr2d.shape[0])):
        imputed[rowIdx] = avrgRow
        if callback:
            callback()
    return imputed
def loessMA(m, windowSize, axis=0, approxMasked=True, verbose=False, callback=None):
    """Return an array with the values along `axis` smoothed by loess.

    m            -- (masked) array-like; converted to a float masked array
    windowSize   -- loess window passed on to statc.loess; must satisfy
                    0 < windowSize <= m.shape[axis] + 0.1
    axis         -- axis along which the profiles are smoothed
    approxMasked -- if True, masked values are replaced by their loess
                    approximation; if False, the original mask is re-applied
    verbose      -- if True, print a warning for every profile that could
                    not be smoothed
    callback     -- optional callable invoked once per processed profile

    Assumes equidistant spacing of the points along `axis`.
    The result has the same axis order as m.
    Raises AssertionError if windowSize is out of range.
    """
    # convert first so that plain sequences (which have no .shape) are accepted
    m = MA.asarray(m)
    assert 0 < windowSize <= m.shape[axis]+0.1, "0 < windowSize[%s] <= 1 OR windowSize in range(1.1,m.shape[axis]+1) expected, got %f" % ("%", windowSize)
    if m.dtype.char != Numeric.Float:
        m = m.astype(Numeric.Float)
    nDims = len(m.shape)
    shp_other = list(m.shape)
    shp_other.pop(axis)
    # permutation moving `axis` to the front, and the permutation undoing it
    toFront = [axis] + list(range(0, axis)) + list(range(axis+1, nDims))
    toOriginal = list(range(1, axis+1)) + [0] + list(range(axis+1, nDims))
    # 2D views (points along axis) x (all remaining positions) of mask and
    # data; getmaskarray yields an all-zero mask when m has no mask
    shape2d = (m.shape[axis], Numeric.multiply.reduce(shp_other))
    mask = Numeric.reshape(Numeric.transpose(MA.getmaskarray(m), toFront), shape2d)
    data = MA.reshape(MA.transpose(m, toFront), shape2d)
    maskInv = -1*(mask-1)   # 1 where a value is known, 0 where masked
    xall = Numeric.arange(data.shape[0])
    xallList = xall.tolist()
    # smooth only the profiles with at least 2 known values
    enoughValues = Numeric.add.reduce(maskInv,0) > 1
    for ii in Numeric.compress(enoughValues, range(data.shape[1])):
        try:
            known = maskInv[:,ii]
            points = list(zip(MA.compress(known, xall).tolist(),
                              MA.compress(known, data[:,ii]).tolist()))
            data[:,ii] = MA.array(statc.loess(points, xallList, windowSize))[:,1]
        except Exception:
            # best effort: leave the profile as it is, but do not swallow
            # KeyboardInterrupt / SystemExit
            if verbose:
                print("Warning: loessMA: could not loess axis %i index %i" % (axis, ii))
        if callback:
            callback()
    if not approxMasked:
        data = MA.array(data, mask=mask)
    # Restore the original axis order. The forward permutation is its own
    # inverse only for axis 0 and 1, so the inverse permutation is required.
    return MA.transpose(MA.reshape(data, [m.shape[axis]] + shp_other), toOriginal)
def distPearsonW(x, y, w):
    """Weighted distance derived from Pearson's correlation: 1 - pearson_r.

    x, y -- 1D (masked) arrays of values
    w    -- 1D (masked) array of weights

    A position masked in any of x, y or w is masked in the weights, so the
    weighted sums are built from the products with those weights.
    """
    x = MA.asarray(x)
    y = MA.asarray(y)
    w = MA.asarray(w)
    assert MA.rank(x) == MA.rank(y) == MA.rank(w) == 1
    # give w the union of the masks of x, y and w
    xy_masked = MA.logical_or(MA.getmaskarray(x), MA.getmaskarray(y))
    w = MA.masked_array(w, mask=MA.logical_or(xy_masked, MA.getmaskarray(w)))
    weight_sum = MA.add.reduce(w)                        # n * mean(w)
    x_w = x * w
    y_w = y * w
    x_mean = MA.divide(MA.add.reduce(x_w), weight_sum)   # weighted mean of x
    y_mean = MA.divide(MA.add.reduce(y_w), weight_sum)   # weighted mean of y
    cross = MA.add.reduce(x * y * w) - weight_sum * x_mean * y_mean
    spread_x = MA.add.reduce(x_w * x) - weight_sum * x_mean**2
    spread_y = MA.add.reduce(y_w * y) - weight_sum * y_mean**2
    pearson_r = MA.divide(cross, MA.sqrt(spread_x * spread_y))
    return 1 - pearson_r
def rankDataMA(m, inverse=False):
    """Return the ranks of a 1D masked array; masked values are ignored.

    m       -- 1D (masked) array
    inverse -- if False the smallest value gets rank 1, if True the largest
               value gets rank 1

    The ranks of the non-masked values run from 1 to the number of
    non-masked values; masked places stay masked. Equal values receive
    different (consecutive) ranks.
    Raises AssertionError if m is not 1D.
    """
    m = MA.asarray(m)
    assert MA.rank(m) == 1
    # Sort with a fill value above the maximum so that masked values come
    # last. m may be the caller's own array, therefore the original fill
    # value is restored even if sorting fails.
    fill_val = m.fill_value()
    m.set_fill_value(MA.maximum(m) + 1)
    try:
        order = MA.argsort(m)
    finally:
        m.set_fill_value(fill_val)
    # element order[k] is the k-th smallest, i.e. has the 0-based rank k
    r = MA.zeros(m.shape[0], Numeric.Float)
    MA.put(r, order, Numeric.arange(m.shape[0]))
    r = MA.array(r, mask=MA.getmaskarray(m))
    if inverse:
        return -1*r+MA.count(m)
    else:
        return r+1
def ma2orng_keepClassMetas(arr2d, aExampleTable):
    """Create an example table with attribute values taken from arr2d.

    arr2d         -- 2D (masked) array, one row per example and one column
                     per attribute of aExampleTable's domain
    aExampleTable -- example table supplying the domain, the class and the
                     meta attributes, which remain unchanged

    Masked values are written as "?".
    Raises AssertionError if arr2d is not 2D or its shape does not match.
    """
    arr2d = MA.asarray(arr2d, Numeric.PyObject)
    assert MA.rank(arr2d) == 2, "2D array expected"
    assert arr2d.shape[0] == len(aExampleTable), "arr2d.shape[0] != len(aExampleTable)"
    assert arr2d.shape[1] == len(aExampleTable.domain.attributes), "arr2d.shape[1] != len(aExampleTable.domain.attributes)"
    domain = aExampleTable.domain
    # domain holding the attributes only (no class variable)
    domAtt = orange.Domain(domain.attributes, None)
    # domain holding the class variable (if any) and the meta attributes
    if domain.classVar != None:
        classVars = [domain.classVar]
    else:
        classVars = []
    domClassMeta = orange.Domain(classVars)
    domClassMeta.addmetas(domain.getmetas())
    etAtt = orange.ExampleTable(domAtt, arr2d.tolist("?"))
    etClassMeta = orange.ExampleTable(domClassMeta, aExampleTable)
    # join both tables and convert the result to the original domain
    joined = orange.ExampleTable([etAtt, etClassMeta])
    return orange.ExampleTable(domain, joined)
def ma2orng_keepClassMetas(arr2d, aExampleTable):
    """Create an example table with attribute values taken from arr2d.

    arr2d         -- 2D (masked) array, one row per example and one column
                     per attribute of aExampleTable's domain
    aExampleTable -- example table supplying the domain, the class and the
                     meta attributes, which remain unchanged

    Masked values are written as "?".
    Raises AssertionError if arr2d is not 2D or its shape does not match.
    """
    arr2d = MA.asarray(arr2d, Numeric.PyObject)
    assert MA.rank(arr2d) == 2, "2D array expected"
    assert arr2d.shape[0] == len(aExampleTable), "arr2d.shape[0] != len(aExampleTable)"
    assert arr2d.shape[1] == len(aExampleTable.domain.attributes), "arr2d.shape[1] != len(aExampleTable.domain.attributes)"
    domain = aExampleTable.domain
    # domain holding the attributes only (no class variable)
    domAtt = orange.Domain(domain.attributes, None)
    # domain holding the class variable (if any) and the meta attributes
    if domain.classVar != None:
        classVars = [domain.classVar]
    else:
        classVars = []
    domClassMeta = orange.Domain(classVars)
    domClassMeta.addmetas(domain.getmetas())
    etAtt = orange.ExampleTable(domAtt, arr2d.tolist("?"))
    etClassMeta = orange.ExampleTable(domClassMeta, aExampleTable)
    # join both tables and convert the result to the original domain
    joined = orange.ExampleTable([etAtt, etClassMeta])
    return orange.ExampleTable(domain, joined)
def getPositions(m, val):
    """Return the positions of val in the flattened (masked) array m.

    m   -- arbitrary (masked) array
    val -- value to look for

    Returns an array of indices into the flattened m.
    """
    m = MA.asarray(m)
    matches = MA.equal(MA.ravel(m), val)
    flatPositions = Numeric.arange(Numeric.multiply.reduce(m.shape))
    return Numeric.compress(matches, flatPositions)
def maxMA(m, axis=0):
    """Return the last slice of m sorted along `axis` (the maximum).

    Masked values are sorted as -1e20. Slow, because the whole array is
    sorted; the sorting should eventually be removed.
    """
    m = MA.asarray(m, Numeric.Float)
    # Move `axis` to the front while keeping the order of the other axes.
    # swapaxes is not suitable: (0,1,2) -swap-> (2,1,0) whereas
    # (0,1,2) -transpose-> (2,0,1).
    axisOrder = [axis] + list(range(0, axis)) + list(range(axis+1, MA.rank(m)))
    fronted = MA.transpose(m, axisOrder)
    ordered = MA.sort(fronted, 0, fill_value=-1e20)
    return ordered[-1]
def scaleMad(nm, axis=0):
    """Return a new masked array with the values divided by their MAD.

    MAD (Median Absolute Difference) = median(|val(i) - median(val(i))|),
    obtained from numpyExtn.madMA along `axis`.
    """
    asFloat = MA.asarray(nm, Numeric.Float)
    # bring `axis` to position 0, scale, then swap back
    swapped = numpyExtn.swapaxesMA(asFloat, 0, axis)
    scaled = swapped / numpyExtn.madMA(swapped, 0)
    return numpyExtn.swapaxesMA(scaled, 0, axis)
def centerMed(nm, axis=0):
    """Return a new masked array with the values centered on their median.

    The median is obtained from numpyExtn.medianMA along `axis` and
    subtracted from the values.
    """
    asFloat = MA.asarray(nm, Numeric.Float)
    # bring `axis` to position 0, center, then swap back
    swapped = numpyExtn.swapaxesMA(asFloat, 0, axis)
    centered = swapped - numpyExtn.medianMA(swapped, 0)
    return numpyExtn.swapaxesMA(centered, 0, axis)
def scaleMad(nm, axis=0):
    """Return a new masked array with the values divided by their MAD.

    MAD (Median Absolute Difference) = median(|val(i) - median(val(i))|),
    obtained from numpyExtn.madMA along `axis`.
    """
    asFloat = MA.asarray(nm, Numeric.Float)
    # bring `axis` to position 0, scale, then swap back
    swapped = numpyExtn.swapaxesMA(asFloat, 0, axis)
    scaled = swapped / numpyExtn.madMA(swapped, 0)
    return numpyExtn.swapaxesMA(scaled, 0, axis)
def centerMed(nm, axis=0):
    """Return a new masked array with the values centered on their median.

    The median is obtained from numpyExtn.medianMA along `axis` and
    subtracted from the values.
    """
    asFloat = MA.asarray(nm, Numeric.Float)
    # bring `axis` to position 0, center, then swap back
    swapped = numpyExtn.swapaxesMA(asFloat, 0, axis)
    centered = swapped - numpyExtn.medianMA(swapped, 0)
    return numpyExtn.swapaxesMA(centered, 0, axis)
def kNNimputeMA(arr2d, K=20, callback=None):
    """Return a copy of the 2D array with masked values imputed from neighbours.

    arr2d    -- 2D (masked) array; rows (axis 0) are the profiles
    K        -- maximal number of neighbouring values used per imputed value
    callback -- optional callable invoked once per processed row

    For every row with some, but not all, usable values the distance to each
    row is the root of the mean squared difference. A masked value is
    replaced by the weighted average of the first K known values of its
    column, taken in order of increasing distance; the weights follow a
    tricubic function of the distances. Rows without any known value are
    replaced by the average over all rows. Columns without any known value
    are left masked.
    """
    arr2d = MA.asarray(arr2d)
    assert len(arr2d.shape) == 2, "2D array expected"
    # the copy that receives the imputed values
    imputed = MA.array(arr2d)
    # columns with at least one known value
    colHasData = Numeric.greater(MA.count(arr2d, axis=0), 0)
    allColumns = Numeric.arange(arr2d.shape[1])
    usableColumns = Numeric.compress(colHasData, allColumns)
    # rows with 0 < #known values < #usable columns need neighbour imputation
    knownPerRow = MA.count(arr2d, axis=1)
    partialRows = Numeric.logical_and(
        Numeric.greater(knownPerRow, 0),
        Numeric.less(knownPerRow, usableColumns.shape[0]))
    for rowIdx in Numeric.compress(partialRows, Numeric.arange(arr2d.shape[0])):
        delta = arr2d - MA.resize(arr2d[rowIdx], arr2d.shape)
        dist = MA.sqrt(MA.add.reduce(delta**2, 1) / MA.count(delta, axis=1))
        # Neighbour rows ordered by distance; the first sorted index is
        # dropped as it is expected to be the current row itself.
        # NOTE(review): if another row is at distance 0 as well, that row may
        # be the one dropped here - confirm this is acceptable.
        order = MA.argsort(dist)[1:]
        distSorted = dist.take(order)
        # number of neighbour distances that are not masked
        maskedCount = Numeric.add.reduce(
            Numeric.asarray(MA.getmaskarray(distSorted), Numeric.Int))
        numNonMasked = distSorted.shape[0] - maskedCount
        if numNonMasked > 1:
            # tricubic weights, distances scaled by the last non-masked one
            scaled = distSorted / distSorted[numNonMasked-1]
            weights = MA.power(1 - MA.power(scaled, 3), 3)
        else:
            weights = Numeric.ones(distSorted.shape[0])
        # every column is averaged separately so that up to K known values
        # are available for each of them
        missingHere = Numeric.logical_and(MA.getmaskarray(arr2d[rowIdx]), colHasData)
        for colIdx in Numeric.compress(missingHere, allColumns):
            # column values ordered by distance of their rows
            colVals = arr2d[:,colIdx].take(order)
            # keep the weights of the known column values only
            colWeights = MA.compress(1 - MA.getmaskarray(colVals), weights)
            # impute from K (or possibly fewer) values
            imputed[rowIdx,colIdx] = MA.average(colVals.compressed()[:K],
                                                weights=colWeights[:K])
        if callback:
            callback()
    # rows without any known value receive the average profile
    avrgRow = MA.average(arr2d, 0)
    emptyRows = Numeric.equal(knownPerRow, 0)
    for rowIdx in Numeric.compress(emptyRows, Numeric.arange(arr2d.shape[0])):
        imputed[rowIdx] = avrgRow
        if callback:
            callback()
    return imputed