def scatter(pts, nsamples=100, colormap=None, scale=1, thresh=0.001, ax=None, store=False):
    """Create a scatter plot of x and y points from an array or an RDD (through sampling)

    Can optionally use the values to determine colors

    Parameters
    ----------
    pts : RDD of key-value pairs, or array-like
        Points to plot. Column 0 is used as x and column 1 as y.
        For an RDD, only the values are used and they are sampled first.

    nsamples : int, optional, default = 100
        Number of records requested from takeSample when pts is an RDD

    colormap : str or object with a calc method, optional, default = None
        If a string, colors are computed with Colorize(colormap, scale).calc(pts);
        otherwise colormap.calc(pts) is used. If None, all points are 'indianred'.

    scale : optional, default = 1
        Passed to Colorize when colormap is given as a string

    thresh : float or None, optional, default = 0.001
        When pts is an RDD, only records whose standard deviation is greater
        than thresh are sampled. None disables the filtering.

    ax : optional, default = None
        Axes to draw on; the current pyplot axes are used if None

    store : bool, optional, default = False
        If exactly True, the plotted points are returned as well

    Returns
    -------
    (ax, h), or (ax, h, pts) when store is True, where h is the handle
    returned by ax.scatter

    Raises
    ------
    Exception
        If pts is an RDD and sampling returned no records
    """
    if ax is None:
        ax = pyplot.gca()

    if isrdd(pts):
        values = pts.values()
        if thresh is not None:
            values = values.filter(lambda x: std(x) > thresh)
        pts = array(values.takeSample(False, nsamples))
        if len(pts) == 0:
            # the filter keeps records with std > thresh, so an empty sample
            # means the threshold excluded everything, i.e. it is too high
            if thresh is not None:
                raise Exception('no samples found, most likely your threshold is too high')
            raise Exception('no samples found, the input RDD appears to be empty')
    else:
        pts = asarray(pts)

    if colormap is None:
        clrs = 'indianred'
    elif isinstance(colormap, basestring):
        # a string names a colormap to be built here
        clrs = Colorize(colormap, scale).calc(pts)
    else:
        # anything else is used directly as a colormap object
        clrs = colormap.calc(pts)

    h = ax.scatter(pts[:, 0], pts[:, 1], s=100, c=clrs, alpha=0.6, edgecolor='black', linewidth=0.2)

    if store is True:
        return ax, h, pts
    return ax, h
def calc(self, data):
    """Apply self.get to every record of an RDD or a local collection

    The size of the first record is validated with self.checkargs
    before any record is converted.

    Parameters
    ----------
    data : RDD of key-value pairs, or an indexable local collection
        For an RDD, self.get is applied to the values and the keys are kept

    Returns
    -------
    An RDD with the values replaced by self.get(value), or a list
    with self.get applied to each element of data
    """
    if not isrdd(data):
        # local collection: validate against the first element
        self.checkargs(size(data[0]))
        return [self.get(row) for row in data]

    # distributed data: validate against the value of the first record
    first_value = data.first()[1]
    self.checkargs(size(first_value))
    return data.mapValues(lambda value: self.get(value))
def scatter(pts, nsamples=100, colormap=None, scale=1, thresh=0.001, ax=None, store=False):
    """Create a scatter plot of x and y points from an array or an RDD (through sampling)

    Can optionally use the values to determine colors

    Parameters
    ----------
    pts : RDD of key-value pairs, or array-like
        Points to plot. Column 0 is used as x and column 1 as y.
        For an RDD, only the values are used and they are sampled first.

    nsamples : int, optional, default = 100
        Number of records requested from takeSample when pts is an RDD

    colormap : str or object with a calc method, optional, default = None
        If a string, colors are computed with Colorize(colormap, scale).calc(pts);
        otherwise colormap.calc(pts) is used. If None, all points are 'indianred'.

    scale : optional, default = 1
        Passed to Colorize when colormap is given as a string

    thresh : float or None, optional, default = 0.001
        When pts is an RDD, only records whose standard deviation is greater
        than thresh are sampled. None disables the filtering.

    ax : optional, default = None
        Axes to draw on; the current pyplot axes are used if None

    store : bool, optional, default = False
        If exactly True, the plotted points are returned as well

    Returns
    -------
    (ax, h), or (ax, h, pts) when store is True, where h is the handle
    returned by ax.scatter

    Raises
    ------
    Exception
        If pts is an RDD and sampling returned no records
    """
    if ax is None:
        ax = pyplot.gca()

    if isrdd(pts):
        sampled = pts.values()
        if thresh is not None:
            sampled = sampled.filter(lambda x: std(x) > thresh)
        pts = array(sampled.takeSample(False, nsamples))
        if len(pts) == 0:
            # records are kept only when std > thresh, so nothing surviving
            # points at a threshold that is too high, not too low
            if thresh is not None:
                raise Exception(
                    'no samples found, most likely your threshold is too high')
            raise Exception(
                'no samples found, the input RDD appears to be empty')
    else:
        pts = asarray(pts)

    if colormap is None:
        clrs = 'indianred'
    elif isinstance(colormap, basestring):
        # a string names a colormap to be built here
        clrs = Colorize(colormap, scale).calc(pts)
    else:
        # anything else is used directly as a colormap object
        clrs = colormap.calc(pts)

    h = ax.scatter(pts[:, 0], pts[:, 1], s=100, c=clrs, alpha=0.6,
                   edgecolor='black', linewidth=0.2)

    if store is True:
        return ax, h, pts
    return ax, h
def calc(self, data, func):
    """Base function for making clustering predictions

    Applies func(centers, x) to the supplied data, where centers is
    self.centers.

    Parameters
    ----------
    data : RDD of key-value pairs, list, or ndarray
        For an RDD, func is applied to the values. For a list or an
        ndarray with more than one dimension, func is applied to each
        element. A one-dimensional ndarray is treated as a single point.

    func : callable
        Called as func(centers, x)

    Returns
    -------
    An RDD, a list of results, or a single result, depending on the
    input; None if data is none of the supported types
    """
    # small optimization to avoid serializing full model:
    # the closure below holds only the centers, not self
    centers = self.centers

    def apply(point):
        return func(centers, point)

    if isrdd(data):
        return data.mapValues(apply)

    if isinstance(data, list):
        return [apply(point) for point in data]

    if isinstance(data, ndarray):
        if data.ndim == 1:
            return apply(data)
        return [apply(point) for point in data]

    # unsupported input types fall through without a result
    return None
def pointmap(data, colormap='polar', scale=1.0, ax=None):
    """Create a spatial point map from a collection of key-value pairs,
    using the keys as spatial indices, and the values to compute colors

    Parameters
    ----------
    data : RDD of key-value pairs
        The first two entries of each key are used as the x and y
        position; colors come from Colorize(colormap, scale).calc(data)

    colormap : optional, default = 'polar'
        Passed to Colorize

    scale : optional, default = 1.0
        Passed to Colorize

    ax : optional, default = None
        Axes to draw on; the current pyplot axes are used if None

    Returns
    -------
    (ax, h), where h is the handle returned by ax.scatter

    Raises
    ------
    Exception
        If data is not an RDD
    """
    if ax is None:
        ax = pyplot.gca()

    if not isrdd(data):
        raise Exception('input must be an RDD')

    # all colorized records are collected to the driver before plotting
    pts = Colorize(colormap, scale).calc(data).collect()

    clrs = array([v for _, v in pts])
    # only the first two key entries are plotted; no third entry is read,
    # so keys with just two spatial indices work as well
    x = [k[0] for k, _ in pts]
    y = [k[1] for k, _ in pts]

    h = ax.scatter(x, y, s=100, c=clrs, alpha=0.5, edgecolor='black', linewidth=0.2)
    return ax, h
def predict(self, data):
    """Predict the cluster that all data points belong to, and the similarity

    Parameters
    ----------
    data : RDD of (tuple, array) pairs, a list of arrays, or a single array
        The data to predict cluster assignments on

    Returns
    -------
    closest : RDD of (tuple, array) pairs, list of arrays, or a single array
        For each data point, gives an array with the closest center for each
        data point, and the correlation with that center
    """
    # bind the centers locally so the closure shipped to the workers holds
    # only the centers rather than the whole model object
    centers = self.centers

    if isrdd(data):
        return data.mapValues(lambda x: KMeans.similarity(x, centers))

    if isinstance(data, list):
        return [KMeans.similarity(x, centers) for x in data]

    # anything else is treated as a single data point
    return KMeans.similarity(data, centers)
def imagemap(data, colormap='polar', scale=1.0, ax=None):
    """Create an image from a collection of key-value pairs, using the
    keys as spatial indices, and the values to compute colors

    Parameters
    ----------
    data : RDD of key-value pairs
        Keys must have length 2 or 3; values are turned into colors
        with Colorize(colormap, scale).calc

    colormap : optional, default = 'polar'
        Passed to Colorize

    scale : optional, default = 1.0
        Passed to Colorize

    ax : optional, default = None
        Axes to draw on; the current pyplot axes are used if None

    Returns
    -------
    (ax, h), where h is the handle returned by ax.imshow

    Raises
    ------
    Exception
        If data is not an RDD, or its keys do not have length 2 or 3
    """
    if ax is None:
        ax = pyplot.gca()

    if not isrdd(data):
        raise Exception('input must be an RDD')

    # the length of the first key gives the number of spatial dimensions
    ndim = len(data.first()[0])
    colored = Colorize(colormap, scale).calc(data)

    if ndim == 2:
        pixels = pack(colored)
    elif ndim == 3:
        # NOTE(review): meaning of axes=2 is defined by pack -- confirm there
        pixels = pack(colored, axes=2)
    else:
        raise Exception('number of spatial dimensions for images must be 2 or 3')

    # the packed array's three axes are reordered to (2, 1, 0) for display
    image = ax.imshow(transpose(pixels, [2, 1, 0]))
    return ax, image
def imagemap(data, colormap='polar', scale=1.0, ax=None):
    """Create an image from a collection of key-value pairs, using the
    keys as spatial indices, and the values to compute colors

    Parameters
    ----------
    data : RDD of key-value pairs
        Keys must have length 2 or 3; values are turned into colors
        with Colorize(colormap, scale).calc

    colormap : optional, default = 'polar'
        Passed to Colorize

    scale : optional, default = 1.0
        Passed to Colorize

    ax : optional, default = None
        Axes to draw on; the current pyplot axes are used if None

    Returns
    -------
    (ax, h), where h is the handle returned by ax.imshow

    Raises
    ------
    Exception
        If data is not an RDD, or its keys do not have length 2 or 3
    """
    if ax is None:
        ax = pyplot.gca()

    if not isrdd(data):
        raise Exception('input must be an RDD')

    # extra keyword arguments for pack, by number of spatial dimensions
    # NOTE(review): meaning of axes=2 is defined by pack -- confirm there
    pack_options = {2: {}, 3: {'axes': 2}}

    nspatial = len(data.first()[0])
    colorized = Colorize(colormap, scale).calc(data)

    if nspatial not in pack_options:
        raise Exception(
            'number of spatial dimensions for images must be 2 or 3')

    pixels = pack(colorized, **pack_options[nspatial])

    # the packed array's three axes are reordered to (2, 1, 0) for display
    handle = ax.imshow(transpose(pixels, [2, 1, 0]))
    return ax, handle
def save(data, outputdir, outputfile, outputformat, sorting=False, dimsmax=None, dimsmin=None):
    """
    Save data to a variety of formats

    Automatically determines whether data is an array or an RDD and
    handles it appropriately: an RDD is packed into a local array first
    (one array, and one output file, per value index when records hold
    more than one value), while an array is written as given.

    Parameters
    ----------
    data : RDD of (tuple, array) pairs, or numpy array
        The data to save

    outputdir : str
        Output directory, created if it does not exist

    outputfile : str
        Output filename, without extension; also used as the variable
        name inside matlab files

    outputformat : str
        Output format ("matlab", "text", or "image"); any other value
        writes nothing

    sorting : optional, default = False
        Passed to pack when data is an RDD

    dimsmax : optional, default = None
        If given (RDD only), used as the max of the dimensions instead
        of computing the dimensions with getdims

    dimsmin : optional, default = None
        Min of the dimensions, only valid together with dimsmax;
        (1, 1, 1) is used when dimsmax is given without dimsmin

    Raises
    ------
    Exception
        If data is an RDD and dimsmin is provided without dimsmax
    """
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    filename = os.path.join(outputdir, outputfile)

    if isrdd(data):
        # number of values per record decides how many outputs are written
        nout = size(data.first()[1])
        if dimsmax is None:
            if dimsmin is not None:
                raise Exception('cannot provide dimsmin without dimsmax')
            dims = getdims(data)
        else:
            dims = Dimensions()
            dims.max = dimsmax
            dims.min = (1, 1, 1) if dimsmin is None else dimsmin

    def write_array(values, tag, key):
        # write one array as a matlab or text file; tag is inserted
        # between the base filename and the extension
        if outputformat == "matlab":
            savemat(filename + tag + ".mat", mdict={key: values}, oned_as='column', do_compression='true')
        if outputformat == "text":
            savetxt(filename + tag + ".txt", values, fmt="%.6f")

    if outputformat == "matlab" or outputformat == "text":
        if not isrdd(data):
            write_array(data, "", outputfile)
        elif nout > 1:
            for iout in range(0, nout):
                packed = pack(data, ind=iout, dims=dims, sorting=sorting)
                write_array(packed, "-" + str(iout), outputfile + str(iout))
        else:
            packed = pack(data, dims=dims, sorting=sorting)
            write_array(packed, "", outputfile)

    if outputformat == "image":
        if not isrdd(data):
            arraytoim(data, filename)
        else:
            data = rescale(data)
            if nout > 1:
                for iout in range(0, nout):
                    packed = pack(data, ind=iout, dims=dims, sorting=sorting)
                    arraytoim(packed, filename + "-" + str(iout))
            else:
                packed = pack(data, dims=dims, sorting=sorting)
                arraytoim(packed, filename)
def save(data, outputdir, outputfile, outputformat, sorting=False, dimsmax=None, dimsmin=None):
    """
    Save data to a variety of formats

    Automatically determines whether data is an array or an RDD and
    handles it appropriately: an RDD is packed into a local array first
    (one array, and one output file, per value index when records hold
    more than one value), while an array is written as given.

    Parameters
    ----------
    data : RDD of (tuple, array) pairs, or numpy array
        The data to save

    outputdir : str
        Output directory, created if it does not exist

    outputfile : str
        Output filename, without extension; also used as the variable
        name inside matlab files

    outputformat : str
        Output format ("matlab", "text", or "image"); any other value
        writes nothing

    sorting : optional, default = False
        Passed to pack when data is an RDD

    dimsmax : optional, default = None
        If given (RDD only), used as the max of the dimensions instead
        of computing the dimensions with getdims

    dimsmin : optional, default = None
        Min of the dimensions, only valid together with dimsmax;
        (1, 1, 1) is used when dimsmax is given without dimsmin

    Raises
    ------
    Exception
        If data is an RDD and dimsmin is provided without dimsmax
    """
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    filename = os.path.join(outputdir, outputfile)

    if isrdd(data):
        # number of values per record decides how many outputs are written
        nout = size(data.first()[1])
        if dimsmax is not None:
            dims = Dimensions()
            dims.max = dimsmax
            dims.min = dimsmin if dimsmin is not None else (1, 1, 1)
        elif dimsmin is not None:
            raise Exception('cannot provide dimsmin without dimsmax')
        else:
            dims = getdims(data)

    def packed_outputs(rdd):
        # lazily yield (filename suffix, variable name, packed array),
        # one per value index, or a single unsuffixed entry
        if nout > 1:
            for iout in range(0, nout):
                yield ("-" + str(iout),
                       outputfile + str(iout),
                       pack(rdd, ind=iout, dims=dims, sorting=sorting))
        else:
            yield "", outputfile, pack(rdd, dims=dims, sorting=sorting)

    if outputformat in ("matlab", "text"):
        if isrdd(data):
            outputs = packed_outputs(data)
        else:
            # a local array is written directly, under the plain filename
            outputs = [("", outputfile, data)]

        for suffix, varname, values in outputs:
            if outputformat == "matlab":
                savemat(filename + suffix + ".mat", mdict={varname: values},
                        oned_as='column', do_compression='true')
            if outputformat == "text":
                savetxt(filename + suffix + ".txt", values, fmt="%.6f")

    if outputformat == "image":
        if isrdd(data):
            # values are rescaled before being packed into images
            for suffix, _, values in packed_outputs(rescale(data)):
                arraytoim(values, filename + suffix)
        else:
            arraytoim(data, filename)