Example no. 1
def kmeans(data, **kwargs):
    """
    Perform k-means clustering on unstructured N-dimensional data.
    
    @type data: array
    @param data: The data to be clustered
    @type kwargs: dict
    @param kwargs: The following args are accepted:
        - numClusters: The number of clusters to form (returned number of clusters may be less than k).
        - npasses: The number of times the k-means clustering algorithm is performed, 
        each time with a different (random) initial condition.
        - method: describes how the center of a cluster is found: 
            - method=='a': arithmetic mean.
            - method=='m': median.
        - initialCenters: a set of points that should be used as the initial
                          cluster centers
            
    @rtype: tuple
    @return: A tuple containing a list in which each element indicates the cluster
        membership of the corresponding index in the original data, and a message string
    """
    k = 1
    npasses = 1
    method = 'a'
    initialCenters = None
    smartCenters = False
    msg = ''
    
    if 'numClusters' in kwargs:
        k = int(kwargs['numClusters'])
    if 'npasses' in kwargs:
        npasses = int(kwargs['npasses'])
    if 'method' in kwargs:
        method = kwargs['method']
    if 'initialCenters' in kwargs:
        initialCenters = kwargs['initialCenters']
    if 'smartCenters' in kwargs:
        smartCenters = kwargs['smartCenters']
    
    
    logData = tm.getMethod('log')(data)
    if initialCenters is None:
        # No user-supplied centers: let Pycluster pick random initial conditions.
        (clusterIDs, err, nOpt) = pc.kcluster(logData, k, npass=npasses, method=method)
        msg = "Number of rounds optimal solution was found: %i" % nOpt
    else:
        # Seed k-means with the user-supplied centers, log-transformed to match the data.
        logCenters = tm.getMethod('log')(np.array(initialCenters[:k]))
        (centroids, clusterIDs) = kmeans2(logData, logCenters, minit='matrix')
        if len(np.unique(clusterIDs)) < k:
            wx.MessageBox('Warning: One or more of the returned clusters are empty. '
                          'Please choose different initial cluster centers and re-run '
                          'k-means for better results.',
                          'Insufficiently varied cluster centers',
                          wx.OK | wx.ICON_WARNING)
            
    
    return clusterIDs, msg
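# Usage sketch (an illustrative assumption, not part of the original source): how this kmeans
# wrapper might be called on a synthetic positive-valued dataset, assuming the module-level
# imports it relies on (tm, pc, np, kmeans2, wx) are available. The data and parameter values
# below are made up for illustration.
import numpy as np

example_data = np.abs(np.random.randn(1000, 2)) * 1000 + 1   # strictly positive, so the log transform is defined
ids, msg = kmeans(example_data, numClusters=3, npasses=5, method='a')
ids, msg = kmeans(example_data, numClusters=3,
                  initialCenters=[[10.0, 10.0], [100.0, 100.0], [1000.0, 1000.0]])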
Example no. 2
def heatmap2d(subplot, figure, dims):
    """
    heatmap2d; 2D Heatmap; Plots a 2D heatmap of the data with event density as the main indicator.
    """
    opts = subplot.opts
    if len(opts) == 0:
        opts['type'] = 'Hexbins'
        opts['colorMap'] = 'gist_earth'
        opts['bins'] = (200, 200)
        opts['transform'] = 'log'
        opts['transformAuto'] = True
        
    subplot.axes = figure.add_subplot(subplot.mnp, title=subplot.Title)
    subplot.axes.set_xlabel(subplot.Labels[dims[0]])
    subplot.axes.set_ylabel(subplot.Labels[dims[1]])
    
    x = subplot.Data[:, dims[0]]
    y = subplot.Data[:, dims[1]]
    cbLabel = ''
    
    # apply transform
    if 'transform' not in opts:
        opts['transform'] = 'log'
    if opts['transform'] == 'log':
        x = tm.getMethod('log')(x)
        y = tm.getMethod('log')(y)
    
    
    extent = (0, x.max()*1.05, 0, y.max()*1.05)
    
    cmap = CM.get_cmap(opts['colorMap'])
    if opts['type'] == 'Hexbins':
        gAx = subplot.axes.hexbin(x, y, gridsize=opts['bins'][0], extent=extent, mincnt=1, cmap=cmap)
        cbLabel = 'Events'

    # The following two means of calculating the heat map do not 
    # work correctly yet and are not enabled for selection.
    if opts['type'] == 'Gaussian KDE':
        kdeGrid = fast_kde(x, y, gridsize=opts['bins'])
        gAx = subplot.axes.imshow(kdeGrid, extent=extent, cmap=cmap,
                            aspect='auto', interpolation='bicubic')
    
    if opts['type'] == 'Histogram':
        heatmap, xedges, yedges = np.histogram2d(x, y, bins=opts['bins'])
        extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
        gAx = subplot.axes.imshow(heatmap, extent=extent, cmap=cmap,
                            aspect='auto')
        
    cb = subplot.parent.colorbar(gAx)
    if cbLabel == '':
        cbLabel = 'Events' if opts['transform'] != 'log' else 'log Events'
    cb.set_label(cbLabel)
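# Standalone sketch (an assumption, not from the original module): the enabled 'Hexbins'
# branch above reduced to plain numpy/matplotlib calls on log-transformed synthetic data.
# The project-specific subplot/tm objects are replaced by ordinary figure and axes handles.
import numpy as np
import matplotlib.pyplot as plt

x = np.log10(np.abs(np.random.randn(5000)) * 1000 + 1)
y = np.log10(np.abs(np.random.randn(5000)) * 1000 + 1)

fig, ax = plt.subplots()
hb = ax.hexbin(x, y, gridsize=200, mincnt=1, cmap='gist_earth',
               extent=(0, x.max() * 1.05, 0, y.max() * 1.05))
fig.colorbar(hb, ax=ax, label='Events')
plt.show()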
Example no. 3
def histogram(subplot, figure, dims):
    """
    histogram; Histogram; Plots a 1D Histogram
    """
    # set default plot options if necessary
    opts = subplot.opts
    if len(opts) == 0:
        opts['type'] = 'Gaussian KDE'
        opts['bins'] = 200
        opts['transformAuto'] = True
        opts['xTransform'] = ''
        opts['yTransform'] = ''
        opts['kdeDisplay'] = True 
    
    # Set axes transforms
    if opts['transformAuto']:
        opts['xTransform'] = 'log'
        opts['yTransform'] = 'linear'
    
    subplot.axes = figure.add_subplot(subplot.mnp, title=subplot.Title)
    subplot.axes.set_xlabel(subplot.Labels[dims[0]])
    
    data = subplot.Data[:, dims[0]]
    
    if opts['xTransform'] == 'log':
        data = tm.getMethod('log')(data)

    # Kernel density estimation
    if opts['type'] == 'Gaussian KDE' or opts['type'] == 'Both':
        ind = np.linspace(np.min(data), np.max(data), int(data.shape[0] * 0.1))
        gkde = stats.gaussian_kde(data)
        kdepdf = gkde.evaluate(ind)
        subplot.axes.plot(ind, kdepdf, label='kde', color='blue')
    
    # Binned Histogram
    if opts['type'] != 'Gaussian KDE':
        #subplot.axes.hist(subplot.Data[:, dims[0]], bins=250, normed=True, histtype='bar',log=True)
        h, b = np.histogram(data, bins=opts['bins'])
        if opts['type'] == 'Both':
            h = tm.getMethod('log')(h)
        b = (b[:-1] + b[1:])/2.0
        subplot.axes.plot(b, h)
        
    if opts['type'] == 'Both':
        dataMax = max(np.max(kdepdf), np.max(h))
        subplot.axes.set_ylim(0, dataMax + 0.1)
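# Standalone sketch (an assumption, not from the original module): the 'Gaussian KDE' branch
# above reduced to plain scipy/matplotlib calls on a log-transformed 1-D synthetic sample.
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

data = np.log10(np.abs(np.random.randn(5000)) * 1000 + 1)
ind = np.linspace(data.min(), data.max(), int(data.shape[0] * 0.1))
kdepdf = stats.gaussian_kde(data).evaluate(ind)

fig, ax = plt.subplots()
ax.plot(ind, kdepdf, label='kde', color='blue')
ax.legend()
plt.show()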
Example no. 4
def bakker_kMeans(data, **kwargs):
    """
    This is an implementation of the k-means algorithm designed specifically
    for flow cytometry data in the following paper:
    
    T.C.B. Schut, B.G.D. Grooth, and J. Greve, 
    "Cluster analysis of flow cytometric list mode data on a personal computer", 
    Cytometry,  vol. 14, 1993, pp. 649-659.

    @type data: array
    @param data: The data to be clustered
    @type kwargs: dict
    @param kwargs: The following args are accepted:
        - numClusters: The number of clusters to form
            
    @rtype: tuple
    @return: A tuple containing a list in which each element indicates the cluster
        membership of the corresponding index in the original data, and a message string
    """
    k = 1
    initClusters = 200
    msg = ''
    
    if 'numClusters' in kwargs:
        k = int(kwargs['numClusters'])
    if 'initClusters' in kwargs:
        initClusters = int(kwargs['initClusters'])
    
    # Log transform
    logData = tm.getMethod('log')(data)
    
    # Choose a large number of non-random initial centers (200, as suggested by the authors)
    centers = util.kinit(logData, initClusters)
    
    # Run k-means
    _, ids = kmeans2(logData, np.array(centers), minit='matrix')
    
    # Merge clusters using a special comparison metric until the requested number of clusters is reached
    clusters = util.separate(logData, ids)
    finalIDs = merge(k, ids, clusters)
    
    return finalIDs, msg
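# Hedged sketch (an assumption, not the paper's implementation): the over-clustering strategy
# above expressed with plain scipy calls. util.kinit and merge are not shown in this snippet,
# so '++' seeding and Ward-linkage merging are used here as stand-ins for the paper's
# initialization and special comparison metric.
import numpy as np
from scipy.cluster.vq import kmeans2
from scipy.cluster.hierarchy import linkage, fcluster

log_data = np.log10(np.abs(np.random.randn(5000, 2)) * 1000 + 1)
k, init_clusters = 3, 200

centroids, ids = kmeans2(log_data, init_clusters, minit='++')         # over-cluster the data
merged = fcluster(linkage(centroids, method='ward'), t=k, criterion='maxclust')
final_ids = merged[ids] - 1                                           # zero-based merged cluster per event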