def hierarchicalClusteringScipy(self, vectorLayer, attributesList, normalize, clusterThreshold, linkageMethod, criterion, metric, depth, max_clust, outputFieldName):
        import scipy.cluster.hierarchy as hcluster
        from numpy import array

        fullObjectsList = []
        features = vectorLayer.getFeatures()

        for feature in features:
            fullObjectsList.append([])
            for attribute in attributesList:
                if feature[attribute[0]]:
                    fullObjectsList[len(fullObjectsList) - 1].append(feature[attribute[0]])
                else:
                    fullObjectsList[len(fullObjectsList) - 1].append(0)

        # NORMALIZING
        if normalize:
            i = 0
            maxValues = []
            while i < len(attributesList):
                maxValues.append(max(abs(item[i]) for item in fullObjectsList))
                i += 1

            j = 0
            while j < len(fullObjectsList):
                i = 0
                while i < len(fullObjectsList[j]):
                    fullObjectsList[j][i] = (fullObjectsList[j][i] * 1.0) / (maxValues[i] * 1.0)
                    i += 1
                j += 1

        data = array(fullObjectsList)

        if criterion == 'maxclust':
            clusters = hcluster.fclusterdata(data, t=max_clust, criterion=criterion, method=linkageMethod,
                                             metric=metric, depth=depth)
        else:
            clusters = hcluster.fclusterdata(data, t=clusterThreshold, criterion=criterion, method=linkageMethod,
                                             metric=metric, depth=depth)

        vectorLayerDataProvider = vectorLayer.dataProvider()
        # Create the output field if it does not exist yet
        if vectorLayer.fields().indexFromName(outputFieldName) == -1:
            vectorLayerDataProvider.addAttributes([QgsField(outputFieldName, QVariant.Int)])

        vectorLayer.updateFields()
        vectorLayer.startEditing()
        attrIdx = vectorLayer.fields().indexFromName(outputFieldName)
        features = vectorLayer.getFeatures()

        i = 0
        for feature in features:
            vectorLayer.changeAttributeValue(feature.id(), attrIdx, int(clusters[i]))
            i += 1

        vectorLayer.updateFields()
        vectorLayer.commitChanges()
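
# A minimal, hypothetical sketch (not part of the QGIS routine above) showing how the two
# fclusterdata calls used there behave on toy data: 'maxclust' asks for a fixed number of
# clusters, while 'distance' cuts the dendrogram at a threshold. Data and thresholds are made up.
import numpy as np
import scipy.cluster.hierarchy as hcluster

toy = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
by_count = hcluster.fclusterdata(toy, t=2, criterion='maxclust', method='single')
by_cutoff = hcluster.fclusterdata(toy, t=1.0, criterion='distance', method='single')
print(by_count, by_cutoff)  # both should separate the two tight pairs, e.g. [1 1 2 2] [1 1 2 2]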
Example No. 2
def cluster_peaks_by_lane(peak_pos, hdist=8.0, return_sorted=True):
    """
    :param peak_pos:
    :param hdist:
    :param return_sorted:
    :return:

    Refs:
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fclusterdata.html
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fcluster.html
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
        https://web.archive.org/web/20100619134310/http://www.plantbio.ohiou.edu/epb/instruct/multivariate/Week7Lectures.PDF

    Linkage methods:
        single linkage - produces "chains"
        complete linkage - produces "spherical" clusters
        intermediate linkage -
    Other clustering methods:
        UPGMA -
        WPGMA -
        UPGMC -
        WPGMC -
        K-means - cluster into exactly K number of clusters

    """

    if hdist is None:
        hdist = 8.0
    hdist = float(hdist)  # ensure float/numeric input
    xpos = np.array([[pos[1]] for pos in peak_pos])
    # printarr(xpos, "xpos")

    # maybe add a little bit of y-position to the mix?
    # xpos = np.array([[pos[1], pos[0]/100] for pos in peak_pos])

    # fclusterdata(X, t) is for N observations each with M variables.
    lane_clusters = fclusterdata(xpos, t=hdist, criterion='distance', metric='euclidean', depth=2, method='single')
    # lane_clusters = linkage(xpos)  # defaults to 'single', 'euclidean'

    # group lane-clustered peaks: lane_id -> array of peak pos.
    peaks_by_lane = defaultdict(list)
    for lane_id, pos in zip(lane_clusters, peak_pos):
        peaks_by_lane[lane_id].append(list(pos))
    # convert
    for lane_id in peaks_by_lane:
        peaks_by_lane[lane_id] = np.array(peaks_by_lane[lane_id])

    # pprint(peaks_by_lane)
    if return_sorted:
        # sort by mean x-position (indexing as [y, x] aka [row, col])
        peaks_by_lane = OrderedDict(sorted(peaks_by_lane.items(), key=lambda kv: kv[1][:, 1].mean()))
    # pprint(list(peaks_by_lane.values()))

    return peaks_by_lane
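
# Hedged usage sketch for cluster_peaks_by_lane (toy peak positions, not from the original
# project): peaks are (row, col) pairs, and columns within roughly hdist pixels share a lane.
toy_peaks = [(10, 50), (40, 52), (12, 120), (45, 121)]
lanes = cluster_peaks_by_lane(toy_peaks, hdist=8.0)
print({lane_id: arr.shape for lane_id, arr in lanes.items()})  # expect two lanes with two peaks each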
Example No. 3
def pcaCode():
    ##Question: PCA descriptors, or PCA final profiles?
    #Principal Component Analysis
    pca = deco.PCA(n_components = 10)
    Xp = pca.fit_transform(X)

    #Z = hier.linkage(X)
    Y = hier.fclusterdata(X, 1.15)
    print "Num. Clusters (no PCA): %s"%max(Y)

    Yp = hier.fclusterdata(Xp, 1.15)
    print "Num. Clusters (with PCA): %s"%max(Yp)
Example No. 4
def cluster_qs(qs, k=None, threshold=1.5):
    """Cluster q vectors into discrete groups.

    Classifies each of the q vectors into a number of clusters. The number of clusters used is decided by the parameters passed:
        * If the k parameter is supplied then the q vectors are grouped into k clusters using kmeans.
        * If the threshold parameter is supplied then the q vectors are split into groups based on cophenetic distance.

    :param qs: list of q vectors to cluster. Each element should be a numpy array of length three.
    :param k: number of clusters to use (optional).
    :param threshold: cophenetic distance cut off point for new clusters (optional)
    :returns: tuple (clusters, k)
        Where:
            list -- clusters is a list of cluster indices which each q belongs to
            int -- k is the number of clusters used
    """
    if k is not None:
        centroids = kmeans_plus_plus(qs, k)
        _, clusters = kmeans2(qs, centroids, minit='matrix')
        if len(set(clusters)) != k:
            raise ValueError("Could not group the satellite reflections "
                             "into {} clusters. Please check that you have "
                             "at least {} satellites.".format(k,k))
    else:
        clusters = hcluster.fclusterdata(qs, threshold, criterion="distance")
    return clusters, len(set(clusters))
def identify(image, colors):

    global pixelCounters

    num_colors = 1

    

    data = numpy.zeros((1000, 2))
    n = 0
    a = 0
    for x in xrange(0, image.shape[0]):
        for y in xrange(0, image.shape[1]):
            a += 1
            if a & 0b1111111 != 0:
                continue
            for i in range(num_colors):                
                hue = image[x, y, 0]
                sat = image[x, y, 1]
                val = image[x, y, 2]
                if hue >= 0 and hue < 10 and sat > 150 and val > 50:
                    data[n, 0] = x
                    data[n, 1] = y
                    n += 1
    if n < 2:
        return (None, None)
    
    t = 30
    data = data[0:n, :]
    clusters = hcluster.fclusterdata(data, t, criterion="distance")
    
    return (data, clusters)
Example No. 6
def calc_best_result(coords, threshold=0.01):
    """
    Calculates the most probable result based on clustering of the provided coordinates.
    We assume that the biggest cluster represents the value most of the best agents
    have agreed on. The method uses SciPy's hierarchy.fclusterdata function.

    Parameters
    ----------
    coords : list of two-element tuples
        coordinates to guess result from
    threshold : float
        see documentation for scipy.cluster.hierarchy.fclusterdata

    Returns
    -------
    x : float
        x coordinate of the result
    y : float
        y coordinate of the result

    """
    coords = np.array(coords)
    t = coords[:,0].std()
    idx = hierarchy.fclusterdata(coords, threshold * t)
    best = int(stats.mode(idx)[0][0])
    ans = np.array([coords[i] for i in range(len(coords)) if idx[i] == best])
    return namedtuple('Ans', 'x, y')(ans[:,0].mean(), ans[:,1].mean())
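
# Hedged usage sketch for calc_best_result (toy coordinates; the threshold is illustrative
# only): three agents agree on roughly (1, 1) and one outlier is ignored, so the returned
# answer should land near (1, 1).
print(calc_best_result([(1.0, 1.0), (1.02, 0.98), (0.99, 1.01), (5.0, 5.0)], threshold=0.5))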
Example No. 7
def cluster(points, thresh):
    #the x,y,z points must first be separated out
    ndata = [[],[],[]]
    npts = (len(points)-2)//3
    for j in range(0,npts):
        x = float(points[2 + 3*j])
        y = float(points[3 + 3*j])
        z = float(points[4 + 3*j])
        ndata[0].append(x)
        ndata[1].append(y)
        ndata[2].append(z)
    data = np.asarray(ndata)

    clusterlist = hcluster.fclusterdata(np.transpose(data), thresh, criterion="distance")
    
    nclusters = findLargest(clusterlist)
    
    #initializes an array to the right size
    #http://stackoverflow.com/questions/7745562/appending-to-2d-lists-in-python
    clusters = [[] for i in range(nclusters)] 
    #assigns points to the correct cluster
    for i in range(0, npts):
        #print clusters[clusterlist[i]-1]
        
        clusters[clusterlist[i]-1].append([ndata[0][i],ndata[1][i],ndata[2][i]])
    return [data, clusterlist, clusters]
Example No. 8
def searchForColorPoints(im, criteria):
    points = []
    pointColors = []
    hsvIm = cv2.cvtColor(im, cv2.COLOR_BGR2HSV_FULL)
    for i in range(11, im.shape[1] - 11, 10):
        for j in range(11, im.shape[0] - 11, 10):
            b = block(hsvIm, (i, j), 8)
            if b[:, :, 0].std() > 25:
                continue

            color = (b[:, :, 0].mean(), b[:, :, 1].mean(), b[:, :, 2].mean())
            matchedColor = matchColor(color, criteria)
            if matchedColor >= 0:
                points.append((i, j))
                pointColors.append(matchedColor)

    points = np.array(points, np.float16)
    cluster = fclusterdata(points, 10, "distance")

    centroids = []
    for i in range(len(criteria)):
        centroids.append([])

    for i in range(1, cluster.max() + 1):
        b = cluster == i

        c = np.zeros((1, 2), np.int16)
        for p in points[b.argsort()[len(b) - sum(b) :]]:
            c = c + p / sum(b)

        centroids[pointColors[b.argsort()[len(b) - sum(b)]]].append(c[0])

    return centroids
Example No. 9
def merge_paths(rides):
    waypoints = list(itertools.chain(*[ride.route.waypoints for ride in rides]))
    waypoints = sorted(waypoints, key=lambda x: x.country)

    logger.info("Merging {} rides with {} total waypoints".format(len(rides), len(waypoints)))

    for country, group in itertools.groupby(waypoints, key=lambda x: x.country):
        waypoints = list(group)
        country_lat_lng_points = [(x.lat, x.lng) for x in waypoints]
        country_xyz_points = [latlng_to_xyz(lat, lng) for lat, lng in country_lat_lng_points]

        logger.debug("Processing {} with {} waypoints".format(country, len(country_xyz_points)))

        wh = whiten(country_xyz_points)
        k_guess = max(1,len(country_xyz_points)/BEARABLE_CLUSTER_SIZE)
        k_centroids = kmeans(wh,k_guess)[0]
        k_labels = vq(wh, k_centroids)[0]

        k_labeled = sorted(zip(country_xyz_points,country_lat_lng_points,waypoints,k_labels), key=lambda x: x[3])
        logger.debug("Got {} miniclusters".format(len(k_centroids)))
        for key, gr in itertools.groupby(k_labeled, key=lambda x:x[3]):
            gr = list(gr)
            k_waypoints = [x[2] for x in gr]
            k_lat_lng_points = [x[1] for x in gr]
            k_xyz_points = [x[0] for x in gr]
            logger.debug("Running {} minicluster with {} waypoints".format(key, len(k_waypoints)))
            cluster_labels = fclusterdata(np.array(k_xyz_points), 0.2, criterion="distance", metric="euclidean")
            centroids = cluster_centroids(zip(k_lat_lng_points, cluster_labels))
            logger.debug("Got {} hierarhical clusters".format(len(set(cluster_labels))))

            for i in range(0, len(k_waypoints)):
                new_lat, new_lng = centroids[cluster_labels[i]-1]
                k_waypoints[i].lat = new_lat
                k_waypoints[i].lng = new_lng
Example No. 10
def cluster_lane_peaks_to_bands(lane_peaks, vdist=5.0, img=None):
    vdist = float(vdist)  # ensure float/numeric input
    # Special case, lane only has a single peak, nothing to cluster:
    if len(lane_peaks) < 2:
        this_lane_bands_peaks = {0: lane_peaks}  # ensure we have a dict of peaks
        # print("lane_id %s has only %s peaks" % (lane_id, len(lane_peaks)))
    else:
        # sort by row (y-coordinate):
        # print("sorting bands in lane_id %s by y position (pos[0])" % lane_id)
        band_clusters = fclusterdata(lane_peaks, t=vdist, criterion='distance', metric='euclidean', depth=2, method='single')
        # lane_band_cluster_ids[lane_id] = band_clusters
        # print("lane_id", lane_id)
        # print("lane_peaks", lane_peaks)
        # print("band_clusters", band_clusters)
        # group, method (1) using defaultdict:
        # cannot use dict.fromkeys, because it only takes static default values, not types/functions.
        this_lane_bands_peaks = defaultdict(list)
        for band_id, pos in zip(band_clusters, lane_peaks):
            this_lane_bands_peaks[band_id].append(pos)
        # alternative grouping methods: (2) zip, sort, then groupby;
        # print("this_lane_bands_peaks", this_lane_bands_peaks)
        # convert to nparray and take mean:
    # convert the list of peaks for each band to ndarray:
    for band_id in this_lane_bands_peaks:
        this_lane_bands_peaks[band_id] = np.array(this_lane_bands_peaks[band_id])
    return this_lane_bands_peaks
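
# Hedged usage sketch for cluster_lane_peaks_to_bands (toy peaks, illustrative vdist): two
# bands about 30 px apart in one lane; peaks within ~vdist pixels collapse into the same band.
toy_lane_peaks = [[10, 50], [12, 52], [42, 51], [44, 49]]
bands = cluster_lane_peaks_to_bands(toy_lane_peaks, vdist=5.0)
print({band_id: arr.shape for band_id, arr in bands.items()})  # expect two bands with two peaks each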
Example No. 11
    def magic_fragmentation(self):
        """ This function takes the atom objects and tries to separate two fragments by a k-means-clustering algorithm. Always check the result before relying on those fragmentations!"""

        #hardcoded number of fragments, for now always 2!
        nr_frags = 2
        coordinates = self.dimer.get_positions()
        # 
        #centroids,_ = kmeans(coordinates, nr_frags)
        # assign indices to clusters (bitmask!)
        cluster_indices = fclusterdata(coordinates, self.magic_cutoff, criterion="distance")
        # compress the whole coordinates to fragments 
        #coords_frag1 = np.array(list(itertools.compress(coordinates.tolist(), cluster_indices)))
        # invert the bitmask
        #cluster_indices = cluster_indices ^ 1
        #coords_frag2 = np.array(list(itertools.compress(coordinates.tolist(), cluster_indices)))

        self.frag1 = deepcopy(self.dimer)
        self.frag2 = deepcopy(self.dimer)

        # Now delete the atoms of the other fragment from the object with mighty pythonic list comprehensions!
        del self.frag1[[atom.index for pos, atom in enumerate(self.frag1) if cluster_indices[pos] != 1]]
        del self.frag2[[atom.index for pos, atom in enumerate(self.frag2) if cluster_indices[pos] != 2]]

        print("Finished automatic fragmentation, please remember to check the result!")
        self.__check_fragments__() 

        self.__set_charges__()
        self.__get_frontiers__()
Example No. 12
def _agglomerative_cluster_encounters(X_data, seconds_thresh):
    """ Agglomerative encounter clustering algorithm
    Input:  Length N array of data to cluster
    Output: Length N array of cluster indexes
    """
    label_arr = hier.fclusterdata(X_data, seconds_thresh, criterion='distance')
    return label_arr
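
# Hedged usage sketch (made-up timestamps, not project data): observations whose gaps stay
# under the 15-second single-linkage threshold chain into the same encounter label.
import numpy as np
toy_times = np.array([[0.0], [5.0], [12.0], [200.0], [206.0]])
print(_agglomerative_cluster_encounters(toy_times, seconds_thresh=15))  # e.g. [1 1 1 2 2]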
Example No. 13
def compute_encounters(hs, back, seconds_thresh=15):
    '''
    clusters encounters together (by time, not space)

    An encounter is a meeting, localized in time and space between a camera and
    a group of animals.

    Animals are identified within each encounter.
    '''
    if not 'seconds_thresh' in vars():
        seconds_thresh = 15
    gx_list = hs.get_valid_gxs()
    datetime_list = hs.gx2_exif(gx_list, tag='DateTime')

    unixtime_list = [io.exiftime_to_unixtime(datetime_str) for datetime_str in datetime_list]

    unixtime_list = np.array(unixtime_list)
    X = np.vstack([unixtime_list, np.zeros(len(unixtime_list))]).T
    print('[scripts] clustering')

    # Build a mapping from clusterxs to member gxs
    gx2_clusterid = fclusterdata(X, seconds_thresh, criterion='distance')
    clusterx2_gxs = [[] for _ in xrange(gx2_clusterid.max())]
    for gx, clusterx in enumerate(gx2_clusterid):
        clusterx2_gxs[clusterx - 1].append(gx)  # IDS are 1 based

    clusterx2_nGxs = np.array(map(len, clusterx2_gxs))
    print('cluster size stats: %s' % helpers.printable_mystats(clusterx2_nGxs))

    # Change IDs such that higher number = more gxs
    gx2_ex = [None] * len(gx2_clusterid)
    gx2_eid = [None] * len(gx2_clusterid)
    ex2_clusterx = clusterx2_nGxs.argsort()
    ex2_gxs = [None] * len(ex2_clusterx)
    for ex in xrange(len(ex2_clusterx)):
        clusterx = ex2_clusterx[ex]
        gxs = clusterx2_gxs[clusterx]
        ex2_gxs[ex] = gxs
        for gx in gxs:
            nGx = len(gxs)
            USE_STRING_ID = True
            if USE_STRING_ID:
                # String ID
                eid = 'ex=%r_nGxs=%d' % (ex, nGx)
            else:
                # Float ID
                eid = ex + (nGx / 10 ** np.ceil(np.log(nGx) / np.log(10)))
            gx2_eid[gx] = eid
            gx2_ex[gx] = ex

    hs.tables.gx2_ex  = np.array(gx2_ex)
    hs.tables.gx2_eid = np.array(gx2_eid)

    # Give info to GUI
    extra_cols = {'eid': lambda gx_list: [gx2_eid[gx] for gx in iter(gx_list)]}
    back.append_header('gxs', 'eid')
    back.populate_image_table(extra_cols=extra_cols)
    return locals()
Example No. 14
def clust(fp_list):
    np_fps = []
    for fp in fp_list:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    thresh = 6.5
    clusters = hcluster.fclusterdata(np_fps, thresh, criterion="distance")
    return clusters
Example No. 15
def clusterization(data, clustersNum = 2):

    import scipy.cluster.hierarchy as hcluster
    #import pylab
    data = np.array(data)
    #clusters = hcluster.fclusterdata(np.transpose(data), 3, criterion='maxclust', metric='euclidean', depth=1)
    #clusters = hcluster.fclusterdata(data, 2, criterion='maxclust', metric='euclidean', depth=1)
    thresh = 1.5
    clusters = hcluster.fclusterdata(data, thresh, criterion="distance")
    return np.array(clusters)
Example No. 16
def clusterAndPlotAverages(distmatrix,  labeldates, data, noOfClusters=0, cutoff=0, clustersize=0):
    '''runs hierarchical clustering on the given distance matrix using UPGMA 
    and plots the clusters average days, set either noOfClusters or cutoff as 
    keyword arguments and specify clustersize to plot only clusters with a 
    minimum size'''
    if noOfClusters == 0 and cutoff == 0:
        raise ValueError('Call clusterAndPlotAverages with specifying either cutoff or noOfClusters')
    if cutoff == 0:  # method="average" == UPGMA
        clusters = hierarchy.fclusterdata(distmatrix, noOfClusters, criterion='maxclust', metric='euclidean', method='average')
    else:
        clusters = hierarchy.fclusterdata(distmatrix, cutoff, criterion='distance', metric='euclidean', method='average')
    #print clusters
    groupedDays = []
    for i in range(max(clusters)):
        groupedDays.append([])
    for i in range(len(clusters)):
        groupedDays[clusters[i]-1].append(i)
    for group in groupedDays:
        if len(group)> clustersize:
            averageday(data, group, labeldates)
Example No. 17
def add_band_product_id_annotation(df, vdist=5.0):

    # maybe use fcluster instead of fclusterdata? - nope, fcluster(Z) takes a pre-calculated linkage matrix Z.
    # manually calculate Z?
    # ypos = [[ypos] for df.]
    # ypos = df.ypos[:, np.newaxis]
    product_clusters_ids = fclusterdata(
        df.ypos[:, np.newaxis], t=vdist,
        criterion='distance', metric='euclidean', depth=2, method='single'
    )
    df['product_id'] = product_clusters_ids
def Counting_Clusters(col,row,tot):
    sizes = []
    pixels = [[col[i],row[i]] for i,x in enumerate(col)]

    if (len(pixels) > 1):

        results = fclusterdata(pixels, sqrt(2.), criterion="distance", method="single")

        y = numpy.bincount(results)
        ii = numpy.nonzero(y)[0]

        j = 0
        previous = 0

        for result, hit, TOT in zip(results, pixels, tot):

            i = 0

            if y[result] > 1:

                if previous != result:

                    while i <= y[result] - 1:
                        if j < len(results):
                            j += 1
                            i += 1

                        if j == len(results):
                            break
                    else:
                        sizes.append(y[result])

            previous = result

            if y[result] == 1:
                if j < len(results):
                    sizes.append(y[result])
                    j += 1

    else:
        oneHitClusters = [[pixels[0][0], pixels[0][1], tot[0]]]

    return sizes
Example No. 19
def display_hmax():
    num_clusters = int(sys.argv[3])

    # retrieve coordinates
    f = open(sys.argv[1], 'r')
    coords = f.read().splitlines()
    for i, coord in enumerate(coords):
        coords[i] = coord.split(' ')
        if len(coords[i]) < 3:
            coords[i].append(str(0))
    # convert coordinates
    for coord in coords:
        coord[0] = int(coord[0])
        coord[1] = int(coord[1])
        coord[2] = int(coord[2])

    features = np.array(coords)

    glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT)

    X = np.array(features, np.int32)
    clusters = fclusterdata(X, int(num_clusters),
                            criterion='maxclust',
                            metric='euclidean',
                            method='complete')
    print clusters

    for i, feature in enumerate(features):

        if clusters[i] == 1:
            color = [1.0, 0.0, 0.0, .5]
        elif clusters[i] == 2:
            color = [0.0, 1.0, 0.0, .5]
        elif clusters[i] == 3:
            color = [0.0, 0.0, 1.0, .5]
        elif clusters[i] == 4:
            color = [1.0, 1.0, 0.0, .5]
        elif clusters[i] == 5:
            color = [0.0, 1.0, 1.0, .5]
        elif clusters[i] == 6:
            color = [1.0, 0.0, 1.0, .5]
        else:
            color = [1.0, 1.0, 1.0, .5]

        glPushMatrix()
        glMaterialfv(GL_FRONT,GL_DIFFUSE,color)
        glTranslatef(float(feature[0])/10,
                        float(feature[1])/10,
                        float(feature[2])/10)
        glutSolidSphere(.03,20,20)
        glPopMatrix()

    glutSwapBuffers()
    return
Example No. 20
def generate_linkage_clusters(bags_of_words_file,t=0.,method='single',metric='braycurtis'):
    """Performs clustering on the bag of words listed in the input file.

       This function reads a list of bags of words and performs clustering using hierarchical clustering, but without using the metrics defined in this module. The bags of words are listed one per line in the input file.
    """
    # first of all we need to read the vocabulary
    # we also count the lines
    voc = set()
    n_of_bags = 0
    
    bags_of_words_file.seek(0)

    for line in bags_of_words_file:
        voc.update(line.split())
        n_of_bags += 1

    # this is the space inside which the multiset vectors live
    space = sorted(voc)

    # we create the numpy array that will store the bags vectors
    data = np.zeros((n_of_bags,len(space)))

    # now we store the bags as vectors in memory
    i = 0

    bags_of_words_file.seek(0)

    for line in bags_of_words_file:
        m = Multiset(line.split())
        data[i] = m.to_vector(space)
        i += 1
        
    # now we can perform the clustering

    clusters = fclusterdata(data,t,metric=metric,method=method)
    
    # and we return a dict of clusters
    clusters_dict = dict()

    bags_of_words_file.seek(0)
    i = 0
    for line in bags_of_words_file:
        line = line.strip()
        c = clusters[i]
        v = clusters_dict.get(c,[])
        v.append(line)
        clusters_dict[c] = v
        i+=1

    return clusters_dict

Example No. 21
    def run_cosine_clustering(self, method="greedy", th_clustering=0.55):

        if not hasattr(self, "topic_word"):
            raise ValueError("Thresholding not done yet.")

        # Swap the NaNs for zeros. Turn into a numpy array and grab the parent names
        data = self.docdf.fillna(0)
        data_array = np.array(data)
        peak_names = list(data.columns.values)

        # Create a matrix with the normalised values (each parent ion has magnitude 1)
        l = np.sqrt((data_array ** 2).sum(axis=0))
        norm_data = np.divide(data_array, l)

        if method.lower() == "hierarchical":  # scipy hierarchical clustering

            clustering = hierarchy.fclusterdata(
                norm_data.transpose(), th_clustering, criterion="distance", metric="euclidean", method="single"
            )

        elif method.lower() == "greedy":  # greedy cosine clustering

            cosine_sim = np.dot(norm_data.transpose(), norm_data)
            finished = False
            total_intensity = data_array.sum(axis=0)
            n_features, n_parents = data_array.shape
            clustering = np.zeros((n_parents,), int)
            current_cluster = 1
            thresh = th_clustering
            count = 0
            while not finished:
                # Find the parent with the max intensity left
                current = np.argmax(total_intensity)
                total_intensity[current] = 0.0
                count += 1
                clustering[current] = current_cluster
                # Find other parents with cosine similarity over the threshold
                friends = np.where((cosine_sim[current, :] > thresh) * (total_intensity > 0.0))[0]
                clustering[friends] = current_cluster
                total_intensity[friends] = 0.0
                # When points are clustered, their total_intensity is set to zero.
                # If there is nothing left above zero, quit
                left = np.where(total_intensity > 0.0)[0]
                if len(left) == 0:
                    finished = True
                current_cluster += 1

        else:
            raise ValueError("Unknown clustering method")

        return peak_names, clustering
Example No. 22
def clustering(prop, threshold):
    import scipy.cluster.hierarchy as hier
    log("info")("clustering start...")
    positions = prop[['x', 'y', 'z']].copy()
    print positions.values.shape
    log("info")("akka")
    cluster_idx = hier.fclusterdata(positions.values, threshold, criterion='distance')
    log("info")("ooover")
    prop['new_label'] = cluster_idx
    prop.set_index('new_label', drop=True, append=False, inplace=True)
    prop.index.name = 'label' 
    prop = prop.sort_index()
    return prop
Example No. 23
    def detect(self, image):
        # define an 8-connected neighborhood
        neighborhood = generate_binary_structure(2, 2)

        # apply the local maximum filter; all pixels of maximal value
        # in their neighborhood are set to 1
        local_max = maximum_filter(image, footprint=neighborhood) == image
        # local_max is a mask that contains the peaks we are
        # looking for, but also the background.
        # In order to isolate the peaks we must remove the background from the mask.

        # we create the mask of the background
        background = (image < self.min_th)

        # a little technicality: we must erode the background in order to
        # successfully subtract it from local_max, otherwise a line will
        # appear along the background border (artifact of the local maximum filter)
        eroded_background = binary_erosion(background,
                                           structure=neighborhood,
                                           border_value=1)

        # we obtain the final mask, containing only peaks,
        # by removing the background from the local_max mask (xor operation)
        detected_peaks = local_max ^ eroded_background

        detected_peaks[image < self.min_th] = False
        peaks = np.array(np.nonzero(detected_peaks)).T

        if len(peaks) == 0:
            return peaks, np.array([])

        # nms
        if len(peaks) == 1:
            clusters = [0]
        else:
            clusters = fclusterdata(peaks, self.min_dist, criterion="distance")
        peak_groups = {}
        for ind_junc, ind_group in enumerate(clusters):
            if ind_group not in peak_groups.keys():
                peak_groups[ind_group] = []
            peak_groups[ind_group].append(peaks[ind_junc])
        peaks_nms = []
        peaks_score = []
        for peak_group in peak_groups.values():
            values = [image[y, x] for y, x in peak_group]
            ind_max = np.argmax(values)
            peaks_nms.append(peak_group[int(ind_max)])
            peaks_score.append(values[int(ind_max)])

        return np.float32(np.array(peaks_nms)), np.float32(
            np.array(peaks_score))
Example No. 24
def cluster(res):
  data = res.values()

  clusters = hcluster.fclusterdata(data, thresh, metric=metricname)

  clustered = {}

  for (m, c) in zip(res.keys(), clusters):
    if c not in clustered:
      clustered[c] = [m]
    else:
      clustered[c].append(m)

  return clustered.values()
Example No. 25
    def SciPyClustering(self,col,row,tot,energyGC,energyPbPC):

        pixels = [[col[i],row[i]] for i,x in enumerate(col)]
        if(len(pixels)>1):
            result=fclusterdata(pixels,sqrt(2.),criterion="distance")
            clusters=[Cluster() for i in range(max(result))]
            [clusters[x-1].addPixel(col[j],row[j],tot[j],energyGC[j],energyPbPC[j]) for j,x in enumerate(result)]
        else:
            if(len(pixels)==1):
                c=Cluster()
                c.addPixel(col[0],row[0],tot[0],energyGC[0],energyPbPC[0])
                clusters=[c]

        return clusters
Example No. 26
def run(path_to_frames_directory):
    segment1 = Segment.Segment(
        path_to_frames_directory=path_to_frames_directory, sampling_rate=25)
    forward_dict = segment1.get_shots_forward()
    segment2 = Segment.Segment(
        path_to_frames_directory=path_to_frames_directory, sampling_rate=25)
    backward_dict = segment2.get_shots_backward()

    data = []
    for frame in forward_dict:
        x = frame.get_frame_no()
        y = frame.get_correlation_val()
        coordinate = [x, y]
        data.append(coordinate)
    for frame in backward_dict:
        x = frame.get_frame_no()
        y = frame.get_correlation_val()
        coordinate = [x, y]
        data.append(coordinate)

    data_array = np.array(data)
    x = data_array[:, :1]
    y = data_array[:, 1:2]

    thresh = 25
    clusters = hcluster.fclusterdata(data_array, thresh, criterion="distance")
    plt.scatter(*np.transpose(data_array), c=clusters)
    plt.show()
    cluster_dict = {}
    for i in range(clusters.max()):
        cluster_dict[str(i + 1)] = []

    for i, val in enumerate(clusters):
        print(str(val))
        temp_list = cluster_dict[str(val)]
        temp_list.append(i)
        cluster_dict[str(val)] = temp_list

    shot_boundaries = []

    for key in cluster_dict.keys():
        minimum_frame_index = data_array[cluster_dict[key][0]][0]
        minimum_cor_value = data_array[cluster_dict[key][0]][1]
        for val in cluster_dict[key]:
            if (minimum_cor_value > data_array[val][1]):
                minimum_cor_value = data_array[val][1]
                minimum_frame_index = data_array[val][0]
        shot_boundaries.append(minimum_frame_index)

    shot_boundaries.sort()
    return shot_boundaries
Example No. 27
def get_clustering_indexes(list_data, max_distance):
    clusters = fclusterdata(list_data, t=max_distance, criterion='distance')
    clusters_with_list_data = dict()
    for index, cluster_number in enumerate(clusters):
        if not clusters_with_list_data.get(cluster_number):
            clusters_with_list_data[cluster_number] = list()
        clusters_with_list_data[cluster_number].append(list_data[index])
    clusters_to_indexes = dict()
    for index, (key, value) in enumerate(clusters_with_list_data.items()):
        clusters_to_indexes[key] = index
    indexes = [
        clusters_to_indexes[cluster_number] for cluster_number in clusters
    ]
    return indexes
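
# Hedged usage sketch for get_clustering_indexes (toy 2-D points): cluster labels are
# re-mapped to 0-based indexes in order of first appearance, so the first cluster seen is 0.
print(get_clustering_indexes([[0.0, 0.0], [0.2, 0.1], [5.0, 5.0]], max_distance=1.0))  # e.g. [0, 0, 1]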
Example No. 28
def MakeClusters(col, row, tot, tot_hd, hist0, hist1, hist2):
    oneHitClusters = []
    allHitClusters = []
    pixels = [[col[i], row[i]] for i, x in enumerate(col)]

    #for pixel in pixels :
        #hist1.Fill(pixel[0],pixel[1])

    if len(pixels) > 1:
        results = fclusterdata(pixels, np.sqrt(2.), criterion="distance", method="single") 
        y = np.bincount(results) # histogramm of cluster sizes
        ii = np.nonzero(y)[0]

        do_nothing = 0
        j = 0
        previous = 0

        for result, hit, TOT, TOT_HD in zip(results, pixels, tot, tot_hd):             
            tot_c = 0
            tot_hd_c = 0
            i = 0
            
            # process multi hit clusters
            if y[result] > 1:
                if previous != result:
                    while i <= y[result] - 1:
                        if j < len(results):
                            tot_c += tot[j]
                            tot_hd_c += tot_hd[j]
                            j += 1
                            i += 1
                        if j == len(results):
                             break
                    if tot_c != 0:
                        allHitClusters.append([hit[0], hit[1], tot_c, tot_hd_c, y[result]]) 
                        hist0.Fill(y[result])
            previous = result
            
            # process single hit clusters
            if y[result] == 1:
                if j < len(results):
                    hist0.Fill(y[result])
                    oneHitClusters.append([hit[0], hit[1], TOT, TOT_HD, y[result]])
                    allHitClusters.append([hit[0], hit[1], TOT, TOT_HD, y[result]])
                    j += 1

    if len(pixels) == 1:
        oneHitClusters = [[pixels[0][0], pixels[0][1], tot[0], tot_hd[0], 1]]       

    return oneHitClusters, allHitClusters
Example No. 29
 def clustering(self, center_pt_list):       
     pt_arr = np.asarray(center_pt_list)
     result = []
     try: result = list(fclusterdata(pt_arr, self.clustering_th, 'distance'))
     except: pass
     number_of_groups = 0
     groups = []
     if result != []:
         groups = []
         number_of_groups = max(result)
         for i in range(number_of_groups): groups.append([])
         for i in range(len(result)):
             groups[result[i]-1].append(center_pt_list[i])
     return number_of_groups, groups
Example No. 30
File: merge.py Project: neevor/grit
def reduce_internal_clustered_transcripts(internal_grpd_transcripts, gene_id,
                                          max_cluster_gap):
    """Take a set of clustered transcripts and reduce them into 
    a set of canonical transcripts, and associated sources.
    
    """
    # if there is only a single transcript, clustering doesn't make sense
    if len(internal_grpd_transcripts) == 1:
        new_t = copy.copy(internal_grpd_transcripts[0][0])
        new_t.gene_id = gene_id
        new_t.id = new_t.gene_id + "_1"
        yield (new_t, [
            internal_grpd_transcripts[0][0],
        ], [
            internal_grpd_transcripts[0][1],
        ])
        return

    # 2 transcripts are in the same cluster if both their 5' and 3' ends
    # are within 50 bp's of each other. Use the scipy cluster machinery
    # to do this for us
    transcript_ends = numpy.array([(t.exons[0][0], t.exons[-1][1])
                                   for t, s in internal_grpd_transcripts])
    cluster_indices = fclusterdata(transcript_ends,
                                   t=max_cluster_gap,
                                   criterion='distance',
                                   metric='chebyshev')

    # convert the indices returned by fclusterdata into lists of transcript
    # source pairs
    clustered_transcript_grps = defaultdict(list)
    clustered_transcript_grp_sources = defaultdict(list)
    for cluster_index, ( trans, src ) in \
            izip(cluster_indices, internal_grpd_transcripts):
        clustered_transcript_grps[cluster_index].append(trans)
        clustered_transcript_grp_sources[cluster_index].append(src)

    # finally, decide upon the 'canonical' transcript for each cluster, and
    # add it and its sources
    for cluster_index in clustered_transcript_grps.keys():
        clustered_transcripts = clustered_transcript_grps[cluster_index]
        clustered_transcripts_sources = clustered_transcript_grp_sources[
            cluster_index]
        merged_transcript = build_merged_transcript(gene_id,
                                                    clustered_transcripts)
        yield (merged_transcript, clustered_transcripts,
               clustered_transcripts_sources)

    return
Example No. 31
def checkin_clustering(checkin, t, criterion, metric, method):
    '''
    Clustering the checkin data by distance.
    Using fclusterdata method in scipy
    Args:
        checkin: pandas frame having at least two columns ['lat','lon']
        t, criterion, metric, method are parameters of fclusterdata,
        see:https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.fclusterdata.html
    Returns:
        checkin_clusters: dict{clusterno:[(lat0,lon0),(lat1,lon1),...]}
        cluster_num_center:
            dict{clusterno:(number of points in the cluster, array([center_lat,center_lon]))}
    '''
    geopoint = np.array(checkin[['lat', 'lon']])
    clusters = fclusterdata(geopoint,
                            t,
                            criterion=criterion,
                            metric=metric,
                            depth=1,
                            method=method)
    checkin_clusters = dict()
    index = 0
    for cluster_no in clusters:
        if cluster_no not in checkin_clusters.keys():
            checkin_clusters[cluster_no] = [tuple(geopoint[index])]
        else:
            checkin_clusters[cluster_no].append(tuple(geopoint[index]))
        index = index + 1
    cluster_number_center = dict()
    for cluster_no in clusters:
        cluster_number_center[cluster_no] = (len(checkin_clusters[cluster_no]),
                                             np.mean(np.array(
                                                 checkin_clusters[cluster_no]),
                                                     axis=0))


#   xlim =
#   ylim =
#   plt.scatter(geopoint[:,0], geopoint[:,1], c = clusters)
#   plt.show()
    checkin_clusters_descending = OrderedDict(
        sorted(checkin_clusters.items(),
               key=lambda kv: len(kv[1]),
               reverse=True))
    cluster_number_center_descending = OrderedDict(
        sorted(cluster_number_center.items(),
               key=lambda kv: kv[1][0],
               reverse=True))
    return checkin_clusters_descending, cluster_number_center_descending
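
# Hedged usage sketch for checkin_clustering (made-up check-ins; the 0.01-degree cut-off is
# illustrative): two nearby points and one distant one, so the largest cluster comes first.
import pandas as pd
toy_checkin = pd.DataFrame({'lat': [40.0, 40.0005, 41.0], 'lon': [-3.0, -3.0004, -3.5]})
by_cluster, centers = checkin_clustering(toy_checkin, 0.01, 'distance', 'euclidean', 'single')
print(centers)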
Example No. 32
def main(dataNameList, featurelist_dir):

    #   dataNameList = args.dataNameList

    #    featurelist_dir = args.featlist_dir
    picSrcDir = ''

    for dataName in dataNameList:
        featlist = osp.join(featurelist_dir,
                            '{0}_featlist.txt'.format(dataName))
        print 'featlist is', featlist
        data, filePathList = multiprocess_feature_data_reader(featlist)

        saveName = dataName
        # clustering
        for thresh in np.arange(0.5, 0.9, 0.02):
            print thresh
            clusters = hcluster.fclusterdata(data,
                                             thresh,
                                             metric="cosine",
                                             method='average',
                                             criterion="distance")
            print 'The number of clustered label is:', np.amax(clusters)
            print clusters
            label_result = {}
            for label in set(clusters):
                label_num = np.sum(clusters == label)
                label_result[label] = label_num

            for i in range(len(filePathList)):
                #picName = filePathList[i].split('/')[-1].replace('.npy', '')
                picName = filePathList[i].replace('.npy', '')
                srcPicPath = picSrcDir + picName
                lost_clustered_label = 'result_{0}/result_{1}/0'.format(
                    saveName, thresh)

                if label_result[clusters[i]] != 1:
                    try:
                        save_path = 'result_{0}/result_{1}/{2}'.format(
                            saveName, thresh, clusters[i])
                        os.makedirs(save_path)
                    except:
                        pass
                    shutil.copy(srcPicPath, save_path)

                else:
                    if not osp.exists(lost_clustered_label):
                        os.makedirs(lost_clustered_label)
                    shutil.copy(srcPicPath, lost_clustered_label)
Example No. 33
def getClusters(segs, db, subject='a1', thresh=20.0, plot=True):
	clusters = fclusterdata([[s] for s in segs], thresh, criterion="distance")
	if plot:
		rawData = db.data[subject]
		plt.figure(figsize=(11,9))
		plt.plot(rawData[:,0],rawData[:,2:])
		colours = cm.rainbow(np.linspace(0, 1, len(set(clusters))))
		for i,s in enumerate(segs):
			plt.axvline(rawData[:,0][s],color=colours[clusters[i]-1],linewidth=2)
		plt.title("Clustered Neural Network Segments")
		plt.xlabel("Time (Seconds)")
		plt.ylabel("Rotation (Degrees per second)")
		#savefig('/Users/robertevans/Desktop/Gait analysis graphs/Segment clustering/'+name+'.pdf', format='pdf')
		plt.show()
	return clusters
Example No. 34
def cluster_weights_agglo(weight, threshold, average=True,cosine=True,euclidean=False,chebyshev=False,manhattan=False):
    t0 = time.time()
    weight = weight.T
    weight = normalize(weight, norm='l2', axis=1)
    threshold =  1.0-threshold   # Conversion to distance measure
    if cosine==True:
        clusters = hcluster.fclusterdata(weight, threshold, criterion="distance", metric='cosine', depth=1, method='centroid')
        z = hac.linkage(weight, metric='cosine', method='complete')
    elif euclidean==True:
        clusters = hcluster.fclusterdata(weight, threshold, criterion="distance", metric='euclidean', depth=1, method='centroid')
        z = hac.linkage(weight, metric='euclidean', method='complete')
    elif chebyshev==True:
        clusters = hcluster.fclusterdata(weight, threshold, criterion="distance", metric='chebyshev', depth=1, method='centroid')
        z = hac.linkage(weight, metric='chebyshev', method='complete')
    elif manhattan==True:
        clusters = hcluster.fclusterdata(weight, threshold, criterion="distance", metric='cityblock', depth=1, method='centroid')
        z = hac.linkage(weight, metric='cityblock', method='complete')
    
    labels = hac.fcluster(z, threshold, criterion="distance")

    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)

    #print(n_clusters_)
    elapsed_time = time.time() - t0
    # print(elapsed_time)

    a=np.array(labels)
    sort_idx = np.argsort(a)
    a_sorted = a[sort_idx]
    unq_first = np.concatenate(([True], a_sorted[1:] != a_sorted[:-1]))
    unq_items = a_sorted[unq_first]
    unq_count = np.diff(np.nonzero(unq_first)[0])
    unq_idx= np.split(sort_idx, np.cumsum(unq_count))
    first_ele = [unq_idx[idx][-1] for idx in range(len(unq_idx))]
    return n_clusters_, first_ele
Example No. 35
    def cluster_lines(self, positions, threshold=20):
        """

        :param positions: list of positions [y,x] for points on each line
        :param threshold: max distance allowed for distance between points in a cluster
        :return: list of labels of which cluster each line belongs to
        """
        clusters = hcluster.fclusterdata(np.array(positions),
                                         threshold,
                                         criterion="distance")
        n_clusters = len(set(clusters))
        print("Number of clusters: {}".format(n_clusters))
        if n_clusters < 2:
            self.good_quality = False
        return clusters
Example No. 36
def find_clusters_1d_hierarchical(vals, t, **kwargs):
    """
    Find clusters in <vals> using hierarchical clustering with parameter <t>. Further parameters need to be passed via
    <kwargs>. Uses *fclusterdata* from *scipy.cluster.hierarchy*.
    """
    from scipy.cluster.hierarchy import fclusterdata

    data = vals.reshape((len(vals), 1))
    ind = fclusterdata(data, t, **kwargs)

    clusters = [np.where(ind == c_id)[0] for c_id in np.unique(ind)]

    assert len(vals) == sum(map(len, clusters))

    return clusters
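
# Hedged usage sketch for find_clusters_1d_hierarchical (toy 1-D values): with a distance
# cut-off of 0.5 the two tight groups come back as separate index arrays.
import numpy as np
toy_vals = np.array([1.0, 1.1, 1.2, 9.0, 9.1])
print(find_clusters_1d_hierarchical(toy_vals, 0.5, criterion='distance'))  # e.g. [array([0, 1, 2]), array([3, 4])]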
Example No. 37
    def buildClusters(self, th=0.05, min_cluster_size=0):
        self.labels = fclusterdata(self.points, th, criterion='distance')
        for i in range(0, len(self.labels)):
            label = self.labels[i]
            if label not in self.points_map:
                self.points_map[label] = []
                self.voxels_map[label] = []
            self.points_map[label].append(self.points[i])
            self.voxels_map[label].append(self.voxels[i])

        for l, points in self.points_map.iteritems():
            if len(points) >= min_cluster_size:
                self.objects_map[l] = Instance(points, self.voxels_map[l], l)

        return self.labels
Example No. 39
def merge_paths(rides):
    waypoints = list(itertools.chain(*[ride.route.waypoints
                                       for ride in rides]))
    waypoints = sorted(waypoints, key=lambda x: x.country)

    logger.info("Merging {} rides with {} total waypoints".format(
        len(rides), len(waypoints)))

    for country, group in itertools.groupby(waypoints,
                                            key=lambda x: x.country):
        waypoints = list(group)
        country_lat_lng_points = [(x.lat, x.lng) for x in waypoints]
        country_xyz_points = [
            latlng_to_xyz(lat, lng) for lat, lng in country_lat_lng_points
        ]

        logger.debug("Processing {} with {} waypoints".format(
            country, len(country_xyz_points)))

        wh = whiten(country_xyz_points)
        k_guess = max(1, len(country_xyz_points) / BEARABLE_CLUSTER_SIZE)
        k_centroids = kmeans(wh, k_guess)[0]
        k_labels = vq(wh, k_centroids)[0]

        k_labeled = sorted(zip(country_xyz_points, country_lat_lng_points,
                               waypoints, k_labels),
                           key=lambda x: x[3])
        logger.debug("Got {} miniclusters".format(len(k_centroids)))
        for key, gr in itertools.groupby(k_labeled, key=lambda x: x[3]):
            gr = list(gr)
            k_waypoints = [x[2] for x in gr]
            k_lat_lng_points = [x[1] for x in gr]
            k_xyz_points = [x[0] for x in gr]
            logger.debug("Running {} minicluster with {} waypoints".format(
                key, len(k_waypoints)))
            cluster_labels = fclusterdata(np.array(k_xyz_points),
                                          0.2,
                                          criterion="distance",
                                          metric="euclidean")
            centroids = cluster_centroids(zip(k_lat_lng_points,
                                              cluster_labels))
            logger.debug("Got {} hierarhical clusters".format(
                len(set(cluster_labels))))

            for i in range(0, len(k_waypoints)):
                new_lat, new_lng = centroids[cluster_labels[i] - 1]
                k_waypoints[i].lat = new_lat
                k_waypoints[i].lng = new_lng
Example No. 40
def identify_center_band(energies, k):
    """
    Tries to identify the bounds of the central energy band (around the Fermi
    energy) using hierarchical clustering.

    Inputs:
        energies = sorted energy levels
        k = assumed number of distinct bands
    Outputs:
        central_band = energies in the identified central band
    """
    E = np.copy(energies)
    E.resize((E.shape[0], 1))  # Transpose
    labels = fclusterdata(E, k, criterion='maxclust')  # Clustering
    # Find the levels closest to the Fermi level
    return energies[np.where(labels == labels[len(energies) // 2])]
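
# Hedged usage sketch for identify_center_band (made-up, well-separated energy levels): with
# k=3 assumed bands, the pair around zero should come back as the central band.
import numpy as np
toy_energies = np.array([-2.0, -1.9, 0.0, 0.1, 2.0, 2.1])
print(identify_center_band(toy_energies, k=3))  # expect something like [0.  0.1]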
Example No. 41
 def hcluster_filter(self, v, t):
     X = np.array(list(zip(v, t)))
     T = hcluster.fclusterdata(X, 0.1)
     lens = {}
     best = []
     w, h = X.shape
     for idx, clno in enumerate(T):
         lens.setdefault(clno, 0)
         lens[clno] += 1
     a = list(lens.values())
     b = list(lens.keys())
     clmax = b[a.index(max(a))]
     best = [idx for idx, clno in enumerate(T) if clno == clmax]
     v = v[best]
     t = t[best]
     return v, t
Example No. 42
def precluster(people, ngo):
    print("in precluster!!!!")
    for p in people:
        p.append(1)
    for n in ngo:
        n.append(0)
    data = people
    newngo = []
    datanp = np.zeros((len(data), 2))
    for i, d in enumerate(data):
        if len(d) == 5:
            datanp[i, 0] = d[1]
            datanp[i, 1] = d[2]
    if len(data) > 1:
        cluster = fclusterdata(datanp, 1)
    else:
        cluster = [0]
    for i, d in enumerate(data):
        d.append(cluster[i])
    data.sort(key=lambda x: x[5])

    curr = data[0][5]
    i = 0
    while i < len(data):
        temp = []
        while i < len(data) and data[i][5] == curr:
            temp.append(data[i])
            i += 1
        if i < len(data):
            curr = data[i][5]
        cenlat = 0
        cenlon = 0
        for t in temp:
            cenlat += t[1] / len(temp)
            cenlon += t[2] / len(temp)
        minindex = 0
        min = ngo[0]
        for n in ngo:
            if haversine(n[5], n[4], cenlon, cenlat) < haversine(
                    min[5], min[4], cenlon, cenlat):
                min = n
        for t in temp:
            print("inserting!!!!!!")
            db.insertngocurr(min[0], str(t[1]), str(t[2]), t[0])
            db.commit()
            db.insertrescuengo(t[0], min[0])
            db.commit()
Example No. 43
File: merge.py Project: nboley/grit
def reduce_internal_clustered_transcripts( 
        internal_grpd_transcripts, gene_id, max_cluster_gap ):
    """Take a set of clustered transcripts and reduce them into 
    a set of canonical transcripts, and associated sources.
    
    """
    # if there is only a single transcript, clustering doesn't make sense
    if len( internal_grpd_transcripts ) == 1: 
        new_t = copy.copy(internal_grpd_transcripts[0][0])
        new_t.gene_id = gene_id
        new_t.id = new_t.gene_id + "_1"
        yield ( new_t, 
                [internal_grpd_transcripts[0][0],], 
                [internal_grpd_transcripts[0][1],] )
        return

    # 2 transcripts are in the same cluster if both their 5' and 3' ends
    # are within 50 bp's of each other. Use the scipy cluster machinery 
    # to do this for us
    transcript_ends = numpy.array( [(t.exons[0][0], t.exons[-1][1])
                                    for t, s in internal_grpd_transcripts])    
    cluster_indices = fclusterdata( transcript_ends, t=max_cluster_gap,
                                    criterion='distance', metric='chebyshev' )
    
    # convert the indices returned by fclusterdata into lists of transcript
    # source pairs
    clustered_transcript_grps = defaultdict( list )
    clustered_transcript_grp_sources = defaultdict( list )
    for cluster_index, ( trans, src ) in \
            izip(cluster_indices, internal_grpd_transcripts):
        clustered_transcript_grps[cluster_index].append( trans )
        clustered_transcript_grp_sources[cluster_index].append( src )
    
    # finally, decide upon the 'canonical' transcript for each cluster, and 
    # add it and its sources
    for cluster_index in clustered_transcript_grps.keys():
        clustered_transcripts = clustered_transcript_grps[cluster_index]
        clustered_transcripts_sources = clustered_transcript_grp_sources[
            cluster_index]
        merged_transcript = build_merged_transcript(
                gene_id, clustered_transcripts)
        yield ( merged_transcript, 
                clustered_transcripts, 
                clustered_transcripts_sources)
    
    return
Example No. 44
    def group_measures(self, measure_filter):
        """
        Groups the measures that are the result of measure_filter
        """
        measures = measure_filter.all()

        data = np.array([self._key_fn(m) for m in measures])
        npdata = np.reshape(np.array(data), [len(data), 1])

        clusters = hierarchy.fclusterdata(npdata, **self._clustering_args)

        grouped = {}
        for i, cluster in enumerate(clusters):
            current = grouped.get(cluster, [])
            current.append(measures[i])
            grouped[cluster] = current
        return grouped
Example No. 45
def cluster_docs(texts, min_clusters_per_leaf=3, hierarchy=True):
    """
    Function for clustering texts.
    Parameters:
        texts (scipy.sparse.csr_matrix): vector implementation for all text in sparse matrix
        min_clusters_per_leaf (int): minimum number of samples per cluster
        hierarchy (bool): use hierarchy clustering algorithm
    Returns:
        clusters (list): list of clusters for texts
    """
    thresh = 1  # empirical diameter for samples of one cluster
    if hierarchy:
        clusters = hcluster.fclusterdata(texts.todense(),
                                         thresh,
                                         criterion="distance")
    else:
        clusters = DBSCAN(eps=1, min_samples=min_clusters_per_leaf).fit_predict(texts)
    return clusters
Example No. 46
    def SciPyClustering(self, col, row, tot):

        pixels = [[col[i], row[i]] for i, x in enumerate(col)]
        if (len(pixels) > 1):
            result = fclusterdata(pixels, sqrt(2.), criterion="distance")
            clusters = [Cluster() for i in range(max(result))]
            [
                clusters[x - 1].addPixel(col[j], row[j], tot[j])
                for j, x in enumerate(result)
            ]
        else:
            if (len(pixels) == 1):
                c = Cluster()
                c.addPixel(col[0], row[0], tot[0])
                clusters = [c]

        print len(clusters)
        return clusters
 def get_clusters(self, X, n_clusters=None):
     """
     Clusters a set of points and returns the indices of the points
     within each cluster.
     :param X: An (N, D) tensor representing N points in D dimensions
     :param n_clusters: The number of clusters to use for KMeans, or None to use hierarchical
                        clustering and automatically determine the number of clusters.
     :returns: cluster_indices, a list of lists of indices
     """
     if n_clusters is None:
         cluster_labels = hcluster.fclusterdata(X, 1)
         print("Hierarchical clustering returned {} clusters".format(len(set(cluster_labels))))
     else:
         km = KMeans(n_clusters=n_clusters)
         km.fit(X)
         cluster_labels = km.labels_
     cluster_indices = [ np.nonzero(cluster_labels == label)[0] for label in set(cluster_labels) ]
     return cluster_indices
Example No. 48
    def _make_clusters(self, matrix, num_clusters_per_roi, metric):
        """clusters a given matrix by into specified number of clusters according to given metric"""

        from scipy.cluster.hierarchy import fclusterdata

        # maxclust needed to ensure t is interpreted as # clusters in hierarchical clustering
        group_ids = fclusterdata(matrix,
                                 metric=metric,
                                 t=num_clusters_per_roi,
                                 criterion='maxclust')
        group_set = np.unique(group_ids)
        clusters = [
            self._summary_func(matrix[group_ids == group, :],
                               axis=0,
                               keepdims=True) for group in group_set
        ]

        return np.vstack(clusters).squeeze()
Example No. 49
 def Clustering(self):
     nodes = list(glob.allPoint.values())
     points = [[_.x, _.y] for _ in nodes]
     threshold = 450
     clusters = hcluster.fclusterdata(points,
                                      threshold,
                                      criterion="distance")
     for i in range(len(nodes)):
         clusterID = clusters[i]
         nodes[i].cluster = clusterID
         if clusterID not in glob.allCluster:
             glob.allCluster[clusterID] = Cluster(random.randint(100, 255),
                                                  random.randint(100, 255),
                                                  random.randint(100, 255))
         c = glob.allCluster[clusterID]
         c.points.add(nodes[i])
         #nodes[i].color = c.color
     for cluster in glob.allCluster.values():
         cluster.renewAttribute()
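Example no. 50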
def findLines(img, threshold=10, plot=True):
    imgBlur = cv2.GaussianBlur(img, (5, 5), 0)
    imgT = cv2.threshold(imgBlur, 70, 255, cv2.THRESH_BINARY)[1]

    lines = cv2.HoughLines(imgT, 1, np.pi / 180, 200)
    lines = np.squeeze(lines)
    lines = lines[lines[:, 1] > 0.5]
    """fig, ax = plt.subplots()
    ax.imshow(imgT, cmap="gray")
    
    for rho,theta in lines:
        a = np.cos(theta)
        b = np.sin(theta)
        x0 = a*rho
        y0 = b*rho
        x1 = int(x0 + 1000*(-b))
        y1 = int(y0 + 1000*(a))
        x2 = int(x0 - 1000*(-b))
        y2 = int(y0 - 1000*(a))
    
        ax.plot([x1,x2], [y1,y2], 'm-', lw=2)    
    rows, cols = img.shape[0], img.shape[1]
    ax.axis((0, cols, rows, 0))
    plt.show()  """

    clusters = hcluster.fclusterdata(lines, threshold, criterion="distance")
    n_clusters = len(set(clusters))

    if plot:
        plt.scatter(*np.transpose(lines), c=clusters)
        title = "threshold: %f, number of clusters: %d" % (threshold,
                                                           n_clusters)
        plt.title(title)
        plt.show()

    mean_lines = [[] for _ in range(n_clusters)]
    for i in range(len(lines)):
        mean_lines[clusters[i] - 1].append(list(lines[i]))

    # average the (rho, theta) parameters within each cluster
    return np.array([np.mean(cluster_lines, axis=0)
                     for cluster_lines in mean_lines])
Example no. 51
def detect_grid(coordinates):
    """
    Check if sample points form regular, rectangular grid

    :param coordinates:
    :return: (xs, ys, zs) axes of grid
    """
    dtype = coordinates.dtype
    coord_round = coordinates.round(decimals=6)
    tol = {'rtol': 0, 'atol': 1e-5}

    axes = []

    # clustering
    for coord_dim in coord_round.T:
        # pre-clustering (not really unique due to float + rounding ridges)
        xs = np.unique(coord_dim)

        # hierarchical clustering
        xc = hcluster.fclusterdata(xs[:, np.newaxis],
                                   1e-5,
                                   criterion="distance")
        _, xu_idx = np.unique(xc, return_index=True)
        xs = sorted(xs[xu_idx])

        xs_step = np.diff(xs)
        assert np.allclose(xs_step, np.median(xs_step), **tol), "xs_step"
        axes.append(xs)

    # assumption: fraction coords were laid out on regular, rectangular
    #             grid parallel to axes
    # test:

    g_min = np.min(coordinates, axis=0)
    g_max = np.max(coordinates, axis=0)

    axes_grid = []
    for dim_min, dim_max, xs in zip(g_min.T, g_max.T, axes):
        xs_grid = np.linspace(dim_min, dim_max, len(xs), dtype=dtype)
        assert np.allclose(xs, xs_grid, **tol), "xs"
        axes_grid.append(xs_grid)

    return axes_grid
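
A self-contained sketch (my addition) of the intended input: points on a regular 2-D grid with a tiny amount of float noise, assuming the same `hcluster` import used above; the values are made up:

import numpy as np

xs = np.linspace(0.0, 1.0, 5)
ys = np.linspace(0.0, 2.0, 3)
grid = np.array([(x, y) for y in ys for x in xs])      # (15, 2) regular grid
grid += np.random.uniform(-1e-7, 1e-7, grid.shape)     # noise well below the 1e-5 tolerance
axes = detect_grid(grid)
print([len(a) for a in axes])                          # e.g. [5, 3]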
Example no. 52
    def cluster(self, participants: List['BaseParticipant'],
                server: BaseParticipant) -> Dict[str, List['BaseParticipant']]:
        model_predictions = np.array([self.predict(p) for p in participants])
        cluster_ids = hac.fclusterdata(model_predictions,
                                       self.max_value_criterion,
                                       self.criterion,
                                       method=self.linkage_mech,
                                       metric=self.dis_metric)

        num_cluster = max(cluster_ids)

        # Allocate participants to clusters, keyed by the cluster id as a string
        clusters_hac_dic = {str(cid): [] for cid in range(1, num_cluster + 1)}
        for i, participant in enumerate(participants):
            participant.cluster_id = str(cluster_ids[i])
            clusters_hac_dic[participant.cluster_id].append(participant)
        return clusters_hac_dic
Example no. 53
    def process_list_update(self):
        #print(self.ls)
        if len(self.ls) > 3:
            # Do clustering on the (x, y) part of each entry
            data = []
            for item in self.ls:
                data.append(item[0:2])
            clusters = hcluster.fclusterdata(data,
                                             self.threshold,
                                             criterion="distance")
            # one list of points per cluster (range() is not assignable, so build real lists)
            self.cluster_points = [[] for _ in range(max(clusters))]
            for c in range(max(clusters)):
                members = [x == c + 1 for x in clusters]
                self.cluster_points[c] = [
                    item for item, keep in zip(self.ls, members) if keep
                ]
                self.check_for_new_batteries(self.cluster_points[c])
                #print(self.cluster_points)
                #plt.scatter(*np.transpose(data), c=clusters)
                #plt.show()
            print("Finished Clustering")
Example no. 54
    def get_baseline_segs(self):
        """
        Get baseline segments, given the baseline value
        """

        yDown, yUp, stripeNum, noiseStripeNum = self._yDown, self._yUp,\
            self.stripeNum, self.noiseStripeNum

        assert stripeNum > 0

        # here should keep idx
        ycV = np.array([
            np.log(seg.tReadNum + 1) - np.log(seg.nReadNum + 1)
            for seg in self.segPool.segments
        ])

        # record whether each log-ratio value is an outlier
        statusYcV = np.logical_and(ycV > yDown, ycV < yUp)

        ycV = ycV[statusYcV]

        yFcd = ycV.reshape(ycV.shape[0], 1)
        clusters = hierarchy.fclusterdata(
            yFcd, stripeNum + noiseStripeNum, criterion="maxclust",
            method="complete")

        _, blSegL = self.__get_baseline_from_stripe(clusters, ycV, statusBoolL = statusYcV)
        # writeToFile(self,clusters)
        # debug
        ycVBL = np.array([
            np.log(seg.tReadNum + 1) - np.log(seg.nReadNum + 1)
            for seg in blSegL])

        statusYcVBL = np.logical_or(ycVBL <= yDown, ycVBL >= yUp)
        if sum(statusYcVBL) > 0:
            print "baseline segment is not correct"

        return blSegL
Example no. 55
    def cluster_hierarchical(self, cluster_count, modifier=1):

        self.clustering_name = "hierarchical clustering " + str(
            time.process_time())

        distance_measure_data_values = self._similarity_measure_data_pre_processing(
        )
        metric = self.get_degree_and_euclidean_distance_metric(modifier)

        fclust = fclusterdata(distance_measure_data_values,
                              t=cluster_count,
                              criterion='maxclust',
                              metric=metric)
        bp_list = self.data_frame.get_border_points_point_only_df(
        ).index.tolist()

        bp_clus = list(zip(bp_list, fclust))

        self.data_frame.add_result_name(self.clustering_name, -1,
                                        ColType.CLUSTER_LABEL)

        for ind, clus in bp_clus:
            self.data_frame.add_result(self.clustering_name, ind, clus)

        self._assign_noise(self.clustering_name)
        self._assign_inner_points(self.clustering_name)

        self.cluster_count = len(
            set(self.data_frame.df[self.clustering_name].tolist()))
        self.clustering_result = self.data_frame.df[
            self.clustering_name].tolist()

        #  TODO If ever needed Dendrogram code:
        #  link = linkage(distance_measure_data_values, 'single', metric=self.degree_and_euclidean_distance_metric)
        #  fig = plt.figure(figsize=(25, 10))
        #  dn = dendrogram(link)
        #  plt.show()

        return self.clustering_name
Example no. 56
	def calc_clusters(self, tol=0.02, report=False):

		#We use the transpose of the training matrix in order to cluster features on their pairwise correlation throughout the dataset.
		try:
			clusters = hcluster.fclusterdata(self.nArray.T, tol, criterion='distance', metric='correlation', method='average')
		except ValueError:
			Z = hcluster.linkage(self.nArray.T, method='average', metric='correlation')
			np.clip(Z, 0, 10000, out=Z)
			clusters = hcluster.fcluster(Z, tol, criterion='distance')

		clusterDict = {x: [] for x in list(set(clusters))}

		for i, x in enumerate(clusters):
			clusterDict[x].append(i)

		goodBitList = []

		for k, v in clusterDict.items():  # `choice` comes from the random module
			if len(v) > 1:
				goodBitList.append(choice(v))
			elif len(v) == 1:
				goodBitList.append(v[0])
			else:
				continue

		goodBitList.sort()

		goodBits = [self.nArray[:, x] for x in goodBitList]

		self.nArray = np.vstack(tuple(goodBits)).T

		if len(self.bitAddresses) == 0:
			self.bitAddresses = {k: v for k, v in zip(range(self.nArray.shape[1]), goodBitList)}
		else:
			updateGoodBits = [self.bitAddresses[x] for x in goodBitList]
			self.bitAddresses = {k: v for k, v in zip(range(self.nArray.shape[1]), sorted(updateGoodBits))}

		if report:
			return self.nArray.shape, self.bitAddresses
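
To illustrate the idea behind calc_clusters, which clusters feature columns by pairwise correlation and keeps one representative per cluster, here is a toy sketch of my own, not taken from the class above:

import numpy as np
import scipy.cluster.hierarchy as hcluster
from random import choice

X = np.random.rand(50, 4)
X[:, 1] = 2.0 * X[:, 0] + 0.01 * np.random.rand(50)    # column 1 nearly duplicates column 0

# cluster the *columns* (hence the transpose) on correlation distance
labels = hcluster.fclusterdata(X.T, 0.02, criterion='distance',
                               metric='correlation', method='average')
keep = sorted(choice(np.flatnonzero(labels == lab).tolist()) for lab in set(labels))
X_reduced = X[:, keep]                                  # one representative column per cluster
print(labels, keep, X_reduced.shape)                    # e.g. [1 1 2 3] [0, 2, 3] (50, 3)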
Example no. 57
def fusion_cluster(bboxes,
                   method='cluster',
                   distance_threshold=0.1,
                   nms_threshold=0.3):
    '''
        bboxes: bboxes to be fused
        method: 'cluster' or 'nms'
    '''
    if bboxes.shape[0] <= 1:
        return Disjoint(), bboxes
    if method == 'cluster':
        clusters = hcluster.fclusterdata(bboxes,
                                         distance_threshold,
                                         criterion="distance",
                                         depth=2)
        print('clusters', clusters)
        cluster_num = len(set(clusters))
        cluster_set_data = [[] for _ in range(cluster_num)]
        for i, cluster_id in enumerate(clusters):
            cluster_set_data[cluster_id - 1].append(i)

        cluster_count = np.zeros((cluster_num, 1), dtype=np.int32)
        cluster_set = Disjoint()
        cluster_set.sets = cluster_set_data
        fused_bboxes = np.zeros((cluster_num, bboxes.shape[1]))
        for k in range(bboxes.shape[0]):
            fused_bboxes[clusters[k] - 1] += bboxes[k]
            cluster_count[clusters[k] - 1] += 1
        fused_bboxes /= cluster_count
        # print(cluster_count)
        # print('fused_bboxes', fused_bboxes.shape)
        # print(np.where(cluster_count>1))
        # fused_bboxes = fused_bboxes[np.where(cluster_count>1)[0]]
        return cluster_set, fused_bboxes  # (n, 4)
    elif method == 'nms':
        return nms(bboxes, nms_threshold)
    else:
        logging.error(method + ' is not implemented yet')
        raise NotImplementedError(method)
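
A quick usage sketch (not from the source), assuming the Disjoint helper and the hcluster import used by fusion_cluster are in scope; the box coordinates are invented:

import numpy as np

boxes = np.array([[10., 10., 50., 50.],
                  [12., 11., 52., 49.],      # overlaps the first box
                  [200., 200., 240., 260.]])
groups, fused = fusion_cluster(boxes, method='cluster', distance_threshold=10.0)
print(fused)   # e.g. two rows: the averaged pair and the lone box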
Example no. 58
    def _detect_cluster(self):
        all_para = []
        for i in self.pages:
            all_para.extend(self.pages[i]["para"])

        features = []
        for item in all_para:
            # Parse "(r, g, b)"-style colour strings; fall back to black on any failure
            try:
                r, g, b = item["ncolour"][1:-1].split(",") if item[
                    "ncolour"] != 'None' and item["ncolour"] != '0' else [
                        0, 0, 0
                    ]
            except Exception:
                r, g, b = [0, 0, 0]

            bold = 1 if "bold" in item["font"].lower() else 0
            features.append(
                [float(item["size"]) / 6,
                 float(r),
                 float(g),
                 float(b), bold])

        thres = 0.1
        clusters = hcluster.fclusterdata(np.array(features),
                                         thres,
                                         criterion="distance")

        cluster_text = defaultdict(list)
        for cluster_id, para in zip(clusters, all_para):
            cluster_text[int(cluster_id)].append(para["text"])

        self.cluster_text = cluster_text

        count = 0

        for i in self.pages:
            for para_idx, _ in enumerate(self.pages[i]["para"]):
                self.pages[i]["para"][para_idx]["cluster_id"] = clusters[count]
                count += 1
Example no. 59
    def getCentroids(self):
        centroids = {}
        for i in range(int(self.utils.initial_cluster_size), int(self.utils.max_cluster_size)+1):
            if self.memory[str(i)]['arrayMeas'] is not None:
                self.my_logger.debug("GETCENTROIDS state "+ str(i) +" measurements : "+ str(self.memory[str(i)]['arrayMeas']))
                if len(self.memory[str(i)]['arrayMeas']) > 1:
#                Y = pdist(self.memory[str(i)]['arrayMeas'], 'seuclidean')
                    Y = self.memory[str(i)]['arrayMeas']
#                Z = centroid(Y)
#                Z = linkage(Y, 'single') # single, complete, average, weighted, median centroid, ward
#                T = fcluster(Z, t=1.0, criterion='distance')
                    T= fclusterdata(self.memory[str(i)]['arrayMeas'], t=15.0, criterion='distance', metric='euclidean', method='single')
#                self.my_logger.debug("GETCENTROIDS state "+ str(i) +" centroids: "+ str(Z))
                    self.my_logger.debug("GETCENTROIDS state "+ str(i) +" clusters: "+ str(T))
                    Z = centroid(Y)
                    self.my_logger.debug("GETCENTROIDS state "+ str(i) +" centroid func: "+ str(Z))
                else:
                    centroids[str(i)] = {}
                    centroids[str(i)]['throughput'] = self.memory[str(i)]['arrayMeas'][0][0]
                    centroids[str(i)]['latency'] = self.memory[str(i)]['arrayMeas'][0][1]
        
        self.my_logger.debug("GETCENTROIDS centroids: "+ str(centroids))
        return centroids
Example no. 60
def plot(data, step):
    fig, ax = plt.subplots()

    # clustering
    threshold = 1.2
    clusters = hierarchy.fclusterdata(data,
                                      threshold,
                                      criterion="distance",
                                      metric=periodic_distance)

    # plotting
    ax.scatter(*np.transpose(data), c=clusters)
    #ax.set_xlim(0.0, 20.0)
    ax.axis("equal")
    title = "threshold: %f, number of clusters: %d, step: %d" % (
        threshold, len(set(clusters)), step)
    plt.title(title)
    fig.tight_layout()

    filename = out.replace(".png", "&step=%d.png" % (step))
    plt.savefig(filename)

    return len(set(clusters))
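
periodic_distance is not shown in this snippet; as an assumption on my part, for points living in a periodic box of side BOX it could be a minimum-image Euclidean distance, which fclusterdata accepts as a callable metric:

import numpy as np

BOX = 20.0  # hypothetical periodic box length

def periodic_distance(u, v):
    """Euclidean distance under periodic boundary conditions (minimum-image convention)."""
    d = np.abs(np.asarray(u) - np.asarray(v))
    d = np.where(d > BOX / 2.0, BOX - d, d)
    return float(np.sqrt(np.sum(d * d)))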