def kmeans2():
    features = locations()
    whitened = whiten(features)
    book = array((whitened[0],whitened[2]))
    # Expected output (approximately):
    # (array([[ 2.3110306 ,  2.86287398],
    #         [ 0.93218041,  1.24398691]]), 0.85684700941625547)
    return kmeans(whitened,book)

def kmeans1():
    features = array([[ 1.9,2.3], [ 1.5,2.5], [ 0.8,0.6], [ 0.4,1.8], [ 0.1,0.1],
                      [ 0.2,1.8], [ 2.0,0.5], [ 0.3,1.5], [ 1.0,1.0]])
    whitened = whiten(features)
    book = array((whitened[0],whitened[2]))
    # Expected output (approximately):
    # (array([[ 2.3110306 ,  2.86287398],
    #         [ 0.93218041,  1.24398691]]), 0.85684700941625547)
    return kmeans(whitened,book)
def clustering_scipy_kmeans(features, n_clust = 8):
  """
  """
  whitened = whiten(features)
  print whitened.shape
  
  initial = [kmeans(whitened,i) for i in np.arange(1,12)]
  plt.plot([var for (cent,var) in initial])
  plt.show()
  
  #cent, var = initial[3]
  ##use vq() to get as assignment for each obs.
  #assignment,cdist = vq(whitened,cent)
  #plt.scatter(whitened[:,0], whitened[:,1], c=assignment)
  #plt.show()
  
  codebook, distortion = kmeans(whitened, n_clust)
  print codebook, distortion
  assigned_label, dist = vq(whitened, codebook)
  for ii in range(8):
    plt.subplot(4,2,ii+1)
    plt.plot(codebook[ii])
  plt.show()
  
  centroid, label = kmeans2(whitened, n_clust, minit = 'points')
  print centroid, label
  for ii in range(8):
    plt.subplot(4,2,ii+1)
    plt.plot(centroid[ii])
  plt.show()
def kmeans_net(net, layers, num_c=16, initials=None):
    # net: the network
    # layers: the layers to quantize
    # num_c: number of quantization levels per layer
    # initials: initial cluster centers
    codebook = {} # quantization codebook
    if type(num_c) == type(1):
        num_c = [num_c] * len(layers)
    else:
        assert len(num_c) == len(layers)

    # Run the cluster analysis for each layer
    print "==============Perform K-means============="
    for idx, layer in enumerate(layers):
        print "Eval layer:", layer
        W = net.params[layer][0].data.flatten()
        W = W[np.where(W != 0)] # keep only the non-zero weights
        # By default, the cluster centers are spaced uniformly over the weight range
        if initials is None:  # Default: uniform sample
            min_W = np.min(W)
            max_W = np.max(W)
            initial_uni = np.linspace(min_W, max_W, num_c[idx] - 1)
            codebook[layer], _ = scv.kmeans(W, initial_uni)
        elif type(initials) == type(np.array([])):
            codebook[layer], _ = scv.kmeans(W, initials)
        elif initials == 'random':
            codebook[layer], _ = scv.kmeans(W, num_c[idx] - 1)
        else:
            raise Exception

        # Put the zero weight back as an extra code
        codebook[layer] = np.append(0.0, codebook[layer])
        print "codebook size:", len(codebook[layer])

    return codebook
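A rough sketch of how such a per-layer codebook could be applied afterwards, assuming `scv` is `scipy.cluster.vq`; the weight vector here is a random stand-in for `net.params[layer][0].data.flatten()`, and this quantization step is not part of the function above:

import numpy as np
import scipy.cluster.vq as scv

W = np.random.randn(1000)            # stand-in for a layer's flattened weights
codebook, _ = scv.kmeans(W, 16)      # 16 cluster centers over the weight values
labels, _ = scv.vq(W, codebook)      # index of the nearest center for each weight
W_quantized = codebook[labels]       # each weight replaced by its cluster center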
Example #5
File: t.py  Project: mcaprari/rt-pub
def custom():
	_items = {}
	users = []

	for line in open('my_items_likehood.txt'):
		user, item = keys(line)
		users.append(user)
		if item in _items:
			_items[item].append(user)
		else:
			_items[item] = [user]


	sorted_users = sorted(users)
	l = len(sorted_users)
	items={}
	count=0
	features=[]
	for item in _items:
	
		features.append(user_matrix(l, _items[item], sorted_users))
		if count == 100: break
		count += 1

	print 'whiten'
	whitened = whiten(array(features))
	print 'kmeans'
	print kmeans(whitened, 2)  # the original call omitted the cluster count; 2 is an arbitrary placeholder
	print "%d items voted by %d users" % (len(_items), len(users))
Example #6
def _get_larger_chroms(ref_file):
    """Retrieve larger chromosomes, avoiding the smaller ones for plotting.
    """
    from scipy.cluster.vq import kmeans, vq
    all_sizes = []
    for c in ref.file_contigs(ref_file):
        all_sizes.append(float(c.size))
    all_sizes.sort()
    # separate out smaller chromosomes and haplotypes with kmeans
    centroids, _ = kmeans(np.array(all_sizes), 2)
    idx, _ = vq(np.array(all_sizes), centroids)
    little_sizes = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx, all_sizes)))
    little_sizes = [x[1] for x in little_sizes]
    # create one more cluster with the smaller, removing the haplotypes
    centroids2, _ = kmeans(np.array(little_sizes), 2)
    idx2, _ = vq(np.array(little_sizes), centroids2)
    little_sizes2 = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx2, little_sizes)))
    little_sizes2 = [x[1] for x in little_sizes2]
    # get any chromosomes not in haplotype/random bin
    thresh = max(little_sizes2)
    larger_chroms = []
    for c in ref.file_contigs(ref_file):
        if c.size > thresh:
            larger_chroms.append(c.name)
    return larger_chroms
Example #7
def cluster(df, means, csv_min, csv_max):
    data = []
    for i in range(csv_min, csv_max):
        a = array(df.ix[:, i].values)
        b = a[a != "--"]
        print np.sort(kmeans(b.astype(np.float), means)[0])
        data.append(np.sort(kmeans(b.astype(np.float), means)[0]))
    return data
Example #8
    def test_kmeans_lost_cluster(self):
        # This will cause kmeans to have a cluster with no points.
        data = np.fromfile(DATAFILE1, sep=", ")
        data = data.reshape((200, 2))
        initk = np.array([[-1.8127404, -0.67128041], [2.04621601, 0.07401111], [-2.31149087, -0.05160469]])

        kmeans(data, initk)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            kmeans2(data, initk, missing="warn")

        assert_raises(ClusterError, kmeans2, data, initk, missing="raise")
Example #9
    def test_kmeans_lost_cluster(self):
        # This will cause kmeans to have a cluster with no points.
        data = TESTDATA_2D
        initk = np.array([[-1.8127404, -0.67128041],
                         [2.04621601, 0.07401111],
                         [-2.31149087,-0.05160469]])

        kmeans(data, initk)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', UserWarning)
            kmeans2(data, initk, missing='warn')

        assert_raises(ClusterError, kmeans2, data, initk, missing='raise')
 def clusterkmeans(self):
     wh = whiten(self.counts) #normalizes the counts for easier clustering
     scale = self.counts[0] / wh[0]
     #compute kmeans for  k = 1,2 compare the distortions and choose the better one
     one = kmeans(wh, 1)
     two = kmeans(wh, 2)
     if one[1] < two[1]:
         print 'found only one cluster'
         threshold = None
     else:
         km = two
         threshold = scale * km[0].mean() #set threshold to be the average of two centers
     return threshold
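The same decision rule in isolation: cluster a 1-D signal with k = 1 and k = 2 and only keep a threshold when two clusters genuinely fit better than one. A minimal sketch on synthetic counts, assuming nothing beyond NumPy and SciPy:

import numpy as np
from scipy.cluster.vq import whiten, kmeans

# Synthetic counts: a large low population plus a smaller high population.
counts = np.concatenate([np.random.normal(5, 1, 200), np.random.normal(60, 5, 50)])

wh = whiten(counts)            # normalize to unit variance
scale = counts[0] / wh[0]      # factor to map whitened centers back to raw counts

one = kmeans(wh, 1)            # (codebook, distortion) with a single cluster
two = kmeans(wh, 2)
if one[1] < two[1]:
    threshold = None           # one cluster already explains the data better
else:
    threshold = scale * two[0].mean()   # average of the two centers, in raw units
print(threshold)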
Example #11
    def test_kmeans_lost_cluster(self):
        # This will cause kmeans to have a cluster with no points.
        data = TESTDATA_2D
        initk = np.array([[-1.8127404, -0.67128041],
                         [2.04621601, 0.07401111],
                         [-2.31149087,-0.05160469]])

        with suppress_warnings() as sup:
            sup.filter(UserWarning,
                       "One of the clusters is empty. Re-run kmean with a different initialization")
            kmeans(data, initk)
            kmeans2(data, initk, missing='warn')

        assert_raises(ClusterError, kmeans2, data, initk, missing='raise')
Example #12
def cluster_points(coord_points, N):
    """
    Function that returns k which is an nx2 matrix of lon-lat vector columns
    containing the optimal cluster centroid spacings within a large set of random
    numbers e.g. those produced by the many_points() function above!
    """
    return kmeans(coord_points, N)[0]
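A quick usage sketch; random lon-lat pairs stand in for the many_points() output mentioned in the docstring, which is not shown here:

import numpy as np

coord_points = np.random.uniform(low=[-10.0, 50.0], high=[-5.0, 55.0], size=(500, 2))
centroids = cluster_points(coord_points, 4)   # 4 x 2 array of lon-lat cluster centers
print(centroids)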
Example #13
def createdatabase():
	X_train = detectcompute(train1)

	print "Clustering the data with K-means"
	codebook,distortion = kmeans(whiten(X_train),k)
	print "Done.\n"
	
	imtrain = singledetect(test1)
	Pdatabase = bow(imtrain,codebook,k) #Pseudo database with list structure


	#Writing to html.table
	print "Converting the database into a HTML file"
	htmltable = open("table.htm","r+") 
	begin = "<htm><body><table cellpadding=5><tr><th>Filename</th><th>Histogram</th></tr>"
	htmltable.write(begin)

	for i in range(len(Pdatabase)):
	    middle = "<tr><td>%(filename)s</td><td>%(histogram)s</td></tr>" % {"filename": Pdatabase[i][0], "histogram": Pdatabase[i][-1]}
	    htmltable.write(middle)

	end = "</table></body></html>"    
	htmltable.write(end)
	htmltable.close()
	print "Done.\n"

	codebook_to_file(codebook)
Example #14
def kmeans(features, projection, ite = 50, k = 4, threshold = 1e-5):    
    """ perform k_keamns clustering and return a the result as a subsapce clustering object """
    from scipy.cluster.vq import kmeans, vq
    import datetime

    from measures import spatial_coherence    
   
    centroids, distance = kmeans(features, k, iter=ite, thresh=threshold)
    code, _ = vq(features, centroids)
    
    run_ = datetime.datetime.now().strftime("%y_%m_%d_%H_%M")
    
    params = "projection_size=%d, k=%d" %(len(projection), k)
    clusters = clusters_from_code(code, k, projection)
  
    clustering_id = "(%s)_(%s)_(%s)_(%s)" %("exhaustive_kmeans", params, run_, projection)
    #print clustering_id
    km_clt = KMClustering(algorithm ="exhaustive_kmeans", parameters = params, run = run_,
                          clustering_id = clustering_id, clusters = clusters, ccontains_noise = False, cclustering_on_dimension = True)

   
    measures = {'spatial_coherence': spatial_coherence(km_clt, len(features))[0], 'distortion': distance}
    km_clt.update_measures(measures)
    
    return  km_clt 
def run_kmeans(whitened, k=3):
    book = list()
    for i in range(k):
        book.append(whitened[i])
    codebook, distortion = kmeans(whitened, array(book))
    # codebook, distortion = kmeans(whitened, k)
    return codebook
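Usage sketch for run_kmeans(): it expects already-whitened data and seeds the codebook with the first k rows (assuming the function's own array/kmeans imports are in scope):

import numpy as np
from scipy.cluster.vq import whiten

data = np.random.rand(200, 2)
codebook = run_kmeans(whiten(data), k=3)
print(codebook)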
Example #16
 def worldplot(self,kmeans=None,proj='merc'):
     """
     plots customer GPS location on a map with state and national boundaries.
     IN
         kmeans (int) number of means for k-means clustering, default=None
         proj (string) the map projection to use, use 'robin' to plot the whole earth, default='merc'
     """
     # create a matplotlib Basemap object
     if proj == 'robin':
         my_map = Basemap(projection=proj,lat_0=0,lon_0=0,resolution='l',area_thresh=1000)
     else:
         my_map = Basemap(projection=proj,lat_0=33.,lon_0=-125.,resolution='l',area_thresh=1000.,
                 llcrnrlon=-130.,llcrnrlat=25,urcrnrlon=-65., urcrnrlat=50)
     my_map.drawcoastlines(color='grey')
     my_map.drawcountries(color='grey')
     my_map.drawstates(color='grey')
     my_map.drawlsmask(land_color='white',ocean_color='white')
     my_map.drawmapboundary() #my_map.fillcontinents(color='black')
     x,y = my_map(np.array(self.data['lon']),np.array(self.data['lat']))
     my_map.plot(x,y,'ro',markersize=3,alpha=.4,linewidth=0)
     if kmeans:
         # k-means clustering algorithm---see wikipedia for details
         data_in = self.data.drop(['id','clv','level'],axis=1)
         # vq is scipy's vector quantization module
         output,distortion = vq.kmeans(data_in,kmeans)
         x1,y1 = my_map(output[:,1],output[:,0])
         my_map.plot(x1,y1,'ko',markersize=20,alpha=.4,linewidth=0)
     plt.show()
     return output
def classify_kmeans(infile, clusternumber):
    '''
    apply kmeans
    '''
    
    #Load infile in data array    
    driver = gdal.GetDriverByName('GTiff')
    driver.Register()
    ds = gdal.Open(infile, gdal.GA_Update)
    databand = ds.GetRasterBand(1)
    
    #Read input raster into array
    data = ds.ReadAsArray() 
    #replace no data value with numpy.nan
    #data[data==-999.0]=numpy.nan 
    
    pixel = numpy.reshape(data,(data.shape[0]*data.shape[1]))
    centroids, variance = kmeans(pixel, clusternumber)
    code, distance = vq(pixel,centroids)
    centers_idx = numpy.reshape(code,(data.shape[0],data.shape[1]))
    clustered = centroids[centers_idx]
    
    # Write outraster to file
    databand.WriteArray(clustered)
    databand.FlushCache()        
    
    #Close file
    databand = None
    clustered = None
    ds = None  
Example #18
def kmeans(iData, clustNumber, oPrefix, norm=False):
    '''Perform k-means cluster analysis and return MAP of zones'''
    print 'Run K-Means'
    
    height, width = iData.shape[1:3]
    #reshape 3D cube of data into 2D matrix and get indices of valid pixels
    iData, notNanDataI = cube2flat(iData)
    if norm:
        #center and norm
        iDataMean = iData[:, notNanDataI].mean(axis=1)
        iDataStd  = iData[:, notNanDataI].std(axis=1)
        iData = np.subtract(iData.T, iDataMean).T
        iData = np.divide(iData.T, iDataStd).T

    #perform kmeans on valid data and return codebook
    codeBook = vq.kmeans(iData[:, notNanDataI].astype('f8').T, clustNumber)[0]
    #perform vector quantization of the input data using the codebook
    #return vector of labels (for each valid pixel)
    labelVec = vq.vq(iData[:, notNanDataI].astype('f8').T, codeBook)[0]+1
    #create and fill MAP of zones
    zoneMap = np.zeros(width*height) + np.nan
    zoneMap[notNanDataI] = labelVec
    zoneMap = zoneMap.reshape(height, width)
    
    #visualize map of zones
    plt.imsave(oPrefix + 'zones.png', zoneMap)
    
    return zoneMap
Example #19
    def test_large_features(self):
        # Generate a data set with large values, and run kmeans on it
        # (regression test for gh-1077).
        d = 300
        n = 100

        m1 = np.random.randn(d)
        m2 = np.random.randn(d)
        x = 10000 * np.random.randn(n, d) - 20000 * m1
        y = 10000 * np.random.randn(n, d) + 20000 * m2

        data = np.empty((x.shape[0] + y.shape[0], d), np.double)
        data[:x.shape[0]] = x
        data[x.shape[0]:] = y

        kmeans(data, 2)
Example #20
def read_unclustered_data(filename, num_clusters, cl_type="kMeans"):
    """Return dictionary of cluster id to array of points.

    Given a filename in the format of lat, lng
    generate k clusters based on arguments. Outputs a dictionary with
    the cluster id as the key mapped to a list of lat, lng pts
    """
    request_points = []
    with open(filename, 'rb') as input_file:
        input_file.next()  # Skip the header row
        for line in input_file:
            lat, lng = line.split(',')
            request_points.append((float(lat), float(lng)))
    request_points = array(request_points)

    if cl_type == "kMeans":
        # computing K-Means with K = num_clusters
        centroids, _ = kmeans(request_points, int(num_clusters))
        # assign each sample to a cluster
        idx, _ = vq(request_points, centroids)

    else:
        # computing k-medoids using the distance matrix
        centroids = get_kmedoids(request_points, int(num_clusters))
        # assign each sample to a cluster
        idx, _ = vq(request_points, centroids)

    # map cluster lat, lng to cluster index
    cluster_points = defaultdict(list)
    for i in xrange(len(request_points)):
        lat, lng = request_points[i]
        cluster_points[idx[i]].append((lat, lng))
    return cluster_points
def getPupilThresholdWithClustering(gray,K=2, distanceWeight=2, resizeTo=(40,40)):
    ''' Detects the pupil in the image, gray, using k-means
        gray            : gray scale image
        K               : Number of clusters
        distanceWeight  : Defines the weight of the position parameters
        resizeTo        : the size of the image to do k-means on
    '''
    
    smallI = cv2.resize(gray, resizeTo)

    M,N = smallI.shape
    #Generate coordinates in a matrix
    X,Y = np.meshgrid(range(M),range(N))

    #Make the coordinates and intensity into vectors
    z = smallI.flatten()
    x = X.flatten()
    y = Y.flatten()

    # make a feature vector containing (x, y, intensity)
    features = np.zeros((len(x),3))
    features[:,0] = z
    features[:,1] = y/distanceWeight #Divide so that the distance of position weighs less than intensity
    features[:,2] = x/distanceWeight
    features = np.array(features,'f')

    # cluster data
    centroids,variance = vq.kmeans(features,K)

    plotClusters(centroids, features, M, N)

    centroidsByPupilCandidacy = sorted(centroids, key = lambda c: evaluateCentroid(c, resizeTo))
    
    return centroidsByPupilCandidacy[-1][0] + 10
def performMCCAlgorithm(dataSet, specificDataPointIndex, numIterations = 200, numClusters = 4, subDataRatio = 0.5):
	periodsAhead = np.array([1, 2, 3, 4, 5, 6, 9, 12, 18, 24, 36, 60, 120])
	strippedDataSet = dataSet
	dataLength = strippedDataSet.shape[0]
	dataWidth = strippedDataSet.shape[1]
	specificDataPoint = strippedDataSet[specificDataPointIndex,:]

	numPeriods = len(periodsAhead)

	statisticWeightsbyIteration = np.empty(shape=(numIterations, 4),dtype=float)

	# Perform Bootstrapped Clustering
	for i in range(0,numIterations):
		# Perform Bootstrapped Clustering / Choose Data Subset
		subDataSetIndexes = np.random.choice(range(0,dataLength),size=int(dataLength*subDataRatio),replace=True)
		subDataSet = strippedDataSet[subDataSetIndexes,:]
		# Perform Bootstrapped Clustering / Find Data Clusters for Subset of Data
		kMeansClusters = spc.kmeans(subDataSet, numClusters)
		clusterCenters = kMeansClusters[0]
		# Perform Bootstrapped Clustering / Record Clustering Cost for Weighting Scheme
		clusteringCost = kMeansClusters[1]
		statisticWeightsbyIteration[i,0] = clusteringCost
		# Perform Bootstrapped Clustering / Apply Found Data Clusters to All Data
		allClusters = spc.vq(strippedDataSet, clusterCenters)
		clusterAssignments = allClusters[0]
		clusterDistortions = allClusters[1]
		display = 1 #TEST
		if display: #TEST
			plt.scatter(dataSet[0:60,0],dataSet[0:60,1],c=clusterAssignments[0:60]) #TEST	
			plt.show()	
		statisticWeightsbyIteration[i,1] = max(clusterDistortions)
		statisticWeightsbyIteration[i,2] = np.mean(clusterDistortions)
		statisticWeightsbyIteration[i,3] = np.std(clusterDistortions)
	return statisticWeightsbyIteration
Example #23
def create_code_book(input_filename, num_clusters, num_observations,
                     feature_list=utils.QUIVER_FEATURES):
    """Create a code book from a cmp.h5 file.
    
    Args:
        input_filename: path to the SAM or cmp.h5 file
        num_clusters: the number of codes to create in the code book
        num_observations: the number of bases to use to create the code book
            clusters
        feature_list: the list of features to read from the cmp.h5 to 
            cluster

    Returns:
        code_book: a numpy array of cluster centers. rows are codes, columns are
            features
        feature_list: labels for the columns of the code book
    """

    log.debug("Checking for missing features...")
    
    if input_filename.endswith(".cmp.h5"):
        training_array = read_cmph5(input_filename, feature_list, num_observations)
    elif input_filename.endswith(".sam") or input_filename.endswith(".bam"):
        training_array = read_sam(input_filename, feature_list, num_observations)
    else:
        raise RuntimeError, "Input file must be SAM, BAM, or cmp.h5"


    clusterable_array, std_dev = make_data_clusterable(training_array,
                                                       feature_list)
    code_book, distortion = vq.kmeans(clusterable_array, num_clusters)

    raw_code_book = convert_to_raw(code_book, feature_list, std_dev)
    return raw_code_book, feature_list
Example #24
  def train(self, featurefiles, k=100, subsampling=10):
    """Train a vocabulary from features in files listed in |featurefiles| using
    k-means with k words. Subsampling of training data can be used for speedup.
    """
    image_count = len(featurefiles)

    descr = []
    descr.append(sift.read_features_from_file(featurefiles[0])[1])
    descriptors = descr[0]  # Stack features for k-means.
    for i in numpy.arange(1, image_count):
      descr.append(sift.read_features_from_file(featurefiles[i])[1])
      descriptors = numpy.vstack((descriptors, descr[i]))

    # Run k-means.
    self.voc, distortion = vq.kmeans(descriptors[::subsampling, :], k, 1)
    self.word_count = self.voc.shape[0]

    # Project training data on vocabulary.
    imwords = numpy.zeros((image_count, self.word_count))
    for i in range(image_count):
      imwords[i] = self.project(descr[i])

    occurence_count = numpy.sum((imwords > 0)*1, axis=0)
    
    self.idf = numpy.log(image_count / (occurence_count + 1.0))
    self.trainingdata = featurefiles
Example #25
def connected_regions(image):
    """
    Converts image into grayscale, quantizes, counts connected regions
    """
    # render_image(image)

    colors = 2

    # Quantization into two colors
    image_rgb = np.dstack(image)
    pixels = np.reshape(
        image_rgb,
        (image_rgb.shape[0] * image_rgb.shape[1], image_rgb.shape[2])
    )
    centroids, _ = vq.kmeans(pixels, colors)
    quantized, _ = vq.vq(pixels, centroids)
    quantized_idx = quantized.reshape(
        (image_rgb.shape[0], image_rgb.shape[1])
    )

    if len(centroids) > 1:
        # for_render = (quantized_idx * 255).astype(np.uint8)
        # render_image(for_render)
        regions = len(region_sizes(quantized_idx))
        regions_inverted = len(region_sizes(1 - quantized_idx))
        # import pdb; pdb.set_trace()

        # if regions == 0:
        #     regions = image[0].shape[0] * image[0].shape[1]
        # print regions
        return max([regions, regions_inverted])
    else:
        return 0
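The quantization step on its own, as a sketch with a synthetic RGB array in place of the channel stack used above:

import numpy as np
import scipy.cluster.vq as vq

rgb = np.random.rand(64, 64, 3)                   # synthetic image, values in [0, 1]
pixels = rgb.reshape(-1, 3)                       # one row per pixel
centroids, _ = vq.kmeans(pixels, 2)               # two dominant colors
quantized, _ = vq.vq(pixels, centroids)           # per-pixel color index
quantized_idx = quantized.reshape(rgb.shape[:2])  # back to image shape
print(np.bincount(quantized))                     # pixel count per color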
Example #26
def main():
    args = get_args()
    # This catches files sent in with stdin
    if isinstance(args.infile, TextIOWrapper):
        data = JSONFile(args.infile, True)
    else:
        data = args.infile

    points = np.array([
        [point.get('lon'), point.get('lat')]
        for point in data
    ])

    # In testing, a higher number of iterations led to fewer errors
    # due to missing centroids (note: whitening led to worse results)
    centroids, distortion = kmeans(points, args.number_of_vans, 2000)
    index, distortion = vq(points, centroids)

    vans = [[] for _ in range(args.number_of_vans)]

    for i, point in enumerate(data):
        vans[index[i]].append(point)

    vans = distribute(vans, len(data), centroids)


    create_output(args.outfile, vans)
def build_cluster(image, featureValue, K):
    img = cv2.imread(image)
    fast = cv2.FastFeatureDetector(featureValue)
    orb = cv2.ORB(180)
    kp = fast.detect(img,None)
    kp, des = orb.compute(img, kp)

    # build keypoints location array for cluster
    locations = np.empty((len(kp),2))
    for i in range(len(kp)):
        loc = array((int(kp[i].pt[0]), int(kp[i].pt[1])))
        locations[i]=loc

    kcenters, distortion  = kmeans(locations, K)
    kcenters = kcenters[kcenters[:,0].argsort()]

    # cluster index: 0: left eye, 1 mouth and nose, 2: right eye
    kpCluster = {i: [] for i in range(K)}
    clusterLoc = {i: [] for i in range(K)}
    for i in range(len(kp)):
        set = 0
        minDis = sys.maxint
        for j in range(K):
            dis = euclidean(locations[i], kcenters[j])
            if dis<minDis:
                set = j
                minDis = dis
        kpCluster[set].append(kp[i])
        clusterLoc[set].append(locations[i])

    imageFeature = [len(kp)]
    for i in range(K):
        clusterFeature = cluster_feature(clusterLoc[i], kcenters[i])
        imageFeature = imageFeature + clusterFeature
    return imageFeature
Example #28
def spectral_clustering(W, k):

    # ====================== ADD YOUR CODE HERE ======================
    # Instructions: Perform spectral clustering to partition the
    #               data into k clusters. Implement the steps that
    #               are described in Algorithm 2 on the assignment.

    L = diag(sum(W, axis=0)) - W
    w, v = linalg.eig(L)

    y = real(v[:, w.argsort()[:k]])

    clusters, _ = kmeans(y, k)

    labels = zeros(y.shape[0])
    for i in range(y.shape[0]):
        dist = inf
        for j in range(k):
            distance = euclideanDistance(y[i], clusters[j])
            if distance < dist:
                dist = distance
                labels[i] = j
    # =============================================================

    return labels
Example #29
def main():    
    gdal.AllRegister()
    infile = auxil.select_infile() 
    if infile:                  
        inDataset = gdal.Open(infile,GA_ReadOnly)     
        cols = inDataset.RasterXSize
        rows = inDataset.RasterYSize    
        bands = inDataset.RasterCount
    else:
        return    
    pos =  auxil.select_pos(bands)
    bands = len(pos)    
    x0,y0,rows,cols=auxil.select_dims([0,0,rows,cols])   
    K = auxil.select_integer(6,msg='Number clusters')        
    G = zeros((rows*cols,len(pos))) 
    k = 0                                   
    for b in pos:
        band = inDataset.GetRasterBand(b)
        G[:,k] = band.ReadAsArray(x0,y0,cols,rows)\
                              .astype(float).ravel()
        k += 1        
    centers, _ = kmeans(G,K)
    labels, _ = vq(G,centers)      
    outfile,fmt = auxil.select_outfilefmt() 
    if outfile:
        driver = gdal.GetDriverByName(fmt)   
        outDataset = driver.Create(outfile,
                        cols,rows,1,GDT_Byte)         
        outBand = outDataset.GetRasterBand(1)
        outBand.WriteArray(reshape(labels,(rows,cols))\
                                              ,0,0) 
        outBand.FlushCache() 
        outDataset = None    
    inDataset = None        
Example #30
def kmeans(X, K):
    """
    kmeans to find clusers:
    x: dataset
    k: num of clusters
    #todo: this implements just for 2 cluster, initilization need to re-implement
    """
    ret = {"mean": [], "cov": [], "coff": []}
    kmean_ret = vq.kmeans(X, K)

    ##assign data to cluster to calculate covariance
    data = []

    for i in range(0, K, 1):
        data.append([])

    for i in range(0, X.shape[0], 1):
        # assign each point to its nearest centroid
        min_dis = None
        min_idx = -1
        for j in range(0, K, 1):
            _dis = ((X[i] - kmean_ret[0][j]) ** 2).sum()
            if min_dis is None or _dis < min_dis:
                min_dis = _dis
                min_idx = j
        data[min_idx].append(X[i])

    for i in range(0, K, 1):
        data[i] = np.asarray(data[i])
        ret["cov"].append(np.cov(data[i].transpose()))
        ret["mean"].append(kmean_ret[0][i])
        ret["coff"].append(float(data[i].size / 2) / X.shape[0])

    return ret
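Usage sketch for the wrapper above, on two well-separated synthetic blobs; it assumes the wrapper and its `np`/`vq` imports are in scope:

import numpy as np

X = np.vstack([np.random.randn(100, 2),
               np.random.randn(100, 2) + [6.0, 6.0]])
ret = kmeans(X, 2)      # the wrapper defined above, not scipy's kmeans
print(ret["mean"])      # the two cluster centers
print(ret["coff"])      # fraction of points assigned to each cluster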
from scipy.cluster.vq import kmeans, whiten
from numpy import genfromtxt, zeros
from matplotlib import pyplot as plt

consolidated_data = genfromtxt('../new_consolidated_data.csv',
                               delimiter=',',
                               skip_header=1)

features = consolidated_data[:, 2:-13]

whitened = whiten(features)

k_distortion = zeros(50)

for k in range(1, 51):
    centroids, distortion = kmeans(whitened, k)
    k_distortion[k - 1] = distortion

fig, ax = plt.subplots()
plt.plot(range(1, 51), k_distortion)
plt.xlabel("Number of clusters")
plt.ylabel("Distortion")
plt.savefig("output/states_all_year_elbow.svg")
plt.show()
Example #32
    def testCluster(self):
        print "< testCluster >"
        numVertices = 8
        graph = SparseGraph(GeneralVertexList(numVertices))

        graph.addEdge(0, 1)
        graph.addEdge(0, 2)
        graph.addEdge(1, 2)

        graph.addEdge(3, 4)
        graph.addEdge(3, 5)
        graph.addEdge(4, 5)

        graph.addEdge(0, 3)

        W = graph.getWeightMatrix()

        graphIterator = []
        graphIterator.append(W[0:6, 0:6].copy())
        W[1, 6] += 1
        W[6, 1] += 1
        graphIterator.append(W[0:7, 0:7].copy())
        W[4, 7] += 1
        W[7, 4] += 1
        graphIterator.append(W.copy())
        graphIterator = iter(graphIterator)

        k = 2
        clusterer = NingSpectralClustering(k)
        clustersList = clusterer.cluster(
            toSparseGraphListIterator(graphIterator))

        #Why are the bottom rows of Q still zero?

        #Try example in which only edges change
        numVertices = 7
        graph = SparseGraph(GeneralVertexList(numVertices))

        graph.addEdge(0, 1)
        graph.addEdge(0, 2)
        graph.addEdge(1, 2)

        graph.addEdge(3, 4)

        WList = []
        W = graph.getWeightMatrix()
        WList.append(W[0:5, 0:5].copy())

        graph.addEdge(3, 5)
        graph.addEdge(4, 5)
        W = graph.getWeightMatrix()
        WList.append(W[0:6, 0:6].copy())

        graph.addEdge(0, 6)
        graph.addEdge(1, 6)
        graph.addEdge(2, 6)
        W = graph.getWeightMatrix()
        WList.append(W[0:7, 0:7].copy())

        iterator = iter(WList)
        clustersList = clusterer.cluster(toSparseGraphListIterator(iterator))

        #Seems to work, amazingly
        #print(clustersList)

        #Try removing rows/cols
        W2 = W[0:5, 0:5]
        W3 = W[0:4, 0:4]
        WList = [W, W2, W3]
        iterator = iter(WList)
        clustersList = clusterer.cluster(toSparseGraphListIterator(iterator))

        #nptst.assert_array_equal(clustersList[0][0:5], clustersList[1])
        nptst.assert_array_equal(clustersList[1][0:4], clustersList[2])

        #Make sure 1st clustering (without updates) is correct
        L = GraphUtils.normalisedLaplacianRw(scipy.sparse.csr_matrix(W))
        numpy.random.seed(21)
        lmbda, Q = scipy.sparse.linalg.eigs(L,
                                            min(k, L.shape[0] - 1),
                                            which="SM",
                                            ncv=min(20 * k, L.shape[0]),
                                            v0=numpy.random.rand(L.shape[0]))

        V = VqUtils.whiten(Q)
        centroids, distortion = vq.kmeans(V, k, iter=20)
        clusters, distortion = vq.vq(V, centroids)

        #This should be equal but the eigenvector computation is unstable
        #even with repeated runs (and no way to set the seed)
        nptst.assert_array_equal(clusters, clustersList[0])
Example #33
def kmean_anchors(path='../data/coco128.yaml', n=9, img_size=640, thr=4.0, gen=1000, verbose=True):
    """ Creates kmeans-evolved anchors from training dataset

        Arguments:
            path: path to dataset *.yaml, or a loaded dataset
            n: number of anchors
            img_size: image size used for training
            thr: anchor-label wh ratio threshold hyperparameter hyp['anchor_t'] used for training, default=4.0
            gen: generations to evolve anchors using genetic algorithm
            verbose: print all results

        Return:
            k: kmeans evolved anchors

        Usage:
            from utils.autoanchor import *; _ = kmean_anchors()
    """
    thr = 1. / thr

    def metric(k, wh):  # compute metrics
        r = wh[:, None] / k[None]
        x = torch.min(r, 1. / r).min(2)[0]  # ratio metric
        # x = wh_iou(wh, torch.tensor(k))  # iou metric
        return x, x.max(1)[0]  # x, best_x

    def anchor_fitness(k):  # mutation fitness
        _, best = metric(torch.tensor(k, dtype=torch.float32), wh)
        return (best * (best > thr).float()).mean()  # fitness

    def print_results(k):
        k = k[np.argsort(k.prod(1))]  # sort small to large
        x, best = metric(k, wh0)
        bpr, aat = (best > thr).float().mean(), (x > thr).float().mean() * n  # best possible recall, anch > thr
        print('thr=%.2f: %.4f best possible recall, %.2f anchors past thr' % (thr, bpr, aat))
        print('n=%g, img_size=%s, metric_all=%.3f/%.3f-mean/best, past_thr=%.3f-mean: ' %
              (n, img_size, x.mean(), best.mean(), x[x > thr].mean()), end='')
        for i, x in enumerate(k):
            print('%i,%i' % (round(x[0]), round(x[1])), end=',  ' if i < len(k) - 1 else '\n')  # use in *.cfg
        return k

    if isinstance(path, str):  # *.yaml file
        with open(path) as f:
            data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
        from utils.datasets import LoadImagesAndLabels
        dataset = LoadImagesAndLabels(data_dict['train'], augment=True, rect=True)
    else:
        dataset = path  # dataset

    # Get label wh
    shapes = img_size * dataset.shapes / dataset.shapes.max(1, keepdims=True)
    wh0 = np.concatenate([l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)])  # wh

    # Filter
    i = (wh0 < 3.0).any(1).sum()
    if i:
        print('WARNING: Extremely small objects found. '
              '%g of %g labels are < 3 pixels in width or height.' % (i, len(wh0)))
    wh = wh0[(wh0 >= 2.0).any(1)]  # filter > 2 pixels
    # wh = wh * (np.random.rand(wh.shape[0], 1) * 0.9 + 0.1)  # multiply by random scale 0-1

    # Kmeans calculation
    print('Running kmeans for %g anchors on %g points...' % (n, len(wh)))
    s = wh.std(0)  # sigmas for whitening
    k, dist = kmeans(wh / s, n, iter=30)  # points, mean distance
    k *= s
    wh = torch.tensor(wh, dtype=torch.float32)  # filtered
    wh0 = torch.tensor(wh0, dtype=torch.float32)  # unfiltered
    k = print_results(k)

    # Evolve
    npr = np.random
    f, sh, mp, s = anchor_fitness(k), k.shape, 0.9, 0.1  # fitness, generations, mutation prob, sigma
    pbar = tqdm(range(gen), desc='Evolving anchors with Genetic Algorithm')  # progress bar
    for _ in pbar:
        v = np.ones(sh)
        while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
        kg = (k.copy() * v).clip(min=2.0)
        fg = anchor_fitness(kg)
        if fg > f:
            f, k = fg, kg.copy()
            pbar.desc = 'Evolving anchors with Genetic Algorithm: fitness = %.4f' % f
            if verbose:
                print_results(k)

    return print_results(k)
Example #34
def apply_kmeans(box_dict, k):
    # for every object class in the box_dict
    #     reduce the list of boxes to the clustered boxes with kmeans
    # return the new dictionary
    kmeans_dict = dict()
    for obj_class in box_dict:
        print obj_class
        boxes = box_dict[obj_class]
        if len(boxes) > k:
            # write a representation for each proposal box as a vector
            def box_to_vec(pbox):
                # list of metrics which we want to reduce the Euclidean distance of:
                # includes centroid, and each of the individual coordinates of the box,
                # which are used to recover box coordinates after the k-means in vector representation
                # are found. To weight the impact of the centroid measure,
                # we multiply by 1/area: the centroid matters less as box area increases.
                # we also include the coordinates, since distances between them are relevant as well.
                # Note that including the original coordinates in the vector allows us to recover the
                # original representation of the box.
                # we also include the score (scaled down) for the same reason. We scale it down since score-space
                # should not really affect the distance between boxes (having similar scores is not necessarily a good reason
                # to combine or not)
                metrics = [
                    pbox.centroid()[0],
                    pbox.centroid()[1],
                    pbox.centroid()[0] / pbox.area(),
                    pbox.centroid()[1] / pbox.area(), pbox.x1, pbox.y1,
                    pbox.x2, pbox.y2, 0.00001 * pbox.score
                ]
                return metrics

            # we will append the columns together and then take transpose
            # so that each row is a box with n features (here n = 9)
            first_col = box_to_vec(boxes[0])
            # for rescaling
            oldx1, oldy1, oldx2, oldy2, oldscore = first_col[4], first_col[
                5], first_col[6], first_col[7], first_col[8]
            first_col = np.array(first_col)
            first_col = first_col.T
            box_mat = first_col
            for i in range(1, len(boxes)):
                new_col = np.array(box_to_vec(boxes[i]))
                new_col = new_col.T
                box_mat = np.c_[box_mat, new_col]
            box_mat = box_mat.T
            box_mat = box_mat.astype('float')
            # whiten
            box_mat = whiten(box_mat)
            # need to rescale the coords when we recover the boxes from the representation vectors
            newx1, newy1, newx2, newy2, newscore = 0, 0, 0, 0, 0
            if len(np.shape(box_mat)) > 1:
                newx1, newy1, newx2, newy2, newscore = box_mat[0][4], box_mat[
                    0][5], box_mat[0][6], box_mat[0][7], box_mat[0][8]
            else:
                newx1, newy1, newx2, newy2, newscore = box_mat[4], box_mat[
                    5], box_mat[6], box_mat[7], box_mat[8]
            scalex1, scaley1, scalex2, scaley2, scalescore = oldx1 / (
                0. + newx1), oldy1 / (0. + newy1), oldx2 / (
                    0. + newx2), oldy2 / (0. + newy2), oldscore / (0. +
                                                                   newscore)
            # use k-means
            codebook, distortion = kmeans(box_mat, k)
            centroid_boxes = []
            for i in range(np.shape(codebook)[0]):
                # we chop off from 4 onwards because these are (pbox.x1, pbox.y1, pbox.x2, pbox.y2, pbox.score)
                # this is a direct inverse from box_to_vec
                # need to multiply these coords by standard deviations across all instances of feature.
                thebox = box(scalex1 * codebook[i][4],
                             scaley1 * codebook[i][5],
                             scalex2 * codebook[i][6],
                             scaley2 * codebook[i][7],
                             scalescore * codebook[i][8])
                centroid_boxes.append(thebox)
            print "# of centroids: " + str(len(centroid_boxes))
            print centroid_boxes[0]
            print centroid_boxes[1]
            print centroid_boxes[2]
            if obj_class not in kmeans_dict:
                kmeans_dict[obj_class] = []
            kmeans_dict[obj_class] = centroid_boxes
        else:
            kmeans_dict[obj_class] = box_dict[obj_class]
        print "==================================="
    return kmeans_dict
Example #35
def kmean_anchors(path='./data/coco128.yaml', n=9, img_size=640, thr=4.0, gen=1000, verbose=True):
    """ Creates kmeans-evolved anchors from training dataset

        Arguments:
            path: path to dataset *.yaml, or a loaded dataset
            n: number of anchors
            img_size: image size used for training
            thr: anchor-label wh ratio threshold hyperparameter hyp['anchor_t'] used for training, default=4.0
            gen: generations to evolve anchors using genetic algorithm

        Return:
            k: kmeans evolved anchors

        Usage:
            from utils.utils import *; _ = kmean_anchors()
    """
    thr = 1. / thr

    def metric(k):  # compute metrics
        r = wh[:, None] / k[None]
        x = torch.min(r, 1. / r).min(2)[0]  # ratio metric
        # x = wh_iou(wh, torch.tensor(k))  # iou metric
        return x, x.max(1)[0]  # x, best_x

    def fitness(k):  # mutation fitness
        _, best = metric(k)
        return (best * (best > thr).float()).mean()  # fitness

    def print_results(k):
        k = k[np.argsort(k.prod(1))]  # sort small to large
        x, best = metric(k)
        bpr, aat = (best > thr).float().mean(), (x > thr).float().mean() * n  # best possible recall, anch > thr
        print('thr=%.2f: %.3f best possible recall, %.2f anchors past thr' % (thr, bpr, aat))
        print('n=%g, img_size=%s, metric_all=%.3f/%.3f-mean/best, past_thr=%.3f-mean: ' %
              (n, img_size, x.mean(), best.mean(), x[x > thr].mean()), end='')
        for i, x in enumerate(k):
            print('%i,%i' % (round(x[0]), round(x[1])), end=',  ' if i < len(k) - 1 else '\n')  # use in *.cfg
        return k

    if isinstance(path, str):  # *.yaml file
        with open(path) as f:
            data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
        from utils.datasets import LoadImagesAndLabels
        dataset = LoadImagesAndLabels(data_dict['train'], augment=True, rect=True)
    else:
        dataset = path  # dataset

    # Get label wh
    shapes = img_size * dataset.shapes / dataset.shapes.max(1, keepdims=True)
    wh = torch.tensor(np.concatenate([l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)])).float()  # wh
    wh = wh[(wh > 2.0).all(1)].numpy()  # filter > 2 pixels

    # Kmeans calculation
    from scipy.cluster.vq import kmeans
    print('Running kmeans for %g anchors on %g points...' % (n, len(wh)))
    s = wh.std(0)  # sigmas for whitening
    k, dist = kmeans(wh / s, n, iter=30)  # points, mean distance
    k *= s
    wh = torch.tensor(wh)
    k = print_results(k)

    # Plot
    # k, d = [None] * 20, [None] * 20
    # for i in tqdm(range(1, 21)):
    #     k[i-1], d[i-1] = kmeans(wh / s, i)  # points, mean distance
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7))
    # ax = ax.ravel()
    # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.')
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7))  # plot wh
    # ax[0].hist(wh[wh[:, 0]<100, 0],400)
    # ax[1].hist(wh[wh[:, 1]<100, 1],400)
    # fig.tight_layout()
    # fig.savefig('wh.png', dpi=200)

    # Evolve
    npr = np.random
    f, sh, mp, s = fitness(k), k.shape, 0.9, 0.1  # fitness, generations, mutation prob, sigma
    for _ in tqdm(range(gen), desc='Evolving anchors with Genetic Algorithm:'):
        v = np.ones(sh)
        while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
        kg = (k.copy() * v).clip(min=2.0)
        fg = fitness(kg)
        if fg > f:
            f, k = fg, kg.copy()
            if verbose:
                print_results(k)
    k = print_results(k)
    return k
Example #36
File: main.py  Project: mappp7/tools
    def initialize(self,
                   poses,
                   rest_pose,
                   num_bones,
                   iterations,
                   mayaMesh=None,
                   jointList=None):

        bones = []
        num_verts = rest_pose.shape[0]  # number of vertices in the rest pose
        num_poses = poses.shape[0]

        bone_transforms = np.empty(
            (num_bones, num_poses, 4,
             3))  # [(R, T) for for each pose] for each bone
        # 3rd dim has 3 rows for R and 1 row for T

        # Use k-means to assign bones to vertices
        whitened = whiten(rest_pose)
        codebook, _ = kmeans(whitened, num_bones)
        rest_pose_corrected = np.empty(
            (num_bones, num_verts,
             3))  # Rest pose - mean of vertices attached to each bone

        # confirm mode
        if mayaMesh:
            #rigid Skin
            vert_assignments, bones = self.manual_codebook(mayaMesh, jointList)
            boneArray = []
            for i in bones:
                boneArray.append(cmds.xform(i, q=1, t=1, ws=1))

            self.rest_bones_t = np.array(boneArray)
            #rest_bones_t = np.empty((num_bones , 3))

            for bone in range(num_bones):
                #rest_bones_t[bone] = np.mean(rest_pose[vert_assignments == bone] , axis = 0)
                self.rest_bones_t[bone] = np.array(boneArray[bone])
                rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone]

                for pose in range(num_poses):

                    bone_transforms[bone, pose] = self.kabsch(
                        rest_pose_corrected[bone, vert_assignments == bone],
                        poses[pose, vert_assignments == bone])

        else:
            # Compute initial random bone transformations

            vert_assignments, _ = vq(
                whitened,
                codebook)  # Bone assignment for each vertex (|num_verts| x 1)
            self.rest_bones_t = np.empty(
                (num_bones, 3))  # Translations for bones at rest pose

            for bone in range(num_bones):
                self.rest_bones_t[bone] = np.mean(
                    rest_pose[vert_assignments == bone], axis=0)
                rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone]

                for pose in range(num_poses):

                    bone_transforms[bone, pose] = self.kabsch(
                        rest_pose_corrected[bone, vert_assignments == bone],
                        poses[pose, vert_assignments == bone])

        for it in range(iterations):

            # Re-assign bones to vertices using smallest reconstruction error from all poses
            constructed = np.empty(
                (num_bones, num_poses, num_verts,
                 3))  # |num_bones| x |num_poses| x |num_verts| x 3
            for bone in range(num_bones):
                Rp = bone_transforms[bone, :, :3, :].dot(
                    (rest_pose - self.rest_bones_t[bone]).T).transpose(
                        (0, 2, 1))  # |num_poses| x |num_verts| x 3
                # R * p + T
                constructed[bone] = Rp + bone_transforms[bone, :, np.newaxis,
                                                         3, :]
            errs = np.linalg.norm(constructed - poses,
                                  axis=(1, 3))  # position value average

            vert_assignments = np.argmin(errs, axis=0)

            # For each bone, for each pose, compute new transform using kabsch
            for bone in range(num_bones):

                self.rest_bones_t[bone] = np.mean(
                    rest_pose[vert_assignments == bone], axis=0)

                rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone]

                for pose in range(num_poses):
                    P = rest_pose_corrected[bone, vert_assignments == bone]
                    Q = poses[pose, vert_assignments == bone]

                    if (P.size == 0 or Q.size == 0):
                        print 'Skip Iteration'
                    else:

                        bone_transforms[bone, pose] = self.kabsch(P, Q)

        # jointList is correct Index Joint

        return bone_transforms, self.rest_bones_t, bones
Example #37
engine = sqlalchemy.create_engine(
    'sqlite:///c:/RBSA/year1/RBSA_METER_DATA_1/RBSA_METER_DATA_1.sqlite')
dict = {'meter_min_cluster': [], 'meter_max_cluster': []}
for siteid in pd.read_sql_query("SELECT DISTINCT siteid FROM RBSA_METER_DATA",
                                engine).values:
    siteid = int(siteid[0])
    df = pd.read_sql_query(
        "SELECT siteid, time, IDT from RBSA_METER_DATA WHERE siteid='{}'".
        format(siteid), engine)
    df['siteid'] = df['siteid'].astype('int')
    df = df.set_index('siteid')
    df = df.dropna()
    df['month'] = df['time'].apply(lambda x: x[2:5])
    df = df.loc[df['month'].isin(['DEC', 'JAN', 'FEB'])]
    if pd.isnull(df['IDT']).all():
        continue
    codebook, _ = kmeans(np.array(df['IDT']), 2)  # two clusters
    dict['meter_min_cluster'].append(min(codebook))
    dict['meter_max_cluster'].append(max(codebook))

df_meter = pd.DataFrame.from_dict(dict)

# audit
engine = sqlalchemy.create_engine(
    'sqlite:///c:/OpenStudio-ResStock/OpenStudio-ResStock/data/rbsa/rbsa.sqlite'
)
df = pd.read_sql_query(
    "SELECT siteid, ResInt_HeatTemp, ResInt_HeatTempNight from SF_ri_heu",
    engine)
resint_heattemp = df['ResInt_HeatTemp']
resint_heattemp = resint_heattemp[resint_heattemp > 0].dropna()
resint_heattempnight = df['ResInt_HeatTempNight']
Example #38
def kmean_anchors(path='data/DsiacPlusF2.txt', n=9, img_size=(416, 416)):
    # from utils.utils import *; _ = kmean_anchors()
    # Produces a list of target kmeans suitable for use in *.cfg files
    from utils.datasets import LoadImagesAndLabels
    thr = 0.20  # IoU threshold

    def print_results(thr, wh, k):
        k = k[np.argsort(k.prod(1))]  # sort small to large
        iou = wh_iou(torch.Tensor(wh), torch.Tensor(k))
        max_iou, min_iou = iou.max(1)[0], iou.min(1)[0]
        bpr, aat = (max_iou > thr).float().mean(), (iou > thr).float().mean() * n  # best possible recall, anch > thr
        print('%.2f iou_thr: %.3f best possible recall, %.2f anchors > thr' % (thr, bpr, aat))
        print('kmeans anchors (n=%g, img_size=%s, IoU=%.3f/%.3f/%.3f-min/mean/best): ' %
              (n, img_size, min_iou.mean(), iou.mean(), max_iou.mean()), end='')
        for i, x in enumerate(k):
            print('%i,%i' % (round(x[0]), round(x[1])), end=',  ' if i < len(k) - 1 else '\n')  # use in *.cfg
        return k

    def fitness(thr, wh, k):  # mutation fitness
        iou = wh_iou(wh, torch.Tensor(k)).max(1)[0]  # max iou
        bpr = (iou > thr).float().mean()  # best possible recall
        return iou.mean() * bpr  # product

    # Get label wh
    wh = []
    dataset = LoadImagesAndLabels(path, augment=True, rect=True, cache_labels=True)
    nr = 1 if img_size[0] == img_size[1] else 10  # number augmentation repetitions
    for s, l in zip(dataset.shapes, dataset.labels):
        wh.append(l[:, 3:5] * (s / s.max()))  # image normalized to letterbox normalized wh
    wh = np.concatenate(wh, 0).repeat(nr, axis=0)  # augment 10x
    wh *= np.random.uniform(img_size[0], img_size[1], size=(wh.shape[0], 1))  # normalized to pixels (multi-scale)

    # Darknet yolov3.cfg anchors
    use_darknet = False
    if use_darknet:
        k = np.array([[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]])
    else:
        # Kmeans calculation
        from scipy.cluster.vq import kmeans
        print('Running kmeans for %g anchors on %g points...' % (n, len(wh)))
        s = wh.std(0)  # sigmas for whitening
        k, dist = kmeans(wh / s, n, iter=30)  # points, mean distance
        k *= s
    k = print_results(thr, wh, k)

    # # Plot
    # k, d = [None] * 20, [None] * 20
    # for i in tqdm(range(1, 21)):
    #     k[i-1], d[i-1] = kmeans(wh / s, i)  # points, mean distance
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7))
    # ax = ax.ravel()
    # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.')

    # Evolve
    npr = np.random
    wh = torch.Tensor(wh)
    f, sh, ng, mp, s = fitness(thr, wh, k), k.shape, 1000, 0.9, 0.1  # fitness, generations, mutation probability, sigma
    for _ in tqdm(range(ng), desc='Evolving anchors'):
        v = np.ones(sh)
        while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)  # 98.6, 61.6
        kg = (k.copy() * v).clip(min=2.0)
        fg = fitness(thr, wh, kg)
        if fg > f:
            f, k = fg, kg.copy()
            print_results(thr, wh, k)
    k = print_results(thr, wh, k)

    return k
		v_ids = [i for i in os.listdir(input_file_path_base + vehicle_type + '/' + fuel_type) if i[-4:] == '.csv']
		for v_num, v_id in enumerate(v_ids):
			print "Creating Self-Organized Map for " + vehicle_type + " with " + fuel_type + " consuption (ID " + str(v_num) + " of " + str(len(v_ids)) + ")\r",

			# Opens ID frame sampling analysis (NOT NORMALIZED)
			input_file_name = input_file_path_base + vehicle_type + '/' + fuel_type + '/' + v_id
			df = pd.read_csv(input_file_name)
			data = df.fillna(0).as_matrix().astype(float)

			# Starts K-Means analysis
			best_distortion = None
			best_code_book = None
			best_distance = None
			best_k = None
			for k_mean in k_means:
				centroids, distortion = kmeans(data, k_mean)									# Uses kmeans to clusterize data from actual map
				code_book, distance = vq(data, centroids)										# Gets codebook of ID's

				# Saves results if distortion is more than "elbow_rate" percent smaller than the best distortion so far
				if best_distortion == None or abs(distortion - best_distortion)/best_distortion > elbow_rate:
					best_distortion = distortion
					best_code_book = code_book
					best_distance = distance
					best_k = k_mean
				# If the distortion is not "elbow_rate" percent smaller than the best distortion so far, quit the analysis
				else:
					break

			df['CODE_BOOK'] = best_code_book													# Saves codebook on result on dataframe
			df['DISTANCE'] = best_distance														# Saves distances from centroids on dataframe
			f.write(vehicle_type + ' ' + fuel_type + ' ' + v_id[:-4] + ' k=' + str(best_k) + '\n')
# -*- coding: utf-8 -*-
"""
K-means clustering

@author: David André Rodríguez Méndez (AndreRdz7)
"""
# Import libraries
import numpy as np
from scipy.cluster.vq import vq, kmeans
# Create datasets
data = np.random.random(90).reshape(30, 3)
c1 = np.random.choice(range(len(data)))
c2 = np.random.choice(range(len(data)))
# Getting k
clust_centers = np.vstack([data[c1], data[c2]])
print(clust_centers)
print(vq(data, clust_centers))
# K-means
kmeans(data, clust_centers)
Example #41
File: test_vq.py  Project: yaqiongl/scipy
 def test_kmeans_large_thres(self):
     # Regression test for gh-1774
     x = np.array([1, 2, 3, 4, 10], dtype=float)
     res = kmeans(x, 1, thresh=1e16)
     assert_allclose(res[0], np.array([4.]))
     assert_allclose(res[1], 2.3999999999999999)
        cat_path = join(datasetpath, cat)
        cat_files = get_imgfiles(cat_path)
        cat_features = extractSift(cat_files)
        all_files = all_files + cat_files
        all_features.update(cat_features)
        cat_label[cat] = label
        for i in cat_files:
            all_files_labels[i] = label

    print "---------------------"
    print "## computing the visual words via k-means"
    all_features_array = dict2numpy(all_features)
    nfeatures = all_features_array.shape[0]
    nclusters = int(sqrt(nfeatures))
    codebook, distortion = vq.kmeans(all_features_array,
                                     nclusters,
                                     thresh=K_THRESH)

    with open(datasetpath + CODEBOOK_FILE, 'wb') as f:

        dump(codebook, f, protocol=HIGHEST_PROTOCOL)

    print "---------------------"
    print "## compute the visual words histograms for each image"
    all_word_histgrams = {}
    for imagefname in all_features:
        word_histgram = computeHistograms(codebook, all_features[imagefname])
        all_word_histgrams[imagefname] = word_histgram

    print "---------------------"
    print "## write the histograms to file to pass it to the svm"
Example #43
from numpy import vstack, array
from numpy.random import rand
#from scipy.cluster.vq import whiten
import scipy.cluster.vq as vec

# data generation with three features
data = vstack((rand(100, 3) + array([.5, .5, .5]), rand(100, 3)))

# whitening of data
data = vec.whiten(data)

# computing K-Means with K = 3
centroids, _ = vec.kmeans(data, 3)

# assign each sample to a cluster
clx, _ = vec.vq(data, centroids)

print(data)
print(centroids)
print(clx)
Example #44
def find_starting_set(run=True, display=False):
    """ For of the 100 clustered demands points,
    
    Determine whether a base in the set of n bases 
    can reach that demand point within r1. 
    
    As soon as a demand point cannot be reached by a base
    within r1, throw that set of bases out
    
    There is no way to brute force every combination of 8 ambulances
    But instead use the demand points, find the surrounding bases,
    and check whether that fulfills the set coverage. 
    
    Reorder the bases list, and then randomize it. """

    if not run: return
    global calls, bases, demands, times, converted_calls, calls_kmeans

    delta = 0  # See below.
    clust_call_to_base = []

    call_array = array(converted_calls)
    if not calls_kmeans:
        calls_kmeans = kmeans(call_array, top_n)

    # Get the first coordinate, then find the closest actual base.
    calls_clustered_list = calls_kmeans[0]

    if display: print("For each representative call point, find the bases. \n")

    for each_call in calls_clustered_list:
        if display: print("\n")

        # Search for points. If it empty, then redo it.
        delta = 0.01
        actual = []
        reorder_bases = copy.deepcopy(bases)
        reorder_bases.sort(key=itemgetter(0))

        while not actual:
            actual = search.surrounding_points(
                each_call,
                delta,
                [],  # Doesn't actually do anything
                reorder_bases)
            delta += 0.01

        clust_call_to_base.append(tuple([list(each_call), actual]))

        if display:
            plot([each_call], "red")

            print("-----------------------------------------------------")
            print(each_call, " with distance of %.2f km  " % (delta))
            print("-----------------------------------------------------")

            for each in actual:
                print(each)
                plot([each[0]], "green")

            plt.show()
    if display: print("<< EOF >>")
    return clust_call_to_base
Example #45
bpms = []
for line in fd:
    if comment.match(line):
        continue
    md = pat.search(line)
    if md:
        bpm = md.group(2)
        bpm = float(bpm)
        title = re.sub(pat, "", line)
        track = Track(title, bpm)
        tracks.append(track)
        bpms.append(bpm)

obs = np.array(bpms).T

cb, error = kmeans(obs, args.nClusters)

codes, dist = vq(obs, cb)

total_error = 0
for i, item in enumerate(tracks):
    bpm = cb[codes[i]]
    tracks[i].new_bpm = bpm
    error = dist[i]
    tracks[i].error = error
    total_error += error

tracks.sort(key=lambda x: x.orig_bpm)

curbpm = 0
for _, track in enumerate(tracks):
示例#46
from collections import defaultdict
from similar_words import load_vectors
from scipy.cluster.vq import kmeans, vq
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--basename',
                        help='base name of word vector files',
                        type=str)
    parser.add_argument('--maxwords',
                        help='maximum number of words to cluster',
                        type=int)
    parser.add_argument('--k', help='number of clusters', type=int)
    args = parser.parse_args()

    vectors, words = load_vectors(args.basename, args.maxwords)

    centroids, _ = kmeans(vectors, args.k)
    idx, _ = vq(vectors, centroids)

    clusters = defaultdict(set)
    for i, c in enumerate(idx):
        clusters[c].add(words[i])

    for c in range(args.k):
        print 'CLUSTER', c + 1,
        for word in clusters[c]:
            print word,
        print
        print
示例#47
def colorCluster(img):
    img = imread(img)
    pixel = reshape(img,(img.shape[0]*img.shape[1],3))
    centroids,_ = kmeans(pixel,3) # three dominant colors will be found
    return centroids
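# Supplementary usage sketch (hypothetical helper; assumes the same unshown
# imports as colorCluster above -- imread, reshape, kmeans -- plus vq from
# scipy.cluster.vq, and an RGB image): map every pixel to its nearest dominant
# color to produce a color-quantized copy of the image.
from scipy.cluster.vq import vq

def quantize_image(path, k=3):
    img = imread(path)
    pixels = reshape(img, (-1, 3)).astype(float)  # kmeans expects float data
    centroids, _ = kmeans(pixels, k)              # k dominant colors
    labels, _ = vq(pixels, centroids)             # nearest color per pixel
    return centroids[labels].reshape(img.shape).astype(img.dtype)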
示例#48
def kmean_anchors(path='./data/coco64.txt',
                  n=9,
                  img_size=(640, 640),
                  thr=0.20,
                  gen=1000):
    # Creates kmeans anchors for use in *.cfg files: from utils.utils import *; _ = kmean_anchors()
    # n: number of anchors
    # img_size: (min, max) image size used for multi-scale training (can be same values)
    # thr: IoU threshold hyperparameter used for training (0.0 - 1.0)
    # gen: generations to evolve anchors using genetic algorithm
    from utils.datasets import LoadImagesAndLabels

    def print_results(k):
        k = k[np.argsort(k.prod(1))]  # sort small to large
        iou = wh_iou(wh, torch.Tensor(k))
        max_iou = iou.max(1)[0]
        bpr, aat = (max_iou > thr).float().mean(), (
            iou > thr).float().mean() * n  # best possible recall, anch > thr
        print('%.2f iou_thr: %.3f best possible recall, %.2f anchors > thr' %
              (thr, bpr, aat))
        print(
            'n=%g, img_size=%s, IoU_all=%.3f/%.3f-mean/best, IoU>thr=%.3f-mean: '
            % (n, img_size, iou.mean(), max_iou.mean(), iou[iou > thr].mean()),
            end='')
        for i, x in enumerate(k):
            print('%i,%i' % (round(x[0]), round(x[1])),
                  end=',  ' if i < len(k) - 1 else '\n')  # use in *.cfg
        return k

    def fitness(k):  # mutation fitness
        iou = wh_iou(wh, torch.Tensor(k))  # iou
        max_iou = iou.max(1)[0]
        return (max_iou * (max_iou > thr).float()).mean()  # product

    # Get label wh
    wh = []
    dataset = LoadImagesAndLabels(path, augment=True, rect=True)
    nr = 1 if img_size[0] == img_size[
        1] else 10  # number of augmentation repetitions
    for s, l in zip(dataset.shapes, dataset.labels):
        wh.append(l[:, 3:5] *
                  (s / s.max()))  # image normalized to letterbox normalized wh
    wh = np.concatenate(wh, 0).repeat(nr, axis=0)  # augment 10x
    wh *= np.random.uniform(img_size[0], img_size[1],
                            size=(wh.shape[0],
                                  1))  # normalized to pixels (multi-scale)
    wh = wh[(wh > 2.0).all(1)]  # remove below threshold boxes (< 2 pixels wh)

    # Kmeans calculation
    from scipy.cluster.vq import kmeans
    print('Running kmeans for %g anchors on %g points...' % (n, len(wh)))
    s = wh.std(0)  # sigmas for whitening
    k, dist = kmeans(wh / s, n, iter=30)  # points, mean distance
    k *= s
    wh = torch.Tensor(wh)
    k = print_results(k)

    # # Plot
    # k, d = [None] * 20, [None] * 20
    # for i in tqdm(range(1, 21)):
    #     k[i-1], d[i-1] = kmeans(wh / s, i)  # points, mean distance
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7))
    # ax = ax.ravel()
    # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.')
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7))  # plot wh
    # ax[0].hist(wh[wh[:, 0]<100, 0],400)
    # ax[1].hist(wh[wh[:, 1]<100, 1],400)
    # fig.tight_layout()
    # fig.savefig('wh.png', dpi=200)

    # Evolve
    npr = np.random
    f, sh, mp, s = fitness(
        k), k.shape, 0.9, 0.1  # fitness, shape, mutation prob, sigma
    for _ in tqdm(range(gen), desc='Evolving anchors'):
        v = np.ones(sh)
        while (v == 1
               ).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s +
                 1).clip(0.3, 3.0)
        kg = (k.copy() * v).clip(min=2.0)
        fg = fitness(kg)
        if fg > f:
            f, k = fg, kg.copy()
            print_results(k)
    k = print_results(k)

    return k
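# Supplementary note (standalone illustrative sketch): the `k, dist =
# kmeans(wh / s, n)` followed by `k *= s` above is the manual equivalent of
# scipy's whiten() round trip -- divide by the per-column std before
# clustering, then rescale the centroids back to pixel units. The box sizes
# below are made up for illustration.
import numpy as np
from scipy.cluster.vq import kmeans

demo_wh = np.random.rand(500, 2) * np.array([320.0, 240.0])  # fake w/h pairs
sigma = demo_wh.std(0)                       # per-column std ("s" above)
anchors, _ = kmeans(demo_wh / sigma, 9, iter=30)
anchors *= sigma                             # anchors back in pixel units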
示例#49
def analyze_color(input_image,
                  transparency_threshold=50,
                  plot_3d=False,
                  plot_bar=True,
                  n_cluster=None,
                  max_cluster=10,
                  ignore_pure_black=True,
                  use_sample=True,
                  return_colors=True):

    # Work on a copy so the caller's image array is not modified
    input_image = input_image.copy()

    # Check input shape
    assert (len(input_image.shape) == 3)
    assert (input_image.shape[-1] in {3, 4})

    # Turn color info of pixels into dataframe, filter by transparency if RGBA image is passed
    if input_image.shape[-1] == 4:
        color_df = pd.DataFrame(input_image.reshape(-1, 4),
                                columns=list('rgba'))
        # Get the rgb info of pixels in the non-transparent part of the image
        color_df = color_df[color_df['a'] >= transparency_threshold]
    if input_image.shape[-1] == 3:
        color_df = pd.DataFrame(input_image.reshape(-1, 3),
                                columns=list('rgb'))

    if ignore_pure_black:
        color_df = color_df[~((color_df['r'] == 0) & (color_df['g'] == 0) &
                              (color_df['b'] == 0))]

    # Handle large pixel color_df
    if not use_sample and len(color_df) > 1e5:
        sample_or_not = (input(
            'Large image detected, would you like to sample the pixels in this image? (Y/N) '
        )).lower()[0] == 'y'
        if sample_or_not:
            print(
                'Sampling 100,000 pixels from the image; note that you can also resize the image before passing it to this function.'
            )
            color_df = color_df.sample(n=int(1e5), random_state=0)
        else:
            print(
                'No sampling performed; note that rendering a 3D plot of all the pixels may crash your session and K-means clustering will be slow.'
            )

    # Get std for reverse-transform the kmeans results to a meaningful rgb palette
    r_std, g_std, b_std = color_df[list('rgb')].std()
    reverse_whiten_array = np.array((r_std, g_std, b_std))

    # Normalize observations on a per feature basis, forcing features to have unit variance
    # Doc: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.vq.whiten.html
    for color in list('rgb'):
        color_df['scaled_' + color] = whiten(color_df[color])

    ## 3D scatter plot showing color groups
    if plot_3d:
        trace = go.Scatter3d(
            x=color_df['r'],
            y=color_df['g'],
            z=color_df['b'],
            mode='markers',
            marker=dict(color=[
                'rgb({},{},{})'.format(r, g, b)
                for r, g, b in zip(color_df['r'].values, color_df['g'].values,
                                   color_df['b'].values)
            ],
                        size=1,
                        opacity=1))
        layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
        fig = go.Figure(data=[trace], layout=layout)
        fig.show()

    ## Use K-means to identify main colors
    cluster_centers_list = []
    avg_distortion_list = []

    if n_cluster != None:
        n_cluster_range = [n_cluster - 1]  # note minus 1 to get exactly n
    else:
        n_cluster_range = range(max_cluster + 1)

    if plot_bar:
        # Initialize plt graph
        f, ax = plt.subplots(len(n_cluster_range), 1, figsize=(10, 10))

    for n in n_cluster_range:

        ###### Train clusters ######

        cluster_centers, avg_distortion = kmeans(
            color_df[['scaled_r', 'scaled_g', 'scaled_b']], n + 1)

        ###### Assign labels ######

        labels, distortions = vq(
            color_df[['scaled_r', 'scaled_g', 'scaled_b']], cluster_centers)

        color_df['label'] = labels
        color_df['distortion'] = distortions

        ###### Build palette ######

        # These parameters affect visual style only and could be exposed to the user later
        height = 200
        width = 1000
        gap_size = 5
        palette = np.zeros((height, width, 3), np.uint8)

        # Count how many pixels fall into each cluster; this decides each color's relative width in the palette
        cluster_proportion = color_df['label'].value_counts().sort_index(
        ) / len(color_df)
        cluster_width_list = (cluster_proportion * width).to_list()
        cluster_width_list = [
            int(x) for x in saferound(cluster_width_list, places=0)
        ]

        # Reorder clusters and widths according to the proportion, largest to smallest
        reordered_cluster_df = pd.DataFrame(
            zip(cluster_centers, cluster_width_list),
            columns=['cluster', 'width']).sort_values('width', ascending=False)
        cluster_centers = reordered_cluster_df['cluster'].tolist()
        cluster_width_list = reordered_cluster_df['width'].tolist()

        # Storing information
        cluster_centers_list.append(cluster_centers)
        avg_distortion_list.append(avg_distortion)

        if plot_bar:
            # Coloring the palette canvas based on color and width
            endpoints = list(np.cumsum(cluster_width_list))
            startpoints = [0] + endpoints[:-1]
            for cluster_index in range(len(cluster_centers)):
                # Notice here we apply the reverse_whiten_array to get meaningful RGB colors
                palette[:, startpoints[cluster_index] + gap_size:
                        endpoints[cluster_index], :] = cluster_centers[
                            cluster_index] * reverse_whiten_array
                palette[:,
                        startpoints[cluster_index]:startpoints[cluster_index] +
                        gap_size, :] = (255, 255, 255)

            # Displaying the palette when performing K-means with parameter n
            if n_cluster != None:
                ax.imshow(palette)
                ax.axis('off')
            else:
                ax[n].imshow(palette)
                ax[n].axis('off')

    if plot_bar:
        ### Show the entire palette
        f.tight_layout()
        plt.show()
        ### Show the elbow plot for choosing best n_cluster parameter for K-means
        fig = plt.figure()
        plt.scatter(x=n_cluster_range, y=avg_distortion_list)
        fig.suptitle('Elbow Plot for K-means')
        plt.xlabel('Number of Clusters')
        plt.ylabel('Average Distortion')
        plt.show()

    if return_colors:
        if n_cluster != None:
            return (cluster_centers_list[0] * reverse_whiten_array).astype(
                np.uint8)
        else:
            return [(cluster_centers * reverse_whiten_array).astype(np.uint8)
                    for cluster_centers in cluster_centers_list]
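# Supplementary usage sketch (hypothetical file name; assumes Pillow is
# available plus the unshown imports analyze_color itself relies on, such as
# numpy as np, pandas as pd, whiten/kmeans/vq and saferound):
from PIL import Image

img = np.asarray(Image.open('photo.jpg').convert('RGB'))  # 8-bit RGB array
palette = analyze_color(img, n_cluster=5, plot_3d=False, plot_bar=False)
print(palette)  # five RGB rows (uint8), ordered from largest cluster down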
from termcolor import colored,cprint
import matplotlib.pyplot as plt

import numpy as np
from scipy.cluster import vq
# Creating data
c1 = np.random.randn(10, 2) + 5 
c2 = np.random.randn(3, 2) - 5 
c3 = np.random.randn(5, 2)

print(c1,colored('c2=','red'),c2,colored('c3=','blue'),c3)
# Pooling all the data into one 18 x 2 array
data = np.vstack([c1, c2, c3])
print(colored('data =','red'),'\n',data)
# Calculating the cluster centroids and variance 
# from kmeans
centroids, variance = vq.kmeans(data, 3)
# The identified array returned by vq() tells us which centroid
# each observation belongs to, which lets us separate the points
# into clusters.
identified, distance = vq.vq(data, centroids)
# Retrieving coordinates for the points in each cluster identified by vq
vqc1 = data[identified == 0]
vqc2 = data[identified == 1]
vqc3 = data[identified == 2]
print('$',vqc1,'#',vqc2,'@',vqc3)
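# Supplementary sketch: a quick visual check of the split -- plot each
# recovered group with its own marker and overlay the centroids
# (matplotlib is already imported above as plt).
plt.plot(vqc1[:, 0], vqc1[:, 1], 'o',
         vqc2[:, 0], vqc2[:, 1], 's',
         vqc3[:, 0], vqc3[:, 1], '^')
plt.plot(centroids[:, 0], centroids[:, 1], 'k*', markersize=12)
plt.show()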
示例#51
def cluster_followings_sentiments(username,
                                  stop_value=None,
                                  access_token=None):
    """Returns groups and their respective centroids for the followings of a users clustered according to the sentiment reflected in their bios"""
    try:
        from textblob import TextBlob
        from scipy.cluster import vq
        import numpy

        non_empty = lambda x: x if x != ' ' else ''
        filter_crap = lambda x: {
            'username':
            x['username'],
            'bio':
            map(
                non_empty,
                re.sub(r'[^\x00-\x7f]', r' ',
                       re.sub('[^A-Za-z0-9]+', ' ', x['bio'])).encode('utf-8').
                strip(' \t\n\r').split())
        }
        non_zero = lambda x: len(x['bio']) > 3
        joiner = lambda x: {
            'username': x['username'],
            'bio': ' '.join(x['bio'])
        }
        whiten = lambda obs: obs / numpy.std(obs)

        follows_data = numpy.array(
            map(
                joiner,
                filter(non_zero,
                       map(filter_crap, get_follows(username,
                                                    'user_and_bio')))))

        grouped = []

        t = [{
            'username': a['username'],
            'bio': a['bio'],
            'sentiment': [float(a) for a in TextBlob(a['bio']).sentiment]
        } for a in follows_data]

        centers, dist = vq.kmeans(
            numpy.array([[a['sentiment'][0], a['sentiment'][1]] for a in t]),
            whiten(
                numpy.array([[a['sentiment'][0], a['sentiment'][1]]
                             for a in t])), 100)
        code, distance = vq.vq(
            numpy.array([[a['sentiment'][0], a['sentiment'][1]] for a in t]),
            centers)

        for i in range(0, len(centers)):
            grouped.append({
                'centroid': {
                    'polarity': list(map(float, centers[i]))[0],
                    'subjectivity': list(map(float, centers[i]))[1]
                },
                'cluster':
                list(
                    numpy.array([{
                        'polarity': a['sentiment'][0],
                        'subjectivity': a['sentiment'][1],
                        'username': a['username']
                    } for a in t])[code == i])
            })

        centers = sorted([list([float(b) for b in a]) for a in centers])

        return grouped, centers
    except Exception, e:
        print str(e)
示例#52
import random

from scipy.cluster.vq import kmeans


def open_file(path):
    file = open(path)
    lines = []
    for line in file:
        line = line.strip()
        line = float(line)
        lines.append(line)
    file.close()
    return lines


k_lines = open_file('D:\\code\\UBC\\k_VOT.txt')
g_lines = open_file('D:\\code\\UBC\\g_VOT.txt')

data = []
data.extend(k_lines)
data.extend(g_lines)

clusters, _ = kmeans(data, 2)  # _ discards the distortion value
k_centroid = max(clusters)
g_centroid = min(clusters)

for j in range(11):
    value = random.uniform(10, 16)
    k_distance = abs(value - k_centroid)  #distance from centroid
    g_distance = abs(value - g_centroid)
    if k_distance < g_distance:
        print('This sound is probably a k')
    else:
        print('This sound is probably a g')
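# Supplementary sketch: the manual distance comparison above is equivalent to
# a 1-D vector quantization with vq(); numpy and vq are imported here, while
# random, kmeans and the clusters array come from the code above.
import numpy as np
from scipy.cluster.vq import vq

samples = np.array([random.uniform(10, 16) for _ in range(11)]).reshape(-1, 1)
codebook = np.sort(np.asarray(clusters, dtype=float)).reshape(-1, 1)
labels, _ = vq(samples, codebook)  # 0 -> smaller centroid (g), 1 -> larger (k)
for lab in labels:
    print('This sound is probably a ' + ('k' if lab == 1 else 'g'))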
示例#53
文件: test_vq.py 项目: yaqiongl/scipy
 def test_kmeans_simple(self):
     np.random.seed(54321)
     initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
     for tp in np.array, np.matrix:
         code1 = kmeans(tp(X), tp(initc), iter=1)[0]
         assert_array_almost_equal(code1, CODET2)
示例#54
# build ann index
#t = AnnoyIndex(dims)
for file_index, i in enumerate(infiles):
  file_vector = np.loadtxt(i)
  file_name = os.path.basename(i).split('.')[0]
  file_index_to_file_name[file_index] = file_name
  file_index_to_file_vector[file_index] = file_vector

  #whitened = whiten(file_vector)
  #t.add_item(file_index, file_vector)
  

#t.build(trees)
whitened = whiten(features)
codes = 3
result = kmeans(whitened, codes)

'''
# create a nearest neighbors json file for each input
if not os.path.exists('nearest_neighbors'):
  os.makedirs('nearest_neighbors')

for i in file_index_to_file_name.keys():
  master_file_name = file_index_to_file_name[i]
  master_vector = file_index_to_file_vector[i]

  named_nearest_neighbors = []
  nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors)
  for j in nearest_neighbors:
    neighbor_file_name = file_index_to_file_name[j]
    neighbor_file_vector = file_index_to_file_vector[j]
'''

    def cluster_segments(self):
        if self.n_segs >= 3:
            K = range(1, 4)
            N = len(self.hets['seg_id'])
            self.segs.reset_index(inplace=True, drop=False)
            tin_data = np.nanargmax(self.TiN_likelihood_matrix,
                                    axis=1).astype(float)
            km = [kmeans(tin_data, k, iter=1000) for k in K]
            centroids = [cent for (cent, var) in km]
            squared_distance_to_centroids = [
                np.power(np.subtract(tin_data[:, np.newaxis], cent), 2)
                for cent in centroids
            ]
            self.sum_squared_distance = [
                sum(np.min(d, axis=1)) / N
                for d in squared_distance_to_centroids
            ]
            cluster_assignment = [
                np.argmin(d, axis=1) for d in squared_distance_to_centroids
            ]
            het_tin_map = np.argmax(self.p_TiN, axis=1)
            self.cl_distance_points = np.zeros([3, 3])
            for k, clust in enumerate(cluster_assignment):
                for idx, row in self.segs.iterrows():
                    self.cl_distance_points[k, clust[idx]] += np.sum(
                        np.power(
                            het_tin_map[self.hets['seg_id'] == row['index']] -
                            centroids[k][clust[idx]], 2))

            self.cl_var = np.sqrt(
                np.true_divide(self.cl_distance_points,
                               len(self.hets['seg_id'])))
            p = [1, 2, 3]
            delta_bic = [0, 10, 20]
            self.bic = (np.multiply(
                N,
                np.log(
                    np.true_divide(np.sum(self.cl_distance_points, axis=1),
                                   N))) +
                        np.multiply(p, np.log(N))) + delta_bic
            if len(centroids[2]) > 2:
                dist_btwn_c3 = np.mean(
                    [abs(i - j) for i, j in combinations(centroids[2], 2)])
            else:
                dist_btwn_c3 = 0
            if len(centroids[1]) > 1:
                dist_btwn_c2 = np.abs(np.diff(centroids[1]))
            else:
                dist_btwn_c2 = 0
            if dist_btwn_c3 < np.nanmax(
                    self.cl_var[2, :]) and dist_btwn_c2 > np.nanmax(
                        self.cl_var[1, :]):
                solution_idx = np.nanargmin(self.bic[0:1])
                self.cluster_assignment = cluster_assignment[solution_idx]
                self.centroids = centroids[solution_idx]
            if dist_btwn_c3 < np.nanmax(
                    self.cl_var[2, :]) and dist_btwn_c2 < np.nanmax(
                        self.cl_var[1, :]):
                self.cluster_assignment = cluster_assignment[0]
                self.centroids = centroids[0]
            else:
                solution_idx = np.nanargmin(self.bic)
                self.cluster_assignment = cluster_assignment[solution_idx]
                self.centroids = centroids[solution_idx]

        else:
            self.cluster_assignment = 0
            self.centroids = [np.mean(self.segs['TiN_MAP'])]
示例#56
def find_markers(image, template=None):
    """Finds four corner markers.

    Use a combination of circle finding, corner detection and convolution to
    find the four markers in the image.

    Args:
        image (numpy.array): image array of uint8 values.
        template (numpy.array): template image of the markers.

    Returns:
        list: List of four (x, y) tuples
            in the order [top-left, bottom-left, top-right, bottom-right].
    """
    
    img_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    img_gray = cv2.medianBlur(img_gray, 5)
    
    B = (1/8)*np.ones((8,8), dtype=np.float32)
    B[:4,4:] = -1*B[:4,4:]
    B[4:,:4] = -1*B[4:,:4]
    
    angles = np.arange(-90, 91, 7.5, dtype=int)
    for i in angles:
        B_ = ndimage.rotate(B, i, reshape=True)
        
        C = cv2.filter2D(img_gray[5:,5:][:-5,:-5], ddepth = -1, kernel = B_)

        n=22
        j=6
        u = 0.8
        v = 4
        lo = int(0.5*(n-j))
        hi = int(0.5*(n+j))
        spot = (-1)*np.ones((n,n), dtype=np.float32)
        spot[lo:hi,lo:hi] = -1*spot[lo:hi,lo:hi]
        spot[-v:,:] = u*spot[-v:,:]
        spot[:,-v:] = u*spot[:,-v:]
        spot[:v,:] =  u*spot[:v,:]
        spot[:,:v] =  u*spot[:,:v]
        blobs = cv2.filter2D(C, ddepth = -1, kernel = spot)
        
        centers = np.array(np.argwhere(blobs==255), dtype = "float32") + 5
        if centers.shape[0]>15:
            break
    
    if centers.shape[0]>3:
        markers = np.array(kmeans(centers,4)[0], dtype = int)   
        
        rank_y = markers[:,1].argsort()
        rank_x1 = markers[rank_y[:2]][:,0].argsort()
        rank_x2 = markers[rank_y[2:]][:,0].argsort()
        
        p1 = markers[rank_y[:2]][rank_x1[0]]
        p2 = markers[rank_y[:2]][rank_x1[1]]
        p3 = markers[rank_y[2:]][rank_x2[0]]
        p4 = markers[rank_y[2:]][rank_x2[1]]
        
        final_markers = [(p1[1], p1[0]), (p2[1], p2[0]), (p3[1], p3[0]), (p4[1], p4[0])]
    else:
        final_markers = [(0,0), (2,0), (0,2), (2,2)]
    
    return final_markers
def search(query, n=40, start=0):
    # retrieve top n results of query
    # default is 40 results per page

    dict_res = BossImageIndex().CallBoss(query, n, start)
    im_res = dict_res['ysearchresponse']['resultset_images']
    res = []
    for i in xrange(n):
        res.append((im_res[i]['thumbnail_url'], i))

    #path_name = "/Library/WebServer/results/"+query
    path_name = "/Users/novi/my_image_search/results/" + query

    # create the folder (if does not exist) to save query results
    if os.path.isdir(path_name):
        shutil.rmtree(path_name)
        os.mkdir(path_name)
    else:
        os.mkdir(path_name)

    # download the image results
    image = urllib.URLopener()
    silentcounter = 1
    imagefile = []
    for counter in xrange(n):
        urltoberetrieved = res[counter][0]
        #print urltoberetrieved
        filename = '%s/%s.%s' % (path_name, silentcounter, 'jpg')
        #try:
        image.retrieve(urltoberetrieved, filename)
        imagefile.append(filename)
        silentcounter = silentcounter + 1
        #except IOError:
        #    print 'error at %s \n' % (urltoberetrieved)
        #    pass

    # prepare the color image feature
    pref = numpy.array([[0, 0]])  # [image #,position #]
    ldesc = []
    codes = 30  # number of k-means clusters
    ino = 5
    jno = 8  # default grid: 5 by 8 2D grid
    show = ino * jno
    lim = show

    silentcounter = 1
    for i_img in xrange(lim):
        fname = imagefile[i_img]
        try:
            im = cv.LoadImage(fname,
                              0)  # loading with OpenCV (gray channel only)
            silentcounter = silentcounter + 1
        except:
            print 'image thumbnail can not be retrieved'
            sys.exit(0)

        #resizing the image
        #om = cv.CreateImage((psize,psize),im.depth,im.nChannels)
        #cv.Resize(im,om,cv.CV_INTER_CUBIC)
        storage = cv.CreateMemStorage(0)
        #generating the mask
        #mat = cv.CreateMat(psize,psize,cv.CV_8UC1)
        #extracting SURF feature
        #[keypoints,descriptors] = cv.ExtractSURF(om,mat,storage,(1,500,3,4))
        [keypoints, descriptors] = cv.ExtractSURF(im, im, storage,
                                                  (1, 500, 3, 4))
        ldesc.append(descriptors)

    #perform vector quantization
    tarrdesc = [numpy.array(ldesc[i]) for i in range(show)]
    lendesc = [ldesc[i].__len__() for i in range(show)]
    arrdesc = numpy.concatenate([tarrdesc[i] for i in range(show)])
    arrdesc = whiten(arrdesc)
    [codebook, distortion] = kmeans(arrdesc, codes)
    [code, dist] = vq(arrdesc, codebook)

    #generate the semantic feature
    imgdata = numpy.zeros((show, codebook.shape[0]), dtype=float)
    code_offset = 0
    for i_img in xrange(show):
        code_index = range(code_offset, code_offset + lendesc[i_img])
        for i_code in code_index:
            imgdata[i_img, code[i_code]] = imgdata[i_img, code[i_code]] + 1
        code_offset = code_offset + lendesc[i_img]

    #normalize the semantic feature
    sumimgdata = numpy.sum(imgdata, axis=1)
    sumimgdata.shape = show, 1
    imgdata = imgdata / sumimgdata

    griddata = numpy.zeros((2, ino * jno))
    griddata[0, ] = numpy.kron(range(1, ino + 1), numpy.ones((1, jno)))
    griddata[1, ] = numpy.tile(range(1, jno + 1), (1, ino))

    # do kernelized sorting procedure
    PI = KS(imgdata, griddata.T, pref)
    i_sorting = PI.argmax(axis=1)

    #creating the passed dictionary
    sorted_dict_res = {}
    sorted_dict_res['count'] = dict_res['ysearchresponse']['count']
    sorted_dict_res['totalhits'] = dict_res['ysearchresponse']['totalhits']
    sorted_dict_res['start'] = dict_res['ysearchresponse']['start']
    sorted_dict_res['resultset_images'] = [
        dict_res['ysearchresponse']['resultset_images'][i] for i in i_sorting
    ]
    return sorted_dict_res
示例#58
def limb_track():
    global frame_n

    cv.namedWindow("Dots")
    fps = 30
    frame_dt = 0  #1.0 / fps
    mv_i = 0
    pause = False

    while True:
        print("Frame:", mv_i)
        if frame_n >= contour_data.shape[0]:
            #mv_i = 0
            print("Frames completed:", frame_n)
            f_write.save(write_dict)
            break

        t = time.clock()
        ret, im = cap.read()

        for x, y in fs:
            cv.circle(im, (x, y), 2, (255, 0, 0), -1)

        n = n_contours[mv_i]

        if (n > 0):

            c_points = contour_data[mv_i, :n]

            limb_distances = np.empty((num_limbs, n))
            for i in range(num_limbs):
                limb_x, limb_y = fs[i]
                for j in range(n):
                    x, y = c_points[j]
                    dx = limb_x - x
                    dy = limb_y - y
                    distance = dx * dx + dy * dy
                    limb_distances[i, j] = distance

                limb_distances[i] = np.sort(limb_distances[i])

            threshold = 1500
            needed_limbs = np.where(limb_distances[:, 0] < threshold)[0]

            whitened = whiten(c_points)
            x_scale = c_points[0, 0] / whitened[0, 0]
            y_scale = c_points[0, 1] / whitened[0, 1]

            if (needed_limbs.shape[0] > 0):
                max_k = 6
                costs = np.empty(max_k - needed_limbs.shape[0])
                all_kmean_points = []
                for k in range(needed_limbs.shape[0], max_k):
                    points, distortion = kmeans(whitened, k)
                    points[:, 0] *= x_scale
                    points[:, 1] *= y_scale
                    points = points.astype('int32')
                    all_kmean_points.append(points)
                    costs[k - needed_limbs.shape[0]] = cost(
                        points, needed_limbs)

                best_ind = np.argmin(costs)
                best_points = all_kmean_points[best_ind]

                for i, (x, y) in enumerate(best_points):
                    cv.circle(im, (x, y), 2, (0, 0, 255), -1)

                distances = np.empty(
                    (needed_limbs.shape[0], best_points.shape[0]))
                indices = np.empty(
                    (needed_limbs.shape[0], best_points.shape[0], 2),
                    dtype='uint8')
                for i in range(needed_limbs.shape[0]):
                    limb_x, limb_y = fs[needed_limbs[i]]
                    for j in range(best_points.shape[0]):
                        x, y = best_points[j]
                        dx = x - limb_x
                        dy = y - limb_y
                        distance = dx * dx + dy * dy
                        distances[i, j] = distance
                        indices[i, j, 0] = needed_limbs[i]
                        indices[i, j, 1] = j

                for i in range(needed_limbs.shape[0]):
                    i, j = np.unravel_index(np.nanargmin(distances),
                                            distances.shape)
                    limb_ind = indices[i, j, 0]
                    point_ind = indices[i, j, 1]
                    new_limb_pos = (best_points[point_ind,
                                                0], best_points[point_ind, 1])
                    cv.line(im, fs[limb_ind], new_limb_pos, (255, 255, 255), 1)
                    fs[limb_ind] = new_limb_pos
                    distances[i] = np.NaN
                    distances[:, j] = np.NaN

        for i in range(num_limbs):
            name = names[i]
            x, y = fs[i]
            write_dict[name][mv_i, 0] = x
            write_dict[name][mv_i, 1] = y

        cv.putText(im, str(frame_n), (5, 25), cv.FONT_HERSHEY_SIMPLEX, 1.0,
                   (255, 255, 255))
        cv.imshow("Dots", im)

        if pause:
            k = cv.waitKey(0)
        else:
            dt = frame_dt - (time.clock() - t)
            dt_mili = int(dt * 1000)

            if (dt_mili < 1):
                dt_mili = 1

            k = cv.waitKey(dt_mili)
            mv_i += 1
            frame_n += 1

        if k == 27:  # esc key
            print("Frames completed:", frame_n)
            f_write.save(write_dict)
            break
        elif k == 32:  # space key
            pause = not (pause)
        elif k == 63235 and pause:  # right arrow
            mv_i += 1
            frame_n += 1
            print(stds[frame_n])
        elif k == 63234 and pause:  # left arrow
            mv_i -= 1
            frame_n -= 1
            print(stds[frame_n])
示例#59
  [Truncated example output: numeric cluster vectors, each followed by the
   operator names it groups, e.g. "Twitch, ash, -therm, -thatch, bandit,
   blackbeard" and "-twitch, ash, thermite, valk, capitao, -vigil, -nomad".]
  """
  

np.random.seed((1000,2000))  # fixed random seed for reproducibility
K = range(1,20) # k's for k means
KM = [kmeans(dataset,k) for k in K]
centroids = [cent for (cent,var) in KM]
D_k = [cdist(dataset, cent, 'euclidean') for cent in centroids]
cIdx = [np.argmin(D,axis=1) for D in D_k]
dist = [np.min(D,axis=1) for D in D_k]

tot_withinss = [sum(d**2) for d in dist]  # Total within-cluster sum of squares
totss = sum(pdist(dataset)**2)/dataset.shape[0]       # The total sum of squares
betweenss = totss - tot_withinss          # The between-cluster sum of squares

##### plots #####
kIdx = 7        # K=8
mrk = 'os^p<dvh8>+x.'

# elbow curve
plt.plot(K, betweenss/totss*100, 'b*-')
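# Supplementary sketch (a minimal continuation, not from the original
# snippet): mark the chosen K on the elbow curve and label the axes before
# showing it; kIdx, K, betweenss and totss come from the code above.
explained = betweenss / totss * 100
plt.plot(K[kIdx], explained[kIdx], marker='o', markersize=12,
         markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.xlabel('Number of clusters')
plt.ylabel('Percentage of variance explained (%)')
plt.title('Elbow curve for k-means clustering')
plt.show()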
示例#60
import scipy
import scipy.cluster.hierarchy as sch
from scipy.cluster.vq import whiten, kmeans, vq
import matplotlib.pylab as plt

# Generate the data points to cluster: 20 points, each with 4 features
points = scipy.randn(20, 4)

# 1. Hierarchical clustering
# Build the pairwise distance matrix, here using Euclidean distance:
disMat = sch.distance.pdist(points, 'euclidean')
# Perform hierarchical clustering:
Z = sch.linkage(disMat, method='average')
# Render the result as a dendrogram and save it as plot_dendrogram.png
P = sch.dendrogram(Z)
plt.savefig('plot_dendrogram.png')
# Derive flat cluster labels from the linkage matrix Z:
cluster = sch.fcluster(Z, t=1, criterion='inconsistent')

print "Original cluster by hierarchy clustering:\n", cluster

# 2. K-means clustering
# Normalize (whiten) the raw data
data = whiten(points)

# Cluster with kmeans: the first argument is the data, the second is the number of clusters k.
# If the final number of clusters is unknown, one option is to take it from the hierarchical
# clustering result above; a fixed value works just as well.
# kmeans returns a pair (centroids, distortion); we keep only the centroids, hence the [0].
centroid = kmeans(data, max(cluster))[0]

# Assign every observation to its nearest centroid with vq; vq also returns a pair, and [0] is the label array.
label = vq(data, centroid)[0]

print "Final clustering by k-means:\n", label