def kmeans2():
    features = locations()
    whitened = whiten(features)
    book = array((whitened[0],whitened[2]))
    # Expected output (approximately):
    # (array([[ 2.3110306 ,  2.86287398],
    #         [ 0.93218041,  1.24398691]]), 0.85684700941625547)
    return kmeans(whitened,book)

def kmeans1():
    features = array([[ 1.9,2.3], [ 1.5,2.5], [ 0.8,0.6], [ 0.4,1.8], [ 0.1,0.1],
                      [ 0.2,1.8], [ 2.0,0.5], [ 0.3,1.5], [ 1.0,1.0]])
    whitened = whiten(features)
    book = array((whitened[0],whitened[2]))
    # Expected output (approximately):
    # (array([[ 2.3110306 ,  2.86287398],
    #         [ 0.93218041,  1.24398691]]), 0.85684700941625547)
    return kmeans(whitened,book)
def clustering_scipy_kmeans(features, n_clust = 8):
  """
  """
  whitened = whiten(features)
  print whitened.shape
  
  initial = [kmeans(whitened,i) for i in np.arange(1,12)]
  plt.plot([var for (cent,var) in initial])
  plt.show()
  
  #cent, var = initial[3]
  ##use vq() to get as assignment for each obs.
  #assignment,cdist = vq(whitened,cent)
  #plt.scatter(whitened[:,0], whitened[:,1], c=assignment)
  #plt.show()
  
  codebook, distortion = kmeans(whitened, n_clust)
  print codebook, distortion
  assigned_label, dist = vq(whitened, codebook)
  for ii in range(8):
    plt.subplot(4,2,ii+1)
    plt.plot(codebook[ii])
  plt.show()
  
  centroid, label = kmeans2(whitened, n_clust, minit = 'points')
  print centroid, label
  for ii in range(8):
    plt.subplot(4,2,ii+1)
    plt.plot(centroid[ii])
  plt.show()
def kmeans_net(net, layers, num_c=16, initials=None):
    # net: the network
    # layers: the layers to quantize
    # num_c: number of quantization levels per layer
    # initials: initial cluster centers
    codebook = {} # quantization codebook
    if type(num_c) == type(1):
        num_c = [num_c] * len(layers)
    else:
        assert len(num_c) == len(layers)

    # Run the cluster analysis for each layer
    print "==============Perform K-means============="
    for idx, layer in enumerate(layers):
        print "Eval layer:", layer
        W = net.params[layer][0].data.flatten()
        W = W[np.where(W != 0)] # keep only the non-zero weights
        # By default, the cluster centers are spaced uniformly over the weight range
        if initials is None:  # Default: uniform sample
            min_W = np.min(W)
            max_W = np.max(W)
            initial_uni = np.linspace(min_W, max_W, num_c[idx] - 1)
            codebook[layer], _ = scv.kmeans(W, initial_uni)
        elif type(initials) == type(np.array([])):
            codebook[layer], _ = scv.kmeans(W, initials)
        elif initials == 'random':
            codebook[layer], _ = scv.kmeans(W, num_c[idx] - 1)
        else:
            raise Exception

        # Put the zero weight back as an extra code
        codebook[layer] = np.append(0.0, codebook[layer])
        print "codebook size:", len(codebook[layer])

    return codebook
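A rough sketch of how such a per-layer codebook could be applied afterwards, assuming `scv` is `scipy.cluster.vq`; the weight vector here is a random stand-in for `net.params[layer][0].data.flatten()`, and this quantization step is not part of the function above:

import numpy as np
import scipy.cluster.vq as scv

W = np.random.randn(1000)            # stand-in for a layer's flattened weights
codebook, _ = scv.kmeans(W, 16)      # 16 cluster centers over the weight values
labels, _ = scv.vq(W, codebook)      # index of the nearest center for each weight
W_quantized = codebook[labels]       # each weight replaced by its cluster center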
Example #5
File: t.py  Project: mcaprari/rt-pub
def custom():
	_items = {}
	users = []

	for line in open('my_items_likehood.txt'):
		user, item = keys(line)
		users.append(user)
		if item in _items:
			_items[item].append(user)
		else:
			_items[item] = [user]


	sorted_users = sorted(users)
	l = len(sorted_users)
	items={}
	count=0
	features=[]
	for item in _items:
	
		features.append(user_matrix(l, _items[item], sorted_users))
		if count == 100: break
		count += 1

	print 'whiten'
	whitened = whiten(array(features))
	print 'kmeans'
	print kmeans(whitened, 2)  # the original call omitted the cluster count; 2 is an arbitrary placeholder
	print "%d items voted by %d users" % (len(_items), len(users))
Example #6
def _get_larger_chroms(ref_file):
    """Retrieve larger chromosomes, avoiding the smaller ones for plotting.
    """
    from scipy.cluster.vq import kmeans, vq
    all_sizes = []
    for c in ref.file_contigs(ref_file):
        all_sizes.append(float(c.size))
    all_sizes.sort()
    # separate out smaller chromosomes and haplotypes with kmeans
    centroids, _ = kmeans(np.array(all_sizes), 2)
    idx, _ = vq(np.array(all_sizes), centroids)
    little_sizes = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx, all_sizes)))
    little_sizes = [x[1] for x in little_sizes]
    # create one more cluster with the smaller, removing the haplotypes
    centroids2, _ = kmeans(np.array(little_sizes), 2)
    idx2, _ = vq(np.array(little_sizes), centroids2)
    little_sizes2 = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx2, little_sizes)))
    little_sizes2 = [x[1] for x in little_sizes2]
    # get any chromosomes not in haplotype/random bin
    thresh = max(little_sizes2)
    larger_chroms = []
    for c in ref.file_contigs(ref_file):
        if c.size > thresh:
            larger_chroms.append(c.name)
    return larger_chroms
Example #7
def cluster(df, means, csv_min, csv_max):
    data = []
    for i in range(csv_min, csv_max):
        a = array(df.ix[:, i].values)
        b = a[a != "--"]
        print np.sort(kmeans(b.astype(np.float), means)[0])
        data.append(np.sort(kmeans(b.astype(np.float), means)[0]))
    return data
Example #8
    def test_kmeans_lost_cluster(self):
        # This will cause kmeans to have a cluster with no points.
        data = np.fromfile(DATAFILE1, sep=", ")
        data = data.reshape((200, 2))
        initk = np.array([[-1.8127404, -0.67128041], [2.04621601, 0.07401111], [-2.31149087, -0.05160469]])

        kmeans(data, initk)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            kmeans2(data, initk, missing="warn")

        assert_raises(ClusterError, kmeans2, data, initk, missing="raise")
Example #9
    def test_kmeans_lost_cluster(self):
        # This will cause kmeans to have a cluster with no points.
        data = TESTDATA_2D
        initk = np.array([[-1.8127404, -0.67128041],
                         [2.04621601, 0.07401111],
                         [-2.31149087,-0.05160469]])

        kmeans(data, initk)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', UserWarning)
            kmeans2(data, initk, missing='warn')

        assert_raises(ClusterError, kmeans2, data, initk, missing='raise')
 def clusterkmeans(self):
     wh = whiten(self.counts) #normalizes the counts for easier clustering
     scale = self.counts[0] / wh[0]
     #compute kmeans for  k = 1,2 compare the distortions and choose the better one
     one = kmeans(wh, 1)
     two = kmeans(wh, 2)
     if one[1] < two[1]:
         print 'found only one cluster'
         threshold = None
     else:
         km = two
         threshold = scale * km[0].mean() #set threshold to be the average of two centers
     return threshold
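The same decision rule in isolation: cluster a 1-D signal with k = 1 and k = 2 and only keep a threshold when two clusters genuinely fit better than one. A minimal sketch on synthetic counts, assuming nothing beyond NumPy and SciPy:

import numpy as np
from scipy.cluster.vq import whiten, kmeans

# Synthetic counts: a large low population plus a smaller high population.
counts = np.concatenate([np.random.normal(5, 1, 200), np.random.normal(60, 5, 50)])

wh = whiten(counts)            # normalize to unit variance
scale = counts[0] / wh[0]      # factor to map whitened centers back to raw counts

one = kmeans(wh, 1)            # (codebook, distortion) with a single cluster
two = kmeans(wh, 2)
if one[1] < two[1]:
    threshold = None           # one cluster already explains the data better
else:
    threshold = scale * two[0].mean()   # average of the two centers, in raw units
print(threshold)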
Example #11
    def test_kmeans_lost_cluster(self):
        # This will cause kmeans to have a cluster with no points.
        data = TESTDATA_2D
        initk = np.array([[-1.8127404, -0.67128041],
                         [2.04621601, 0.07401111],
                         [-2.31149087,-0.05160469]])

        with suppress_warnings() as sup:
            sup.filter(UserWarning,
                       "One of the clusters is empty. Re-run kmean with a different initialization")
            kmeans(data, initk)
            kmeans2(data, initk, missing='warn')

        assert_raises(ClusterError, kmeans2, data, initk, missing='raise')
Example #12
def cluster_points(coord_points, N):
    """
    Function that returns k which is an nx2 matrix of lon-lat vector columns
    containing the optimal cluster centroid spacings within a large set of random
    numbers e.g. those produced by the many_points() function above!
    """
    return kmeans(coord_points, N)[0]
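A quick usage sketch; random lon-lat pairs stand in for the many_points() output mentioned in the docstring, which is not shown here:

import numpy as np

coord_points = np.random.uniform(low=[-10.0, 50.0], high=[-5.0, 55.0], size=(500, 2))
centroids = cluster_points(coord_points, 4)   # 4 x 2 array of lon-lat cluster centers
print(centroids)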
Example #13
def createdatabase():
	X_train = detectcompute(train1)

	print "Clustering the data with K-means"
	codebook,distortion = kmeans(whiten(X_train),k)
	print "Done.\n"
	
	imtrain = singledetect(test1)
	Pdatabase = bow(imtrain,codebook,k) #Pseudo database with list structure


	#Writing to html.table
	print "Converting the database into a HTML file"
	htmltable = open("table.htm","r+") 
	begin = "<htm><body><table cellpadding=5><tr><th>Filename</th><th>Histogram</th></tr>"
	htmltable.write(begin)

	for i in range(len(Pdatabase)):
	    middle = "<tr><td>%(filename)s</td><td>%(histogram)s</td></tr>" % {"filename": Pdatabase[i][0], "histogram": Pdatabase[i][-1]}
	    htmltable.write(middle)

	end = "</table></body></html>"    
	htmltable.write(end)
	htmltable.close()
	print "Done.\n"

	codebook_to_file(codebook)
Example #14
def kmeans(features, projection, ite = 50, k = 4, threshold = 1e-5):    
    """ perform k_keamns clustering and return a the result as a subsapce clustering object """
    from scipy.cluster.vq import kmeans, vq
    import datetime

    from measures import spatial_coherence    
   
    centroids, distance = kmeans(features, k, iter=ite, thresh=threshold)
    code, _ = vq(features, centroids)
    
    run_ = datetime.datetime.now().strftime("%y_%m_%d_%H_%M")
    
    params = "projection_size=%d, k=%d" %(len(projection), k)
    clusters = clusters_from_code(code, k, projection)
  
    clustering_id = "(%s)_(%s)_(%s)_(%s)" %("exhaustive_kmeans", params, run_, projection)
    #print clustering_id
    km_clt = KMClustering(algorithm ="exhaustive_kmeans", parameters = params, run = run_,
                          clustering_id = clustering_id, clusters = clusters, ccontains_noise = False, cclustering_on_dimension = True)

   
    measures = {'spatial_coherence': spatial_coherence(km_clt, len(features))[0], 'distortion': distance}
    km_clt.update_measures(measures)
    
    return  km_clt 
def run_kmeans(whitened, k=3):
    book = list()
    for i in range(k):
        book.append(whitened[i])
    codebook, distortion = kmeans(whitened, array(book))
    # codebook, distortion = kmeans(whitened, k)
    return codebook
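Usage sketch for run_kmeans(): it expects already-whitened data and seeds the codebook with the first k rows (assuming the function's own array/kmeans imports are in scope):

import numpy as np
from scipy.cluster.vq import whiten

data = np.random.rand(200, 2)
codebook = run_kmeans(whiten(data), k=3)
print(codebook)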
Example #16
 def worldplot(self,kmeans=None,proj='merc'):
     """
     plots customer GPS location on a map with state and national boundaries.
     IN
         kmeans (int) number of means for k-means clustering, default=None
         proj (string) the map projection to use, use 'robin' to plot the whole earth, default='merc'
     """
     # create a matplotlib Basemap object
     if proj == 'robin':
         my_map = Basemap(projection=proj,lat_0=0,lon_0=0,resolution='l',area_thresh=1000)
     else:
         my_map = Basemap(projection=proj,lat_0=33.,lon_0=-125.,resolution='l',area_thresh=1000.,
                 llcrnrlon=-130.,llcrnrlat=25,urcrnrlon=-65., urcrnrlat=50)
     my_map.drawcoastlines(color='grey')
     my_map.drawcountries(color='grey')
     my_map.drawstates(color='grey')
     my_map.drawlsmask(land_color='white',ocean_color='white')
     my_map.drawmapboundary() #my_map.fillcontinents(color='black')
     x,y = my_map(np.array(self.data['lon']),np.array(self.data['lat']))
     my_map.plot(x,y,'ro',markersize=3,alpha=.4,linewidth=0)
     if kmeans:
         # k-means clustering algorithm---see wikipedia for details
         data_in = self.data.drop(['id','clv','level'],axis=1)
         # vq is scipy's vector quantization module
         output,distortion = vq.kmeans(data_in,kmeans)
         x1,y1 = my_map(output[:,1],output[:,0])
         my_map.plot(x1,y1,'ko',markersize=20,alpha=.4,linewidth=0)
     plt.show()
     return output
def classify_kmeans(infile, clusternumber):
    '''
    apply kmeans
    '''
    
    #Load infile in data array    
    driver = gdal.GetDriverByName('GTiff')
    driver.Register()
    ds = gdal.Open(infile, gdal.GA_Update)
    databand = ds.GetRasterBand(1)
    
    #Read input raster into array
    data = ds.ReadAsArray() 
    #replace no data value with numpy.nan
    #data[data==-999.0]=numpy.nan 
    
    pixel = numpy.reshape(data,(data.shape[0]*data.shape[1]))
    centroids, variance = kmeans(pixel, clusternumber)
    code, distance = vq(pixel,centroids)
    centers_idx = numpy.reshape(code,(data.shape[0],data.shape[1]))
    clustered = centroids[centers_idx]
    
    # Write outraster to file
    databand.WriteArray(clustered)
    databand.FlushCache()        
    
    #Close file
    databand = None
    clustered = None
    ds = None  
Example #18
def kmeans(iData, clustNumber, oPrefix, norm=False):
    '''Perform k-means cluster analysis and return MAP of zones'''
    print 'Run K-Means'
    
    height, width = iData.shape[1:3]
    #reshape 3D cube of data into 2D matrix and get indices of valid pixels
    iData, notNanDataI = cube2flat(iData)
    if norm:
        #center and norm
        iDataMean = iData[:, notNanDataI].mean(axis=1)
        iDataStd  = iData[:, notNanDataI].std(axis=1)
        iData = np.subtract(iData.T, iDataMean).T
        iData = np.divide(iData.T, iDataStd).T

    #perform kmeans on valid data and return codebook
    codeBook = vq.kmeans(iData[:, notNanDataI].astype('f8').T, clustNumber)[0]
    #perform vector quantization of the input data using the codebook
    #return vector of labels (for each valid pixel)
    labelVec = vq.vq(iData[:, notNanDataI].astype('f8').T, codeBook)[0]+1
    #create and fill MAP of zones
    zoneMap = np.zeros(width*height) + np.nan
    zoneMap[notNanDataI] = labelVec
    zoneMap = zoneMap.reshape(height, width)
    
    #visualize map of zones
    plt.imsave(oPrefix + 'zones.png', zoneMap)
    
    return zoneMap
Example #19
    def test_large_features(self):
        # Generate a data set with large values, and run kmeans on it
        # (regression test for gh-1077).
        d = 300
        n = 100

        m1 = np.random.randn(d)
        m2 = np.random.randn(d)
        x = 10000 * np.random.randn(n, d) - 20000 * m1
        y = 10000 * np.random.randn(n, d) + 20000 * m2

        data = np.empty((x.shape[0] + y.shape[0], d), np.double)
        data[:x.shape[0]] = x
        data[x.shape[0]:] = y

        kmeans(data, 2)
Example #20
def read_unclustered_data(filename, num_clusters, cl_type="kMeans"):
    """Return dictionary of cluster id to array of points.

    Given a filename in the format of lat, lng
    generate k clusters based on arguments. Outputs a dictionary with
    the cluster id as the key mapped to a list of lat, lng pts
    """
    request_points = []
    with open(filename, 'rb') as input_file:
        input_file.next()  # Skip the header row
        for line in input_file:
            lat, lng = line.split(',')
            request_points.append((float(lat), float(lng)))
    request_points = array(request_points)

    if cl_type == "kMeans":
        # computing K-Means with K = num_clusters
        centroids, _ = kmeans(request_points, int(num_clusters))
        # assign each sample to a cluster
        idx, _ = vq(request_points, centroids)

    else:
        # computing k-medoids using the distance matrix
        centroids = get_kmedoids(request_points, int(num_clusters))
        # assign each sample to a cluster
        idx, _ = vq(request_points, centroids)

    # map cluster lat, lng to cluster index
    cluster_points = defaultdict(list)
    for i in xrange(len(request_points)):
        lat, lng = request_points[i]
        cluster_points[idx[i]].append((lat, lng))
    return cluster_points
def getPupilThresholdWithClustering(gray,K=2, distanceWeight=2, resizeTo=(40,40)):
    ''' Detects the pupil in the image, gray, using k-means
        gray            : gray scale image
        K               : Number of clusters
        distanceWeight  : Defines the weight of the position parameters
        resizeTo        : the size of the image to do k-means on
    '''
    
    smallI = cv2.resize(gray, resizeTo)

    M,N = smallI.shape
    #Generate coordinates in a matrix
    X,Y = np.meshgrid(range(M),range(N))

    #Make the coordinates and intensity into vectors
    z = smallI.flatten()
    x = X.flatten()
    y = Y.flatten()

    # make a feature vector containing (x, y, intensity)
    features = np.zeros((len(x),3))
    features[:,0] = z
    features[:,1] = y/distanceWeight #Divide so that the distance of position weighs less than intensity
    features[:,2] = x/distanceWeight
    features = np.array(features,'f')

    # cluster data
    centroids,variance = vq.kmeans(features,K)

    plotClusters(centroids, features, M, N)

    centroidsByPupilCandidacy = sorted(centroids, key = lambda c: evaluateCentroid(c, resizeTo))
    
    return centroidsByPupilCandidacy[-1][0] + 10
def performMCCAlgorithm(dataSet, specificDataPointIndex, numIterations = 200, numClusters = 4, subDataRatio = 0.5):
	periodsAhead = np.array([1, 2, 3, 4, 5, 6, 9, 12, 18, 24, 36, 60, 120])
	strippedDataSet = dataSet
	dataLength = strippedDataSet.shape[0]
	dataWidth = strippedDataSet.shape[1]
	specificDataPoint = strippedDataSet[specificDataPointIndex,:]

	numPeriods = len(periodsAhead)

	statisticWeightsbyIteration = np.empty(shape=(numIterations, 4),dtype=float)

	# Perform Bootstrapped Clustering
	for i in range(0,numIterations):
		# Perform Bootstrapped Clustering / Choose Data Subset
		subDataSetIndexes = np.random.choice(range(0,dataLength),size=int(dataLength*subDataRatio),replace=True)
		subDataSet = strippedDataSet[subDataSetIndexes,:]
		# Perform Bootstrapped Clustering / Find Data Clusters for Subset of Data
		kMeansClusters = spc.kmeans(subDataSet, numClusters)
		clusterCenters = kMeansClusters[0]
		# Perform Bootstrapped Clustering / Record Clustering Cost for Weighting Scheme
		clusteringCost = kMeansClusters[1]
		statisticWeightsbyIteration[i,0] = clusteringCost
		# Perform Bootstrapped Clustering / Apply Found Data Clusters to All Data
		allClusters = spc.vq(strippedDataSet, clusterCenters)
		clusterAssignments = allClusters[0]
		clusterDistortions = allClusters[1]
		display = 1 #TEST
		if display: #TEST
			plt.scatter(dataSet[0:60,0],dataSet[0:60,1],c=clusterAssignments[0:60]) #TEST	
			plt.show()	
		statisticWeightsbyIteration[i,1] = max(clusterDistortions)
		statisticWeightsbyIteration[i,2] = np.mean(clusterDistortions)
		statisticWeightsbyIteration[i,3] = np.std(clusterDistortions)
	return statisticWeightsbyIteration
Example #23
def create_code_book(input_filename, num_clusters, num_observations,
                     feature_list=utils.QUIVER_FEATURES):
    """Create a code book from a cmp.h5 file.
    
    Args:
        input_filename: path to the SAM or cmp.h5 file
        num_clusters: the number of codes to create in the code book
        num_observations: the number of bases to use to create the code book
            clusters
        feature_list: the list of features to read from the cmp.h5 to 
            cluster

    Returns:
        code_book: a numpy array of cluster centers. rows are codes, columns are
            features
        feature_list: labels for the columns of the code book
    """

    log.debug("Checking for missing features...")
    
    if input_filename.endswith(".cmp.h5"):
        training_array = read_cmph5(input_filename, feature_list, num_observations)
    elif input_filename.endswith(".sam") or input_filename.endswith(".bam"):
        training_array = read_sam(input_filename, feature_list, num_observations)
    else:
        raise RuntimeError, "Input file must be SAM, BAM, or cmp.h5"


    clusterable_array, std_dev = make_data_clusterable(training_array,
                                                       feature_list)
    code_book, distortion = vq.kmeans(clusterable_array, num_clusters)

    raw_code_book = convert_to_raw(code_book, feature_list, std_dev)
    return raw_code_book, feature_list
Example #24
  def train(self, featurefiles, k=100, subsampling=10):
    """Train a vocabulary from features in files listed in |featurefiles| using
    k-means with k words. Subsampling of training data can be used for speedup.
    """
    image_count = len(featurefiles)

    descr = []
    descr.append(sift.read_features_from_file(featurefiles[0])[1])
    descriptors = descr[0]  # Stack features for k-means.
    for i in numpy.arange(1, image_count):
      descr.append(sift.read_features_from_file(featurefiles[i])[1])
      descriptors = numpy.vstack((descriptors, descr[i]))

    # Run k-means.
    self.voc, distortion = vq.kmeans(descriptors[::subsampling, :], k, 1)
    self.word_count = self.voc.shape[0]

    # Project training data on vocabulary.
    imwords = numpy.zeros((image_count, self.word_count))
    for i in range(image_count):
      imwords[i] = self.project(descr[i])

    occurence_count = numpy.sum((imwords > 0)*1, axis=0)
    
    self.idf = numpy.log(image_count / (occurence_count + 1.0))
    self.trainingdata = featurefiles
Example #25
def connected_regions(image):
    """
    Converts image into grayscale, quantizes, counts connected regions
    """
    # render_image(image)

    colors = 2

    # Quantization into two colors
    image_rgb = np.dstack(image)
    pixels = np.reshape(
        image_rgb,
        (image_rgb.shape[0] * image_rgb.shape[1], image_rgb.shape[2])
    )
    centroids, _ = vq.kmeans(pixels, colors)
    quantized, _ = vq.vq(pixels, centroids)
    quantized_idx = quantized.reshape(
        (image_rgb.shape[0], image_rgb.shape[1])
    )

    if len(centroids) > 1:
        # for_render = (quantized_idx * 255).astype(np.uint8)
        # render_image(for_render)
        regions = len(region_sizes(quantized_idx))
        regions_inverted = len(region_sizes(1 - quantized_idx))
        # import pdb; pdb.set_trace()

        # if regions == 0:
        #     regions = image[0].shape[0] * image[0].shape[1]
        # print regions
        return max([regions, regions_inverted])
    else:
        return 0
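The quantization step on its own, as a sketch with a synthetic RGB array in place of the channel stack used above:

import numpy as np
import scipy.cluster.vq as vq

rgb = np.random.rand(64, 64, 3)                   # synthetic image, values in [0, 1]
pixels = rgb.reshape(-1, 3)                       # one row per pixel
centroids, _ = vq.kmeans(pixels, 2)               # two dominant colors
quantized, _ = vq.vq(pixels, centroids)           # per-pixel color index
quantized_idx = quantized.reshape(rgb.shape[:2])  # back to image shape
print(np.bincount(quantized))                     # pixel count per color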
Example #26
def main():
    args = get_args()
    # This catches files sent in with stdin
    if isinstance(args.infile, TextIOWrapper):
        data = JSONFile(args.infile, True)
    else:
        data = args.infile

    points = np.array([
        [point.get('lon'), point.get('lat')]
        for point in data
    ])

    # In testing, a higher number of iterations led to fewer errors
    # due to missing centroids (note: whitening led to worse results)
    centroids, distortion = kmeans(points, args.number_of_vans, 2000)
    index, distortion = vq(points, centroids)

    vans = [[] for _ in range(args.number_of_vans)]

    for i, point in enumerate(data):
        vans[index[i]].append(point)

    vans = distribute(vans, len(data), centroids)


    create_output(args.outfile, vans)
def build_cluster(image, featureValue, K):
    img = cv2.imread(image)
    fast = cv2.FastFeatureDetector(featureValue)
    orb = cv2.ORB(180)
    kp = fast.detect(img,None)
    kp, des = orb.compute(img, kp)

    # build keypoints location array for cluster
    locations = np.empty((len(kp),2))
    for i in range(len(kp)):
        loc = array((int(kp[i].pt[0]), int(kp[i].pt[1])))
        locations[i]=loc

    kcenters, distortion  = kmeans(locations, K)
    kcenters = kcenters[kcenters[:,0].argsort()]

    # cluster index: 0: left eye, 1 mouth and nose, 2: right eye
    kpCluster = {i: [] for i in range(K)}
    clusterLoc = {i: [] for i in range(K)}
    for i in range(len(kp)):
        set = 0
        minDis = sys.maxint
        for j in range(K):
            dis = euclidean(locations[i], kcenters[j])
            if dis<minDis:
                set = j
                minDis = dis
        kpCluster[set].append(kp[i])
        clusterLoc[set].append(locations[i])

    imageFeature = [len(kp)]
    for i in range(K):
        clusterFeature = cluster_feature(clusterLoc[i], kcenters[i])
        imageFeature = imageFeature + clusterFeature
    return imageFeature
Example #28
def spectral_clustering(W, k):

    # ====================== ADD YOUR CODE HERE ======================
    # Instructions: Perform spectral clustering to partition the
    #               data into k clusters. Implement the steps that
    #               are described in Algorithm 2 on the assignment.

    L = diag(sum(W, axis=0)) - W
    w, v = linalg.eig(L)

    y = real(v[:, w.argsort()[:k]])

    clusters, _ = kmeans(y, k)

    labels = zeros(y.shape[0])
    for i in range(y.shape[0]):
        dist = inf
        for j in range(k):
            distance = euclideanDistance(y[i], clusters[j])
            if distance < dist:
                dist = distance
                labels[i] = j
    # =============================================================

    return labels
Example #29
def main():    
    gdal.AllRegister()
    infile = auxil.select_infile() 
    if infile:                  
        inDataset = gdal.Open(infile,GA_ReadOnly)     
        cols = inDataset.RasterXSize
        rows = inDataset.RasterYSize    
        bands = inDataset.RasterCount
    else:
        return    
    pos =  auxil.select_pos(bands)
    bands = len(pos)    
    x0,y0,rows,cols=auxil.select_dims([0,0,rows,cols])   
    K = auxil.select_integer(6,msg='Number clusters')        
    G = zeros((rows*cols,len(pos))) 
    k = 0                                   
    for b in pos:
        band = inDataset.GetRasterBand(b)
        G[:,k] = band.ReadAsArray(x0,y0,cols,rows)\
                              .astype(float).ravel()
        k += 1        
    centers, _ = kmeans(G,K)
    labels, _ = vq(G,centers)      
    outfile,fmt = auxil.select_outfilefmt() 
    if outfile:
        driver = gdal.GetDriverByName(fmt)   
        outDataset = driver.Create(outfile,
                        cols,rows,1,GDT_Byte)         
        outBand = outDataset.GetRasterBand(1)
        outBand.WriteArray(reshape(labels,(rows,cols))\
                                              ,0,0) 
        outBand.FlushCache() 
        outDataset = None    
    inDataset = None        
Example #30
def kmeans(X, K):
    """
    kmeans to find clusers:
    x: dataset
    k: num of clusters
    #todo: this implements just for 2 cluster, initilization need to re-implement
    """
    ret = {"mean": [], "cov": [], "coff": []}
    kmean_ret = vq.kmeans(X, K)

    ##assign data to cluster to calculate covariance
    data = []

    for i in range(0, K, 1):
        data.append([])

    for i in range(0, X.shape[0], 1):
        # assign each point to its nearest centroid
        min_dis = None
        min_idx = -1
        for j in range(0, K, 1):
            _dis = ((X[i] - kmean_ret[0][j]) ** 2).sum()
            if min_dis is None or _dis < min_dis:
                min_dis = _dis
                min_idx = j
        data[min_idx].append(X[i])

    for i in range(0, K, 1):
        data[i] = np.asarray(data[i])
        ret["cov"].append(np.cov(data[i].transpose()))
        ret["mean"].append(kmean_ret[0][i])
        ret["coff"].append(float(data[i].size / 2) / X.shape[0])

    return ret
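Usage sketch for the wrapper above, on two well-separated synthetic blobs; it assumes the wrapper and its `np`/`vq` imports are in scope:

import numpy as np

X = np.vstack([np.random.randn(100, 2),
               np.random.randn(100, 2) + [6.0, 6.0]])
ret = kmeans(X, 2)      # the wrapper defined above, not scipy's kmeans
print(ret["mean"])      # the two cluster centers
print(ret["coff"])      # fraction of points assigned to each cluster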
from scipy.cluster.vq import kmeans, whiten
from numpy import genfromtxt, zeros
from matplotlib import pyplot as plt

consolidated_data = genfromtxt('../new_consolidated_data.csv',
                               delimiter=',',
                               skip_header=1)

features = consolidated_data[:, 2:-13]

whitened = whiten(features)

k_distortion = zeros(50)

for k in range(1, 51):
    centroids, distortion = kmeans(whitened, k)
    k_distortion[k - 1] = distortion

fig, ax = plt.subplots()
plt.plot(range(1, 51), k_distortion)
plt.xlabel("Number of clusters")
plt.ylabel("Distortion")
plt.savefig("output/states_all_year_elbow.svg")
plt.show()
Example #32
    def testCluster(self):
        print "< testCluster >"
        numVertices = 8
        graph = SparseGraph(GeneralVertexList(numVertices))

        graph.addEdge(0, 1)
        graph.addEdge(0, 2)
        graph.addEdge(1, 2)

        graph.addEdge(3, 4)
        graph.addEdge(3, 5)
        graph.addEdge(4, 5)

        graph.addEdge(0, 3)

        W = graph.getWeightMatrix()

        graphIterator = []
        graphIterator.append(W[0:6, 0:6].copy())
        W[1, 6] += 1
        W[6, 1] += 1
        graphIterator.append(W[0:7, 0:7].copy())
        W[4, 7] += 1
        W[7, 4] += 1
        graphIterator.append(W.copy())
        graphIterator = iter(graphIterator)

        k = 2
        clusterer = NingSpectralClustering(k)
        clustersList = clusterer.cluster(
            toSparseGraphListIterator(graphIterator))

        #Why are the bottom rows of Q still zero?

        #Try example in which only edges change
        numVertices = 7
        graph = SparseGraph(GeneralVertexList(numVertices))

        graph.addEdge(0, 1)
        graph.addEdge(0, 2)
        graph.addEdge(1, 2)

        graph.addEdge(3, 4)

        WList = []
        W = graph.getWeightMatrix()
        WList.append(W[0:5, 0:5].copy())

        graph.addEdge(3, 5)
        graph.addEdge(4, 5)
        W = graph.getWeightMatrix()
        WList.append(W[0:6, 0:6].copy())

        graph.addEdge(0, 6)
        graph.addEdge(1, 6)
        graph.addEdge(2, 6)
        W = graph.getWeightMatrix()
        WList.append(W[0:7, 0:7].copy())

        iterator = iter(WList)
        clustersList = clusterer.cluster(toSparseGraphListIterator(iterator))

        #Seems to work, amazingly
        #print(clustersList)

        #Try removing rows/cols
        W2 = W[0:5, 0:5]
        W3 = W[0:4, 0:4]
        WList = [W, W2, W3]
        iterator = iter(WList)
        clustersList = clusterer.cluster(toSparseGraphListIterator(iterator))

        #nptst.assert_array_equal(clustersList[0][0:5], clustersList[1])
        nptst.assert_array_equal(clustersList[1][0:4], clustersList[2])

        #Make sure 1st clustering (without updates) is correct
        L = GraphUtils.normalisedLaplacianRw(scipy.sparse.csr_matrix(W))
        numpy.random.seed(21)
        lmbda, Q = scipy.sparse.linalg.eigs(L,
                                            min(k, L.shape[0] - 1),
                                            which="SM",
                                            ncv=min(20 * k, L.shape[0]),
                                            v0=numpy.random.rand(L.shape[0]))

        V = VqUtils.whiten(Q)
        centroids, distortion = vq.kmeans(V, k, iter=20)
        clusters, distortion = vq.vq(V, centroids)

        #This should be equal but the eigenvector computation is unstable
        #even with repeated runs (and no way to set the seed)
        nptst.assert_array_equal(clusters, clustersList[0])
Example #33
def kmean_anchors(path='../data/coco128.yaml', n=9, img_size=640, thr=4.0, gen=1000, verbose=True):
    """ Creates kmeans-evolved anchors from training dataset

        Arguments:
            path: path to dataset *.yaml, or a loaded dataset
            n: number of anchors
            img_size: image size used for training
            thr: anchor-label wh ratio threshold hyperparameter hyp['anchor_t'] used for training, default=4.0
            gen: generations to evolve anchors using genetic algorithm
            verbose: print all results

        Return:
            k: kmeans evolved anchors

        Usage:
            from utils.autoanchor import *; _ = kmean_anchors()
    """
    thr = 1. / thr

    def metric(k, wh):  # compute metrics
        r = wh[:, None] / k[None]
        x = torch.min(r, 1. / r).min(2)[0]  # ratio metric
        # x = wh_iou(wh, torch.tensor(k))  # iou metric
        return x, x.max(1)[0]  # x, best_x

    def anchor_fitness(k):  # mutation fitness
        _, best = metric(torch.tensor(k, dtype=torch.float32), wh)
        return (best * (best > thr).float()).mean()  # fitness

    def print_results(k):
        k = k[np.argsort(k.prod(1))]  # sort small to large
        x, best = metric(k, wh0)
        bpr, aat = (best > thr).float().mean(), (x > thr).float().mean() * n  # best possible recall, anch > thr
        print('thr=%.2f: %.4f best possible recall, %.2f anchors past thr' % (thr, bpr, aat))
        print('n=%g, img_size=%s, metric_all=%.3f/%.3f-mean/best, past_thr=%.3f-mean: ' %
              (n, img_size, x.mean(), best.mean(), x[x > thr].mean()), end='')
        for i, x in enumerate(k):
            print('%i,%i' % (round(x[0]), round(x[1])), end=',  ' if i < len(k) - 1 else '\n')  # use in *.cfg
        return k

    if isinstance(path, str):  # *.yaml file
        with open(path) as f:
            data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
        from utils.datasets import LoadImagesAndLabels
        dataset = LoadImagesAndLabels(data_dict['train'], augment=True, rect=True)
    else:
        dataset = path  # dataset

    # Get label wh
    shapes = img_size * dataset.shapes / dataset.shapes.max(1, keepdims=True)
    wh0 = np.concatenate([l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)])  # wh

    # Filter
    i = (wh0 < 3.0).any(1).sum()
    if i:
        print('WARNING: Extremely small objects found. '
              '%g of %g labels are < 3 pixels in width or height.' % (i, len(wh0)))
    wh = wh0[(wh0 >= 2.0).any(1)]  # filter > 2 pixels
    # wh = wh * (np.random.rand(wh.shape[0], 1) * 0.9 + 0.1)  # multiply by random scale 0-1

    # Kmeans calculation
    print('Running kmeans for %g anchors on %g points...' % (n, len(wh)))
    s = wh.std(0)  # sigmas for whitening
    k, dist = kmeans(wh / s, n, iter=30)  # points, mean distance
    k *= s
    wh = torch.tensor(wh, dtype=torch.float32)  # filtered
    wh0 = torch.tensor(wh0, dtype=torch.float32)  # unfiltered
    k = print_results(k)

    # Evolve
    npr = np.random
    f, sh, mp, s = anchor_fitness(k), k.shape, 0.9, 0.1  # fitness, generations, mutation prob, sigma
    pbar = tqdm(range(gen), desc='Evolving anchors with Genetic Algorithm')  # progress bar
    for _ in pbar:
        v = np.ones(sh)
        while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
        kg = (k.copy() * v).clip(min=2.0)
        fg = anchor_fitness(kg)
        if fg > f:
            f, k = fg, kg.copy()
            pbar.desc = 'Evolving anchors with Genetic Algorithm: fitness = %.4f' % f
            if verbose:
                print_results(k)

    return print_results(k)
Example #34
def apply_kmeans(box_dict, k):
    # for every object class in the box_dict
    #     reduce the list of boxes to the clustered boxes with kmeans
    # return the new dictionary
    kmeans_dict = dict()
    for obj_class in box_dict:
        print obj_class
        boxes = box_dict[obj_class]
        if len(boxes) > k:
            # write a representation for each proposal box as a vector
            def box_to_vec(pbox):
                # list of metrics which we want to reduce the Euclidean distance of:
                # includes centroid, and each of the individual coordinates of the box,
                # which are used to recover box coordinates after the k-means in vector representation
                # are found. To weight the impact of the centroid measure,
                # we multiply by 1/area: the centroid matters less as box area increases.
                # we also include the coordinates, since distances between them are relevant as well.
                # Note that including the original coordinates in the vector allows us to recover the
                # original representation of the box.
                # we also include the score (scaled down) for the same reason. We scale it down since score-space
                # should not really affect the distance between boxes (having similar scores is not necessarily a good reason
                # to combine or not)
                metrics = [
                    pbox.centroid()[0],
                    pbox.centroid()[1],
                    pbox.centroid()[0] / pbox.area(),
                    pbox.centroid()[1] / pbox.area(), pbox.x1, pbox.y1,
                    pbox.x2, pbox.y2, 0.00001 * pbox.score
                ]
                return metrics

            # we will append the columns together and then take transpose
            # so that each row is a box with n features (here n = 9)
            first_col = box_to_vec(boxes[0])
            # for rescaling
            oldx1, oldy1, oldx2, oldy2, oldscore = first_col[4], first_col[
                5], first_col[6], first_col[7], first_col[8]
            first_col = np.array(first_col)
            first_col = first_col.T
            box_mat = first_col
            for i in range(1, len(boxes)):
                new_col = np.array(box_to_vec(boxes[i]))
                new_col = new_col.T
                box_mat = np.c_[box_mat, new_col]
            box_mat = box_mat.T
            box_mat = box_mat.astype('float')
            # whiten
            box_mat = whiten(box_mat)
            # need to rescale the coords when we recover the boxes from the representation vectors
            newx1, newy1, newx2, newy2, newscore = 0, 0, 0, 0, 0
            if len(np.shape(box_mat)) > 1:
                newx1, newy1, newx2, newy2, newscore = box_mat[0][4], box_mat[
                    0][5], box_mat[0][6], box_mat[0][7], box_mat[0][8]
            else:
                newx1, newy1, newx2, newy2, newscore = box_mat[4], box_mat[
                    5], box_mat[6], box_mat[7], box_mat[8]
            scalex1, scaley1, scalex2, scaley2, scalescore = oldx1 / (
                0. + newx1), oldy1 / (0. + newy1), oldx2 / (
                    0. + newx2), oldy2 / (0. + newy2), oldscore / (0. +
                                                                   newscore)
            # use k-means
            codebook, distortion = kmeans(box_mat, k)
            centroid_boxes = []
            for i in range(np.shape(codebook)[0]):
                # we chop off from 4 onwards because these are (pbox.x1, pbox.y1, pbox.x2, pbox.y2, pbox.score)
                # this is a direct inverse from box_to_vec
                # need to multiply these coords by standard deviations across all instances of feature.
                thebox = box(scalex1 * codebook[i][4],
                             scaley1 * codebook[i][5],
                             scalex2 * codebook[i][6],
                             scaley2 * codebook[i][7],
                             scalescore * codebook[i][8])
                centroid_boxes.append(thebox)
            print "# of centroids: " + str(len(centroid_boxes))
            print centroid_boxes[0]
            print centroid_boxes[1]
            print centroid_boxes[2]
            if obj_class not in kmeans_dict:
                kmeans_dict[obj_class] = []
            kmeans_dict[obj_class] = centroid_boxes
        else:
            kmeans_dict[obj_class] = box_dict[obj_class]
        print "==================================="
    return kmeans_dict
Example #35
def kmean_anchors(path='./data/coco128.yaml', n=9, img_size=640, thr=4.0, gen=1000, verbose=True):
    """ Creates kmeans-evolved anchors from training dataset

        Arguments:
            path: path to dataset *.yaml, or a loaded dataset
            n: number of anchors
            img_size: image size used for training
            thr: anchor-label wh ratio threshold hyperparameter hyp['anchor_t'] used for training, default=4.0
            gen: generations to evolve anchors using genetic algorithm

        Return:
            k: kmeans evolved anchors

        Usage:
            from utils.utils import *; _ = kmean_anchors()
    """
    thr = 1. / thr

    def metric(k):  # compute metrics
        r = wh[:, None] / k[None]
        x = torch.min(r, 1. / r).min(2)[0]  # ratio metric
        # x = wh_iou(wh, torch.tensor(k))  # iou metric
        return x, x.max(1)[0]  # x, best_x

    def fitness(k):  # mutation fitness
        _, best = metric(k)
        return (best * (best > thr).float()).mean()  # fitness

    def print_results(k):
        k = k[np.argsort(k.prod(1))]  # sort small to large
        x, best = metric(k)
        bpr, aat = (best > thr).float().mean(), (x > thr).float().mean() * n  # best possible recall, anch > thr
        print('thr=%.2f: %.3f best possible recall, %.2f anchors past thr' % (thr, bpr, aat))
        print('n=%g, img_size=%s, metric_all=%.3f/%.3f-mean/best, past_thr=%.3f-mean: ' %
              (n, img_size, x.mean(), best.mean(), x[x > thr].mean()), end='')
        for i, x in enumerate(k):
            print('%i,%i' % (round(x[0]), round(x[1])), end=',  ' if i < len(k) - 1 else '\n')  # use in *.cfg
        return k

    if isinstance(path, str):  # *.yaml file
        with open(path) as f:
            data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
        from utils.datasets import LoadImagesAndLabels
        dataset = LoadImagesAndLabels(data_dict['train'], augment=True, rect=True)
    else:
        dataset = path  # dataset

    # Get label wh
    shapes = img_size * dataset.shapes / dataset.shapes.max(1, keepdims=True)
    wh = torch.tensor(np.concatenate([l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)])).float()  # wh
    wh = wh[(wh > 2.0).all(1)].numpy()  # filter > 2 pixels

    # Kmeans calculation
    from scipy.cluster.vq import kmeans
    print('Running kmeans for %g anchors on %g points...' % (n, len(wh)))
    s = wh.std(0)  # sigmas for whitening
    k, dist = kmeans(wh / s, n, iter=30)  # points, mean distance
    k *= s
    wh = torch.tensor(wh)
    k = print_results(k)

    # Plot
    # k, d = [None] * 20, [None] * 20
    # for i in tqdm(range(1, 21)):
    #     k[i-1], d[i-1] = kmeans(wh / s, i)  # points, mean distance
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7))
    # ax = ax.ravel()
    # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.')
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7))  # plot wh
    # ax[0].hist(wh[wh[:, 0]<100, 0],400)
    # ax[1].hist(wh[wh[:, 1]<100, 1],400)
    # fig.tight_layout()
    # fig.savefig('wh.png', dpi=200)

    # Evolve
    npr = np.random
    f, sh, mp, s = fitness(k), k.shape, 0.9, 0.1  # fitness, generations, mutation prob, sigma
    for _ in tqdm(range(gen), desc='Evolving anchors with Genetic Algorithm:'):
        v = np.ones(sh)
        while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
        kg = (k.copy() * v).clip(min=2.0)
        fg = fitness(kg)
        if fg > f:
            f, k = fg, kg.copy()
            if verbose:
                print_results(k)
    k = print_results(k)
    return k
Example #36
File: main.py  Project: mappp7/tools
    def initialize(self,
                   poses,
                   rest_pose,
                   num_bones,
                   iterations,
                   mayaMesh=None,
                   jointList=None):

        bones = []
        num_verts = rest_pose.shape[0]  # number of vertices in the rest pose
        num_poses = poses.shape[0]

        bone_transforms = np.empty(
            (num_bones, num_poses, 4,
             3))  # [(R, T) for for each pose] for each bone
        # 3rd dim has 3 rows for R and 1 row for T

        # Use k-means to assign bones to vertices
        whitened = whiten(rest_pose)
        codebook, _ = kmeans(whitened, num_bones)
        rest_pose_corrected = np.empty(
            (num_bones, num_verts,
             3))  # Rest pose - mean of vertices attached to each bone

        # confirm mode
        if mayaMesh:
            #rigid Skin
            vert_assignments, bones = self.manual_codebook(mayaMesh, jointList)
            boneArray = []
            for i in bones:
                boneArray.append(cmds.xform(i, q=1, t=1, ws=1))

            self.rest_bones_t = np.array(boneArray)
            #rest_bones_t = np.empty((num_bones , 3))

            for bone in range(num_bones):
                #rest_bones_t[bone] = np.mean(rest_pose[vert_assignments == bone] , axis = 0)
                self.rest_bones_t[bone] = np.array(boneArray[bone])
                rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone]

                for pose in range(num_poses):

                    bone_transforms[bone, pose] = self.kabsch(
                        rest_pose_corrected[bone, vert_assignments == bone],
                        poses[pose, vert_assignments == bone])

        else:
            # Compute initial random bone transformations

            vert_assignments, _ = vq(
                whitened,
                codebook)  # Bone assignment for each vertex (|num_verts| x 1)
            self.rest_bones_t = np.empty(
                (num_bones, 3))  # Translations for bones at rest pose

            for bone in range(num_bones):
                self.rest_bones_t[bone] = np.mean(
                    rest_pose[vert_assignments == bone], axis=0)
                rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone]

                for pose in range(num_poses):

                    bone_transforms[bone, pose] = self.kabsch(
                        rest_pose_corrected[bone, vert_assignments == bone],
                        poses[pose, vert_assignments == bone])

        for it in range(iterations):

            # Re-assign bones to vertices using smallest reconstruction error from all poses
            constructed = np.empty(
                (num_bones, num_poses, num_verts,
                 3))  # |num_bones| x |num_poses| x |num_verts| x 3
            for bone in range(num_bones):
                Rp = bone_transforms[bone, :, :3, :].dot(
                    (rest_pose - self.rest_bones_t[bone]).T).transpose(
                        (0, 2, 1))  # |num_poses| x |num_verts| x 3
                # R * p + T
                constructed[bone] = Rp + bone_transforms[bone, :, np.newaxis,
                                                         3, :]
            errs = np.linalg.norm(constructed - poses,
                                  axis=(1, 3))  # position value average

            vert_assignments = np.argmin(errs, axis=0)

            # For each bone, for each pose, compute new transform using kabsch
            for bone in range(num_bones):

                self.rest_bones_t[bone] = np.mean(
                    rest_pose[vert_assignments == bone], axis=0)

                rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone]

                for pose in range(num_poses):
                    P = rest_pose_corrected[bone, vert_assignments == bone]
                    Q = poses[pose, vert_assignments == bone]

                    if (P.size == 0 or Q.size == 0):
                        print 'Skip Iteration'
                    else:

                        bone_transforms[bone, pose] = self.kabsch(P, Q)

        # jointList is correct Index Joint

        return bone_transforms, self.rest_bones_t, bones
Example #37
engine = sqlalchemy.create_engine(
    'sqlite:///c:/RBSA/year1/RBSA_METER_DATA_1/RBSA_METER_DATA_1.sqlite')
dict = {'meter_min_cluster': [], 'meter_max_cluster': []}
for siteid in pd.read_sql_query("SELECT DISTINCT siteid FROM RBSA_METER_DATA",
                                engine).values:
    siteid = int(siteid[0])
    df = pd.read_sql_query(
        "SELECT siteid, time, IDT from RBSA_METER_DATA WHERE siteid='{}'".
        format(siteid), engine)
    df['siteid'] = df['siteid'].astype('int')
    df = df.set_index('siteid')
    df = df.dropna()
    df['month'] = df['time'].apply(lambda x: x[2:5])
    df = df.loc[df['month'].isin(['DEC', 'JAN', 'FEB'])]
    if pd.isnull(df['IDT']).all():
        continue
    codebook, _ = kmeans(np.array(df['IDT']), 2)  # two clusters
    dict['meter_min_cluster'].append(min(codebook))
    dict['meter_max_cluster'].append(max(codebook))

df_meter = pd.DataFrame.from_dict(dict)

# audit
engine = sqlalchemy.create_engine(
    'sqlite:///c:/OpenStudio-ResStock/OpenStudio-ResStock/data/rbsa/rbsa.sqlite'
)
df = pd.read_sql_query(
    "SELECT siteid, ResInt_HeatTemp, ResInt_HeatTempNight from SF_ri_heu",
    engine)
resint_heattemp = df['ResInt_HeatTemp']
resint_heattemp = resint_heattemp[resint_heattemp > 0].dropna()
resint_heattempnight = df['ResInt_HeatTempNight']
Example #38
def kmean_anchors(path='data/DsiacPlusF2.txt', n=9, img_size=(416, 416)):
    # from utils.utils import *; _ = kmean_anchors()
    # Produces a list of target kmeans suitable for use in *.cfg files
    from utils.datasets import LoadImagesAndLabels
    thr = 0.20  # IoU threshold

    def print_results(thr, wh, k):
        k = k[np.argsort(k.prod(1))]  # sort small to large
        iou = wh_iou(torch.Tensor(wh), torch.Tensor(k))
        max_iou, min_iou = iou.max(1)[0], iou.min(1)[0]
        bpr, aat = (max_iou > thr).float().mean(), (iou > thr).float().mean() * n  # best possible recall, anch > thr
        print('%.2f iou_thr: %.3f best possible recall, %.2f anchors > thr' % (thr, bpr, aat))
        print('kmeans anchors (n=%g, img_size=%s, IoU=%.3f/%.3f/%.3f-min/mean/best): ' %
              (n, img_size, min_iou.mean(), iou.mean(), max_iou.mean()), end='')
        for i, x in enumerate(k):
            print('%i,%i' % (round(x[0]), round(x[1])), end=',  ' if i < len(k) - 1 else '\n')  # use in *.cfg
        return k

    def fitness(thr, wh, k):  # mutation fitness
        iou = wh_iou(wh, torch.Tensor(k)).max(1)[0]  # max iou
        bpr = (iou > thr).float().mean()  # best possible recall
        return iou.mean() * bpr  # product

    # Get label wh
    wh = []
    dataset = LoadImagesAndLabels(path, augment=True, rect=True, cache_labels=True)
    nr = 1 if img_size[0] == img_size[1] else 10  # number augmentation repetitions
    for s, l in zip(dataset.shapes, dataset.labels):
        wh.append(l[:, 3:5] * (s / s.max()))  # image normalized to letterbox normalized wh
    wh = np.concatenate(wh, 0).repeat(nr, axis=0)  # augment 10x
    wh *= np.random.uniform(img_size[0], img_size[1], size=(wh.shape[0], 1))  # normalized to pixels (multi-scale)

    # Darknet yolov3.cfg anchors
    use_darknet = False
    if use_darknet:
        k = np.array([[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], [59, 119], [116, 90], [156, 198], [373, 326]])
    else:
        # Kmeans calculation
        from scipy.cluster.vq import kmeans
        print('Running kmeans for %g anchors on %g points...' % (n, len(wh)))
        s = wh.std(0)  # sigmas for whitening
        k, dist = kmeans(wh / s, n, iter=30)  # points, mean distance
        k *= s
    k = print_results(thr, wh, k)

    # # Plot
    # k, d = [None] * 20, [None] * 20
    # for i in tqdm(range(1, 21)):
    #     k[i-1], d[i-1] = kmeans(wh / s, i)  # points, mean distance
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7))
    # ax = ax.ravel()
    # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.')

    # Evolve
    npr = np.random
    wh = torch.Tensor(wh)
    f, sh, ng, mp, s = fitness(thr, wh, k), k.shape, 1000, 0.9, 0.1  # fitness, generations, mutation probability, sigma
    for _ in tqdm(range(ng), desc='Evolving anchors'):
        v = np.ones(sh)
        while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)  # 98.6, 61.6
        kg = (k.copy() * v).clip(min=2.0)
        fg = fitness(thr, wh, kg)
        if fg > f:
            f, k = fg, kg.copy()
            print_results(thr, wh, k)
    k = print_results(thr, wh, k)

    return k
		v_ids = [i for i in os.listdir(input_file_path_base + vehicle_type + '/' + fuel_type) if i[-4:] == '.csv']
		for v_num, v_id in enumerate(v_ids):
			print "Creating Self-Organized Map for " + vehicle_type + " with " + fuel_type + " consuption (ID " + str(v_num) + " of " + str(len(v_ids)) + ")\r",

			# Opens ID frame sampling analysis (NOT NORMALIZED)
			input_file_name = input_file_path_base + vehicle_type + '/' + fuel_type + '/' + v_id
			df = pd.read_csv(input_file_name)
			data = df.fillna(0).as_matrix().astype(float)

			# Starts K-Means analysis
			best_distortion = None
			best_code_book = None
			best_distance = None
			best_k = None
			for k_mean in k_means:
				centroids, distortion = kmeans(data, k_mean)									# Uses kmeans to clusterize data from actual map
				code_book, distance = vq(data, centroids)										# Gets codebook of ID's

				# Saves results if distortion is more than "elbow_rate" percent smaller than the best distortion so far
				if best_distortion == None or abs(distortion - best_distortion)/best_distortion > elbow_rate:
					best_distortion = distortion
					best_code_book = code_book
					best_distance = distance
					best_k = k_mean
				# If the distortion is not "elbow_rate" percent smaller than the best distortion so far, quit the analysis
				else:
					break

			df['CODE_BOOK'] = best_code_book													# Saves codebook on result on dataframe
			df['DISTANCE'] = best_distance														# Saves distances from centroids on dataframe
			f.write(vehicle_type + ' ' + fuel_type + ' ' + v_id[:-4] + ' k=' + str(best_k) + '\n')
# -*- coding: utf-8 -*-
"""
K-means clustering

@author: David André Rodríguez Méndez (AndreRdz7)
"""
# Import libraries
import numpy as np
from scipy.cluster.vq import vq, kmeans
# Create datasets
data = np.random.random(90).reshape(30, 3)
c1 = np.random.choice(range(len(data)))
c2 = np.random.choice(range(len(data)))
# Getting k
clust_centers = np.vstack([data[c1], data[c2]])
print(clust_centers)
print(vq(data, clust_centers))
# K-means
kmeans(data, clust_centers)
Example #41
File: test_vq.py  Project: yaqiongl/scipy
 def test_kmeans_large_thres(self):
     # Regression test for gh-1774
     x = np.array([1, 2, 3, 4, 10], dtype=float)
     res = kmeans(x, 1, thresh=1e16)
     assert_allclose(res[0], np.array([4.]))
     assert_allclose(res[1], 2.3999999999999999)
        cat_path = join(datasetpath, cat)
        cat_files = get_imgfiles(cat_path)
        cat_features = extractSift(cat_files)
        all_files = all_files + cat_files
        all_features.update(cat_features)
        cat_label[cat] = label
        for i in cat_files:
            all_files_labels[i] = label

    print "---------------------"
    print "## computing the visual words via k-means"
    all_features_array = dict2numpy(all_features)
    nfeatures = all_features_array.shape[0]
    nclusters = int(sqrt(nfeatures))
    codebook, distortion = vq.kmeans(all_features_array,
                                     nclusters,
                                     thresh=K_THRESH)

    with open(datasetpath + CODEBOOK_FILE, 'wb') as f:

        dump(codebook, f, protocol=HIGHEST_PROTOCOL)

    print "---------------------"
    print "## compute the visual words histograms for each image"
    all_word_histgrams = {}
    for imagefname in all_features:
        word_histgram = computeHistograms(codebook, all_features[imagefname])
        all_word_histgrams[imagefname] = word_histgram

    print "---------------------"
    print "## write the histograms to file to pass it to the svm"
Example #43
from numpy import vstack, array
from numpy.random import rand
#from scipy.cluster.vq import whiten
import scipy.cluster.vq as vec

# data generation with three features
data = vstack((rand(100, 3) + array([.5, .5, .5]), rand(100, 3)))

# whitening of data
data = vec.whiten(data)

# computing K-Means with K = 3
centroids, _ = vec.kmeans(data, 3)

# assign each sample to a cluster
clx, _ = vec.vq(data, centroids)

print(data)
print(centroids)
print(clx)
Example #44
def find_starting_set(run=True, display=False):
    """ For of the 100 clustered demands points,
    
    Determine whether a base in the set of n bases 
    can reach that demand point within r1. 
    
    As soon as a demand point cannot be reached by a base
    within r1, throw that set of bases out
    
    There is no way to brute force every combination of 8 ambulances
    But instead use the demand points, find the surrounding bases,
    and check whether that fulfills the set coverage. 
    
    Reorder the bases list, and then randomize it. """

    if not run: return
    global calls, bases, demands, times, converted_calls, calls_kmeans

    delta = 0  # See below.
    clust_call_to_base = []

    call_array = array(converted_calls)
    if not calls_kmeans:
        calls_kmeans = kmeans(call_array, top_n)

    # Get the first coordinate, then find the closest actual base.
    calls_clustered_list = calls_kmeans[0]

    if display: print("For each representative call point, find the bases. \n")

    for each_call in calls_clustered_list:
        if display: print("\n")

        # Search for points. If it empty, then redo it.
        delta = 0.01
        actual = []
        reorder_bases = copy.deepcopy(bases)
        reorder_bases.sort(key=itemgetter(0))

        while not actual:
            actual = search.surrounding_points(
                each_call,
                delta,
                [],  # Doesn't actually do anything
                reorder_bases)
            delta += 0.01

        clust_call_to_base.append(tuple([list(each_call), actual]))

        if display:
            plot([each_call], "red")

            print("-----------------------------------------------------")
            print(each_call, " with distance of %.2f km  " % (delta))
            print("-----------------------------------------------------")

            for each in actual:
                print(each)
                plot([each[0]], "green")

            plt.show()
    if display: print("<< EOF >>")
    return clust_call_to_base
Example #45
bpms = []
for line in fd:
    if comment.match(line):
        continue
    md = pat.search(line)
    if md:
        bpm = md.group(2)
        bpm = float(bpm)
        title = re.sub(pat, "", line)
        track = Track(title, bpm)
        tracks.append(track)
        bpms.append(bpm)

obs = np.array(bpms).T

cb, error = kmeans(obs, args.nClusters)

codes, dist = vq(obs, cb)

total_error = 0
for i, item in enumerate(tracks):
    bpm = cb[codes[i]]
    tracks[i].new_bpm = bpm
    error = dist[i]
    tracks[i].error = error
    total_error += error

tracks.sort(key=lambda x: x.orig_bpm)

curbpm = 0
for _, track in enumerate(tracks):
示例#46
from collections import defaultdict
from similar_words import load_vectors
from scipy.cluster.vq import kmeans, vq
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--basename',
                        help='base name of word vector files',
                        type=str)
    parser.add_argument('--maxwords',
                        help='maximum number of words to cluster',
                        type=int)
    parser.add_argument('--k', help='number of clusters', type=int)
    args = parser.parse_args()

    vectors, words = load_vectors(args.basename, args.maxwords)

    centroids, _ = kmeans(vectors, args.k)
    idx, _ = vq(vectors, centroids)

    clusters = defaultdict(set)
    for i, c in enumerate(idx):
        clusters[c].add(words[i])

    for c in range(args.k):
        print 'CLUSTER', c + 1,
        for word in clusters[c]:
            print word,
        print
        print
示例#47
def colorCluster(img):
    img = imread(img)
    pixel = reshape(img,(img.shape[0]*img.shape[1],3))
    centroids,_ = kmeans(pixel,3) # three dominant colors will be found
    return centroids
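# Supplementary usage sketch (hypothetical helper; assumes the same unshown
# imports as colorCluster above -- imread, reshape, kmeans -- plus vq from
# scipy.cluster.vq, and an RGB image): map every pixel to its nearest dominant
# color to produce a color-quantized copy of the image.
from scipy.cluster.vq import vq

def quantize_image(path, k=3):
    img = imread(path)
    pixels = reshape(img, (-1, 3)).astype(float)  # kmeans expects float data
    centroids, _ = kmeans(pixels, k)              # k dominant colors
    labels, _ = vq(pixels, centroids)             # nearest color per pixel
    return centroids[labels].reshape(img.shape).astype(img.dtype)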
示例#48
def kmean_anchors(path='./data/coco64.txt',
                  n=9,
                  img_size=(640, 640),
                  thr=0.20,
                  gen=1000):
    # Creates kmeans anchors for use in *.cfg files: from utils.utils import *; _ = kmean_anchors()
    # n: number of anchors
    # img_size: (min, max) image size used for multi-scale training (can be same values)
    # thr: IoU threshold hyperparameter used for training (0.0 - 1.0)
    # gen: generations to evolve anchors using genetic algorithm
    from utils.datasets import LoadImagesAndLabels

    def print_results(k):
        k = k[np.argsort(k.prod(1))]  # sort small to large
        iou = wh_iou(wh, torch.Tensor(k))
        max_iou = iou.max(1)[0]
        bpr, aat = (max_iou > thr).float().mean(), (
            iou > thr).float().mean() * n  # best possible recall, anch > thr
        print('%.2f iou_thr: %.3f best possible recall, %.2f anchors > thr' %
              (thr, bpr, aat))
        print(
            'n=%g, img_size=%s, IoU_all=%.3f/%.3f-mean/best, IoU>thr=%.3f-mean: '
            % (n, img_size, iou.mean(), max_iou.mean(), iou[iou > thr].mean()),
            end='')
        for i, x in enumerate(k):
            print('%i,%i' % (round(x[0]), round(x[1])),
                  end=',  ' if i < len(k) - 1 else '\n')  # use in *.cfg
        return k

    def fitness(k):  # mutation fitness
        iou = wh_iou(wh, torch.Tensor(k))  # iou
        max_iou = iou.max(1)[0]
        return (max_iou * (max_iou > thr).float()).mean()  # product

    # Get label wh
    wh = []
    dataset = LoadImagesAndLabels(path, augment=True, rect=True)
    nr = 1 if img_size[0] == img_size[
        1] else 10  # number of augmentation repetitions
    for s, l in zip(dataset.shapes, dataset.labels):
        wh.append(l[:, 3:5] *
                  (s / s.max()))  # image normalized to letterbox normalized wh
    wh = np.concatenate(wh, 0).repeat(nr, axis=0)  # augment 10x
    wh *= np.random.uniform(img_size[0], img_size[1],
                            size=(wh.shape[0],
                                  1))  # normalized to pixels (multi-scale)
    wh = wh[(wh > 2.0).all(1)]  # remove below threshold boxes (< 2 pixels wh)

    # Kmeans calculation
    from scipy.cluster.vq import kmeans
    print('Running kmeans for %g anchors on %g points...' % (n, len(wh)))
    s = wh.std(0)  # sigmas for whitening
    k, dist = kmeans(wh / s, n, iter=30)  # points, mean distance
    k *= s
    wh = torch.Tensor(wh)
    k = print_results(k)

    # # Plot
    # k, d = [None] * 20, [None] * 20
    # for i in tqdm(range(1, 21)):
    #     k[i-1], d[i-1] = kmeans(wh / s, i)  # points, mean distance
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7))
    # ax = ax.ravel()
    # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.')
    # fig, ax = plt.subplots(1, 2, figsize=(14, 7))  # plot wh
    # ax[0].hist(wh[wh[:, 0]<100, 0],400)
    # ax[1].hist(wh[wh[:, 1]<100, 1],400)
    # fig.tight_layout()
    # fig.savefig('wh.png', dpi=200)

    # Evolve
    npr = np.random
    f, sh, mp, s = fitness(
        k), k.shape, 0.9, 0.1  # fitness, shape, mutation prob, sigma
    for _ in tqdm(range(gen), desc='Evolving anchors'):
        v = np.ones(sh)
        while (v == 1
               ).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * npr.random() * npr.randn(*sh) * s +
                 1).clip(0.3, 3.0)
        kg = (k.copy() * v).clip(min=2.0)
        fg = fitness(kg)
        if fg > f:
            f, k = fg, kg.copy()
            print_results(k)
    k = print_results(k)

    return k
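# Supplementary note (standalone illustrative sketch): the `k, dist =
# kmeans(wh / s, n)` followed by `k *= s` above is the manual equivalent of
# scipy's whiten() round trip -- divide by the per-column std before
# clustering, then rescale the centroids back to pixel units. The box sizes
# below are made up for illustration.
import numpy as np
from scipy.cluster.vq import kmeans

demo_wh = np.random.rand(500, 2) * np.array([320.0, 240.0])  # fake w/h pairs
sigma = demo_wh.std(0)                       # per-column std ("s" above)
anchors, _ = kmeans(demo_wh / sigma, 9, iter=30)
anchors *= sigma                             # anchors back in pixel units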
示例#49
def analyze_color(input_image,
                  transparency_threshold=50,
                  plot_3d=False,
                  plot_bar=True,
                  n_cluster=None,
                  max_cluster=10,
                  ignore_pure_black=True,
                  use_sample=True,
                  return_colors=True):

    # Work on a copy so the caller's image array is not modified
    input_image = input_image.copy()

    # Check input shape
    assert (len(input_image.shape) == 3)
    assert (input_image.shape[-1] in {3, 4})

    # Turn color info of pixels into dataframe, filter by transparency if RGBA image is passed
    if input_image.shape[-1] == 4:
        color_df = pd.DataFrame(input_image.reshape(-1, 4),
                                columns=list('rgba'))
        # Get the rgb info of pixels in the non-transparent part of the image
        color_df = color_df[color_df['a'] >= transparency_threshold]
    if input_image.shape[-1] == 3:
        color_df = pd.DataFrame(input_image.reshape(-1, 3),
                                columns=list('rgb'))

    if ignore_pure_black:
        color_df = color_df[~((color_df['r'] == 0) & (color_df['g'] == 0) &
                              (color_df['b'] == 0))]

    # Handle large pixel color_df
    if not use_sample and len(color_df) > 1e5:
        sample_or_not = (input(
            'Large image detected, would you like to sample the pixels in this image? (Y/N) '
        )).lower()[0] == 'y'
        if sample_or_not:
            print(
                'Sampling 100,000 pixels from the image; note that you can also resize the image before passing it to this function.'
            )
            color_df = color_df.sample(n=int(1e5), random_state=0)
        else:
            print(
                'No sampling performed; note that rendering a 3D plot of all the pixels may crash your session and K-means clustering will be slow.'
            )

    # Get std for reverse-transform the kmeans results to a meaningful rgb palette
    r_std, g_std, b_std = color_df[list('rgb')].std()
    reverse_whiten_array = np.array((r_std, g_std, b_std))

    # Normalize observations on a per feature basis, forcing features to have unit variance
    # Doc: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.vq.whiten.html
    for color in list('rgb'):
        color_df['scaled_' + color] = whiten(color_df[color])

    ## 3D scatter plot showing color groups
    if plot_3d:
        trace = go.Scatter3d(
            x=color_df['r'],
            y=color_df['g'],
            z=color_df['b'],
            mode='markers',
            marker=dict(color=[
                'rgb({},{},{})'.format(r, g, b)
                for r, g, b in zip(color_df['r'].values, color_df['g'].values,
                                   color_df['b'].values)
            ],
                        size=1,
                        opacity=1))
        layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
        fig = go.Figure(data=[trace], layout=layout)
        fig.show()

    ## Use K-means to identify main colors
    cluster_centers_list = []
    avg_distortion_list = []

    if n_cluster != None:
        n_cluster_range = [n_cluster - 1]  # note minus 1 to get exactly n
    else:
        n_cluster_range = range(max_cluster + 1)

    if plot_bar:
        # Initialize plt graph
        f, ax = plt.subplots(len(n_cluster_range), 1, figsize=(10, 10))

    for n in n_cluster_range:

        ###### Train clusters ######

        cluster_centers, avg_distortion = kmeans(
            color_df[['scaled_r', 'scaled_g', 'scaled_b']], n + 1)

        ###### Assign labels ######

        labels, distortions = vq(
            color_df[['scaled_r', 'scaled_g', 'scaled_b']], cluster_centers)

        color_df['label'] = labels
        color_df['distortion'] = distortions

        ###### Build palette ######

        # These parameters affect visual style only and could be exposed to the user later
        height = 200
        width = 1000
        gap_size = 5
        palette = np.zeros((height, width, 3), np.uint8)

        # Count how many pixels fall into each cluster; this decides each color's relative width in the palette
        cluster_proportion = color_df['label'].value_counts().sort_index(
        ) / len(color_df)
        cluster_width_list = (cluster_proportion * width).to_list()
        cluster_width_list = [
            int(x) for x in saferound(cluster_width_list, places=0)
        ]

        # Reorder clusters and widths according to the proportion, largest to smallest
        reordered_cluster_df = pd.DataFrame(
            zip(cluster_centers, cluster_width_list),
            columns=['cluster', 'width']).sort_values('width', ascending=False)
        cluster_centers = reordered_cluster_df['cluster'].tolist()
        cluster_width_list = reordered_cluster_df['width'].tolist()

        # Storing information
        cluster_centers_list.append(cluster_centers)
        avg_distortion_list.append(avg_distortion)

        if plot_bar:
            # Coloring the palette canvas based on color and width
            endpoints = list(np.cumsum(cluster_width_list))
            startpoints = [0] + endpoints[:-1]
            for cluster_index in range(len(cluster_centers)):
                # Notice here we apply the reverse_whiten_array to get meaningful RGB colors
                palette[:, startpoints[cluster_index] + gap_size:
                        endpoints[cluster_index], :] = cluster_centers[
                            cluster_index] * reverse_whiten_array
                palette[:,
                        startpoints[cluster_index]:startpoints[cluster_index] +
                        gap_size, :] = (255, 255, 255)

            # Displaying the palette when performing K-means with parameter n
            if n_cluster != None:
                ax.imshow(palette)
                ax.axis('off')
            else:
                ax[n].imshow(palette)
                ax[n].axis('off')

    if plot_bar:
        ### Show the entire palette
        f.tight_layout()
        plt.show()
        ### Show the elbow plot for choosing best n_cluster parameter for K-means
        fig = plt.figure()
        plt.scatter(x=n_cluster_range, y=avg_distortion_list)
        fig.suptitle('Elbow Plot for K-means')
        plt.xlabel('Number of Clusters')
        plt.ylabel('Average Distortion')
        plt.show()

    if return_colors:
        if n_cluster != None:
            return (cluster_centers_list[0] * reverse_whiten_array).astype(
                np.uint8)
        else:
            return [(cluster_centers * reverse_whiten_array).astype(np.uint8)
                    for cluster_centers in cluster_centers_list]
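# Supplementary usage sketch (hypothetical file name; assumes Pillow is
# available plus the unshown imports analyze_color itself relies on, such as
# numpy as np, pandas as pd, whiten/kmeans/vq and saferound):
from PIL import Image

img = np.asarray(Image.open('photo.jpg').convert('RGB'))  # 8-bit RGB array
palette = analyze_color(img, n_cluster=5, plot_3d=False, plot_bar=False)
print(palette)  # five RGB rows (uint8), ordered from largest cluster down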
from termcolor import colored,cprint
import matplotlib.pyplot as plt

import numpy as np
from scipy.cluster import vq
# Creating data
c1 = np.random.randn(10, 2) + 5 
c2 = np.random.randn(3, 2) - 5 
c3 = np.random.randn(5, 2)

print(c1,colored('c2=','red'),c2,colored('c3=','blue'),c3)
# Pooling all the data into one 18 x 2 array
data = np.vstack([c1, c2, c3])
print(colored('data =','red'),'\n',data)
# Calculating the cluster centroids and variance 
# from kmeans
centroids, variance = vq.kmeans(data, 3)
# The identified array returned by vq() tells us which centroid
# each observation belongs to, which lets us separate the points
# into clusters.
identified, distance = vq.vq(data, centroids)
# Retrieving coordinates for the points in each cluster identified by vq
vqc1 = data[identified == 0]
vqc2 = data[identified == 1]
vqc3 = data[identified == 2]
print('$',vqc1,'#',vqc2,'@',vqc3)
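# Supplementary sketch: a quick visual check of the split -- plot each
# recovered group with its own marker and overlay the centroids
# (matplotlib is already imported above as plt).
plt.plot(vqc1[:, 0], vqc1[:, 1], 'o',
         vqc2[:, 0], vqc2[:, 1], 's',
         vqc3[:, 0], vqc3[:, 1], '^')
plt.plot(centroids[:, 0], centroids[:, 1], 'k*', markersize=12)
plt.show()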
示例#51
def cluster_followings_sentiments(username,
                                  stop_value=None,
                                  access_token=None):
    """Returns groups and their respective centroids for the followings of a users clustered according to the sentiment reflected in their bios"""
    try:
        from textblob import TextBlob
        from scipy.cluster import vq
        import numpy

        non_empty = lambda x: x if x != ' ' else ''
        filter_crap = lambda x: {
            'username':
            x['username'],
            'bio':
            map(
                non_empty,
                re.sub(r'[^\x00-\x7f]', r' ',
                       re.sub('[^A-Za-z0-9]+', ' ', x['bio'])).encode('utf-8').
                strip(' \t\n\r').split())
        }
        non_zero = lambda x: len(x['bio']) > 3
        joiner = lambda x: {
            'username': x['username'],
            'bio': ' '.join(x['bio'])
        }
        whiten = lambda obs: obs / numpy.std(obs)

        follows_data = numpy.array(
            map(
                joiner,
                filter(non_zero,
                       map(filter_crap, get_follows(username,
                                                    'user_and_bio')))))

        grouped = []

        t = [{
            'username': a['username'],
            'bio': a['bio'],
            'sentiment': [float(a) for a in TextBlob(a['bio']).sentiment]
        } for a in follows_data]

        centers, dist = vq.kmeans(
            numpy.array([[a['sentiment'][0], a['sentiment'][1]] for a in t]),
            whiten(
                numpy.array([[a['sentiment'][0], a['sentiment'][1]]
                             for a in t])), 100)
        code, distance = vq.vq(
            numpy.array([[a['sentiment'][0], a['sentiment'][1]] for a in t]),
            centers)

        for i in range(0, len(centers)):
            grouped.append({
                'centroid': {
                    'polarity': list(map(float, centers[i]))[0],
                    'subjectivity': list(map(float, centers[i]))[1]
                },
                'cluster':
                list(
                    numpy.array([{
                        'polarity': a['sentiment'][0],
                        'subjectivity': a['sentiment'][1],
                        'username': a['username']
                    } for a in t])[code == i])
            })

        centers = sorted([list([float(b) for b in a]) for a in centers])

        return grouped, centers
    except Exception, e:
        print str(e)
示例#52
import random

from scipy.cluster.vq import kmeans


def open_file(path):
    file = open(path)
    lines = []
    for line in file:
        line = line.strip()
        line = float(line)
        lines.append(line)
    file.close()
    return lines


k_lines = open_file('D:\\code\\UBC\\k_VOT.txt')
g_lines = open_file('D:\\code\\UBC\\g_VOT.txt')

data = []
data.extend(k_lines)
data.extend(g_lines)

clusters, _ = kmeans(data, 2)  # _ discards the distortion value
k_centroid = max(clusters)
g_centroid = min(clusters)

for j in range(11):
    value = random.uniform(10, 16)
    k_distance = abs(value - k_centroid)  #distance from centroid
    g_distance = abs(value - g_centroid)
    if k_distance < g_distance:
        print('This sound is probably a k')
    else:
        print('This sound is probably a g')
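# Supplementary sketch: the manual distance comparison above is equivalent to
# a 1-D vector quantization with vq(); numpy and vq are imported here, while
# random, kmeans and the clusters array come from the code above.
import numpy as np
from scipy.cluster.vq import vq

samples = np.array([random.uniform(10, 16) for _ in range(11)]).reshape(-1, 1)
codebook = np.sort(np.asarray(clusters, dtype=float)).reshape(-1, 1)
labels, _ = vq(samples, codebook)  # 0 -> smaller centroid (g), 1 -> larger (k)
for lab in labels:
    print('This sound is probably a ' + ('k' if lab == 1 else 'g'))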
示例#53
文件: test_vq.py 项目: yaqiongl/scipy
 def test_kmeans_simple(self):
     np.random.seed(54321)
     initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
     for tp in np.array, np.matrix:
         code1 = kmeans(tp(X), tp(initc), iter=1)[0]
         assert_array_almost_equal(code1, CODET2)
示例#54
# build ann index
#t = AnnoyIndex(dims)
for file_index, i in enumerate(infiles):
  file_vector = np.loadtxt(i)
  file_name = os.path.basename(i).split('.')[0]
  file_index_to_file_name[file_index] = file_name
  file_index_to_file_vector[file_index] = file_vector

  #whitened = whiten(file_vector)
  #t.add_item(file_index, file_vector)
  

#t.build(trees)
whitened = whiten(features)
codes = 3
result = kmeans(whitened, codes)

'''
# create a nearest neighbors json file for each input
if not os.path.exists('nearest_neighbors'):
  os.makedirs('nearest_neighbors')

for i in file_index_to_file_name.keys():
  master_file_name = file_index_to_file_name[i]
  master_vector = file_index_to_file_vector[i]

  named_nearest_neighbors = []
  nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors)
  for j in nearest_neighbors:
    neighbor_file_name = file_index_to_file_name[j]
    neighbor_file_vector = file_index_to_file_vector[j]
'''

    def cluster_segments(self):
        if self.n_segs >= 3:
            K = range(1, 4)
            N = len(self.hets['seg_id'])
            self.segs.reset_index(inplace=True, drop=False)
            tin_data = np.nanargmax(self.TiN_likelihood_matrix,
                                    axis=1).astype(float)
            km = [kmeans(tin_data, k, iter=1000) for k in K]
            centroids = [cent for (cent, var) in km]
            squared_distance_to_centroids = [
                np.power(np.subtract(tin_data[:, np.newaxis], cent), 2)
                for cent in centroids
            ]
            self.sum_squared_distance = [
                sum(np.min(d, axis=1)) / N
                for d in squared_distance_to_centroids
            ]
            cluster_assignment = [
                np.argmin(d, axis=1) for d in squared_distance_to_centroids
            ]
            het_tin_map = np.argmax(self.p_TiN, axis=1)
            self.cl_distance_points = np.zeros([3, 3])
            for k, clust in enumerate(cluster_assignment):
                for idx, row in self.segs.iterrows():
                    self.cl_distance_points[k, clust[idx]] += np.sum(
                        np.power(
                            het_tin_map[self.hets['seg_id'] == row['index']] -
                            centroids[k][clust[idx]], 2))

            self.cl_var = np.sqrt(
                np.true_divide(self.cl_distance_points,
                               len(self.hets['seg_id'])))
            p = [1, 2, 3]
            delta_bic = [0, 10, 20]
            self.bic = (np.multiply(
                N,
                np.log(
                    np.true_divide(np.sum(self.cl_distance_points, axis=1),
                                   N))) +
                        np.multiply(p, np.log(N))) + delta_bic
            if len(centroids[2]) > 2:
                dist_btwn_c3 = np.mean(
                    [abs(i - j) for i, j in combinations(centroids[2], 2)])
            else:
                dist_btwn_c3 = 0
            if len(centroids[1]) > 1:
                dist_btwn_c2 = np.abs(np.diff(centroids[1]))
            else:
                dist_btwn_c2 = 0
            if dist_btwn_c3 < np.nanmax(
                    self.cl_var[2, :]) and dist_btwn_c2 > np.nanmax(
                        self.cl_var[1, :]):
                solution_idx = np.nanargmin(self.bic[0:1])
                self.cluster_assignment = cluster_assignment[solution_idx]
                self.centroids = centroids[solution_idx]
            if dist_btwn_c3 < np.nanmax(
                    self.cl_var[2, :]) and dist_btwn_c2 < np.nanmax(
                        self.cl_var[1, :]):
                self.cluster_assignment = cluster_assignment[0]
                self.centroids = centroids[0]
            else:
                solution_idx = np.nanargmin(self.bic)
                self.cluster_assignment = cluster_assignment[solution_idx]
                self.centroids = centroids[solution_idx]

        else:
            self.cluster_assignment = 0
            self.centroids = [np.mean(self.segs['TiN_MAP'])]
示例#56
def find_markers(image, template=None):
    """Finds four corner markers.

    Use a combination of circle finding, corner detection and convolution to
    find the four markers in the image.

    Args:
        image (numpy.array): image array of uint8 values.
        template (numpy.array): template image of the markers.

    Returns:
        list: List of four (x, y) tuples
            in the order [top-left, bottom-left, top-right, bottom-right].
    """
    
    img_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    img_gray = cv2.medianBlur(img_gray, 5)
    
    B = (1/8)*np.ones((8,8), dtype=np.float32)
    B[:4,4:] = -1*B[:4,4:]
    B[4:,:4] = -1*B[4:,:4]
    
    angles = np.arange(-90, 91, 7.5, dtype=int)
    for i in angles:
        B_ = ndimage.rotate(B, i, reshape=True)
        
        C = cv2.filter2D(img_gray[5:,5:][:-5,:-5], ddepth = -1, kernel = B_)

        n=22
        j=6
        u = 0.8
        v = 4
        lo = int(0.5*(n-j))
        hi = int(0.5*(n+j))
        spot = (-1)*np.ones((n,n), dtype=np.float32)
        spot[lo:hi,lo:hi] = -1*spot[lo:hi,lo:hi]
        spot[-v:,:] = u*spot[-v:,:]
        spot[:,-v:] = u*spot[:,-v:]
        spot[:v,:] =  u*spot[:v,:]
        spot[:,:v] =  u*spot[:,:v]
        blobs = cv2.filter2D(C, ddepth = -1, kernel = spot)
        
        centers = np.array(np.argwhere(blobs==255), dtype = "float32") + 5
        if centers.shape[0]>15:
            break
    
    if centers.shape[0]>3:
        markers = np.array(kmeans(centers,4)[0], dtype = int)   
        
        rank_y = markers[:,1].argsort()
        rank_x1 = markers[rank_y[:2]][:,0].argsort()
        rank_x2 = markers[rank_y[2:]][:,0].argsort()
        
        p1 = markers[rank_y[:2]][rank_x1[0]]
        p2 = markers[rank_y[:2]][rank_x1[1]]
        p3 = markers[rank_y[2:]][rank_x2[0]]
        p4 = markers[rank_y[2:]][rank_x2[1]]
        
        final_markers = [(p1[1], p1[0]), (p2[1], p2[0]), (p3[1], p3[0]), (p4[1], p4[0])]
    else:
        final_markers = [(0,0), (2,0), (0,2), (2,2)]
    
    return final_markers
def search(query, n=40, start=0):
    # retrieve top n results of query
    # default is 40 results per page

    dict_res = BossImageIndex().CallBoss(query, n, start)
    im_res = dict_res['ysearchresponse']['resultset_images']
    res = []
    for i in xrange(n):
        res.append((im_res[i]['thumbnail_url'], i))

    #path_name = "/Library/WebServer/results/"+query
    path_name = "/Users/novi/my_image_search/results/" + query

    # create the folder (if does not exist) to save query results
    if os.path.isdir(path_name):
        shutil.rmtree(path_name)
        os.mkdir(path_name)
    else:
        os.mkdir(path_name)

    # download the image results
    image = urllib.URLopener()
    silentcounter = 1
    imagefile = []
    for counter in xrange(n):
        urltoberetrieved = res[counter][0]
        #print urltoberetrieved
        filename = '%s/%s.%s' % (path_name, silentcounter, 'jpg')
        #try:
        image.retrieve(urltoberetrieved, filename)
        imagefile.append(filename)
        silentcounter = silentcounter + 1
        #except IOError:
        #    print 'error at %s \n' % (urltoberetrieved)
        #    pass

    # prepare the color image feature
    pref = numpy.array([[0, 0]])  # [image #,position #]
    ldesc = []
    codes = 30  # number of k-means clusters
    ino = 5
    jno = 8  # default grid: 5 by 8 2D grid
    show = ino * jno
    lim = show

    silentcounter = 1
    for i_img in xrange(lim):
        fname = imagefile[i_img]
        try:
            im = cv.LoadImage(fname,
                              0)  # loading with OpenCV (gray channel only)
            silentcounter = silentcounter + 1
        except:
            print 'image thumbnail can not be retrieved'
            sys.exit(0)

        #resizing the image
        #om = cv.CreateImage((psize,psize),im.depth,im.nChannels)
        #cv.Resize(im,om,cv.CV_INTER_CUBIC)
        storage = cv.CreateMemStorage(0)
        #generating the mask
        #mat = cv.CreateMat(psize,psize,cv.CV_8UC1)
        #extracting SURF feature
        #[keypoints,descriptors] = cv.ExtractSURF(om,mat,storage,(1,500,3,4))
        [keypoints, descriptors] = cv.ExtractSURF(im, im, storage,
                                                  (1, 500, 3, 4))
        ldesc.append(descriptors)

    #perform vector quantization
    tarrdesc = [numpy.array(ldesc[i]) for i in range(show)]
    lendesc = [ldesc[i].__len__() for i in range(show)]
    arrdesc = numpy.concatenate([tarrdesc[i] for i in range(show)])
    arrdesc = whiten(arrdesc)
    [codebook, distortion] = kmeans(arrdesc, codes)
    [code, dist] = vq(arrdesc, codebook)

    #generate the semantic feature
    imgdata = numpy.zeros((show, codebook.shape[0]), dtype=float)
    code_offset = 0
    for i_img in xrange(show):
        code_index = range(code_offset, code_offset + lendesc[i_img])
        for i_code in code_index:
            imgdata[i_img, code[i_code]] = imgdata[i_img, code[i_code]] + 1
        code_offset = code_offset + lendesc[i_img]

    #normalize the semantic feature
    sumimgdata = numpy.sum(imgdata, axis=1)
    sumimgdata.shape = show, 1
    imgdata = imgdata / sumimgdata

    griddata = numpy.zeros((2, ino * jno))
    griddata[0, ] = numpy.kron(range(1, ino + 1), numpy.ones((1, jno)))
    griddata[1, ] = numpy.tile(range(1, jno + 1), (1, ino))

    # do kernelized sorting procedure
    PI = KS(imgdata, griddata.T, pref)
    i_sorting = PI.argmax(axis=1)

    #creating the passed dictionary
    sorted_dict_res = {}
    sorted_dict_res['count'] = dict_res['ysearchresponse']['count']
    sorted_dict_res['totalhits'] = dict_res['ysearchresponse']['totalhits']
    sorted_dict_res['start'] = dict_res['ysearchresponse']['start']
    sorted_dict_res['resultset_images'] = [
        dict_res['ysearchresponse']['resultset_images'][i] for i in i_sorting
    ]
    return sorted_dict_res
示例#58
def limb_track():
    global frame_n

    cv.namedWindow("Dots")
    fps = 30
    frame_dt = 0  #1.0 / fps
    mv_i = 0
    pause = False

    while True:
        print("Frame:", mv_i)
        if frame_n >= contour_data.shape[0]:
            #mv_i = 0
            print("Frames completed:", frame_n)
            f_write.save(write_dict)
            break

        t = time.clock()
        ret, im = cap.read()

        for x, y in fs:
            cv.circle(im, (x, y), 2, (255, 0, 0), -1)

        n = n_contours[mv_i]

        if (n > 0):

            c_points = contour_data[mv_i, :n]

            limb_distances = np.empty((num_limbs, n))
            for i in range(num_limbs):
                limb_x, limb_y = fs[i]
                for j in range(n):
                    x, y = c_points[j]
                    dx = limb_x - x
                    dy = limb_y - y
                    distance = dx * dx + dy * dy
                    limb_distances[i, j] = distance

                limb_distances[i] = np.sort(limb_distances[i])

            threshold = 1500
            needed_limbs = np.where(limb_distances[:, 0] < threshold)[0]

            whitened = whiten(c_points)
            x_scale = c_points[0, 0] / whitened[0, 0]
            y_scale = c_points[0, 1] / whitened[0, 1]

            if (needed_limbs.shape[0] > 0):
                max_k = 6
                costs = np.empty(max_k - needed_limbs.shape[0])
                all_kmean_points = []
                for k in range(needed_limbs.shape[0], max_k):
                    points, distortion = kmeans(whitened, k)
                    points[:, 0] *= x_scale
                    points[:, 1] *= y_scale
                    points = points.astype('int32')
                    all_kmean_points.append(points)
                    costs[k - needed_limbs.shape[0]] = cost(
                        points, needed_limbs)

                best_ind = np.argmin(costs)
                best_points = all_kmean_points[best_ind]

                for i, (x, y) in enumerate(best_points):
                    cv.circle(im, (x, y), 2, (0, 0, 255), -1)

                distances = np.empty(
                    (needed_limbs.shape[0], best_points.shape[0]))
                indices = np.empty(
                    (needed_limbs.shape[0], best_points.shape[0], 2),
                    dtype='uint8')
                for i in range(needed_limbs.shape[0]):
                    limb_x, limb_y = fs[needed_limbs[i]]
                    for j in range(best_points.shape[0]):
                        x, y = best_points[j]
                        dx = x - limb_x
                        dy = y - limb_y
                        distance = dx * dx + dy * dy
                        distances[i, j] = distance
                        indices[i, j, 0] = needed_limbs[i]
                        indices[i, j, 1] = j

                for i in range(needed_limbs.shape[0]):
                    i, j = np.unravel_index(np.nanargmin(distances),
                                            distances.shape)
                    limb_ind = indices[i, j, 0]
                    point_ind = indices[i, j, 1]
                    new_limb_pos = (best_points[point_ind,
                                                0], best_points[point_ind, 1])
                    cv.line(im, fs[limb_ind], new_limb_pos, (255, 255, 255), 1)
                    fs[limb_ind] = new_limb_pos
                    distances[i] = np.NaN
                    distances[:, j] = np.NaN

        for i in range(num_limbs):
            name = names[i]
            x, y = fs[i]
            write_dict[name][mv_i, 0] = x
            write_dict[name][mv_i, 1] = y

        cv.putText(im, str(frame_n), (5, 25), cv.FONT_HERSHEY_SIMPLEX, 1.0,
                   (255, 255, 255))
        cv.imshow("Dots", im)

        if pause:
            k = cv.waitKey(0)
        else:
            dt = frame_dt - (time.clock() - t)
            dt_mili = int(dt * 1000)

            if (dt_mili < 1):
                dt_mili = 1

            k = cv.waitKey(dt_mili)
            mv_i += 1
            frame_n += 1

        if k == 27:  # esc key
            print("Frames completed:", frame_n)
            f_write.save(write_dict)
            break
        elif k == 32:  # space key
            pause = not (pause)
        elif k == 63235 and pause:  # right arrow
            mv_i += 1
            frame_n += 1
            print(stds[frame_n])
        elif k == 63234 and pause:  # left arrow
            mv_i -= 1
            frame_n -= 1
            print(stds[frame_n])
示例#59
  [Truncated example output: numeric cluster vectors, each followed by the
   operator names it groups, e.g. "Twitch, ash, -therm, -thatch, bandit,
   blackbeard" and "-twitch, ash, thermite, valk, capitao, -vigil, -nomad".]
  """
  

np.random.seed((1000,2000))  # fixed random seed for reproducibility
K = range(1,20) # k's for k means
KM = [kmeans(dataset,k) for k in K]
centroids = [cent for (cent,var) in KM]
D_k = [cdist(dataset, cent, 'euclidean') for cent in centroids]
cIdx = [np.argmin(D,axis=1) for D in D_k]
dist = [np.min(D,axis=1) for D in D_k]

tot_withinss = [sum(d**2) for d in dist]  # Total within-cluster sum of squares
totss = sum(pdist(dataset)**2)/dataset.shape[0]       # The total sum of squares
betweenss = totss - tot_withinss          # The between-cluster sum of squares

##### plots #####
kIdx = 7        # K=8
mrk = 'os^p<dvh8>+x.'

# elbow curve
plt.plot(K, betweenss/totss*100, 'b*-')
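# Supplementary sketch (a minimal continuation, not from the original
# snippet): mark the chosen K on the elbow curve and label the axes before
# showing it; kIdx, K, betweenss and totss come from the code above.
explained = betweenss / totss * 100
plt.plot(K[kIdx], explained[kIdx], marker='o', markersize=12,
         markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.xlabel('Number of clusters')
plt.ylabel('Percentage of variance explained (%)')
plt.title('Elbow curve for k-means clustering')
plt.show()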
示例#60
import scipy
import scipy.cluster.hierarchy as sch
from scipy.cluster.vq import whiten, kmeans, vq
import matplotlib.pylab as plt

# Generate the data points to cluster: 20 points, each with 4 features
points = scipy.randn(20, 4)

# 1. Hierarchical clustering
# Build the pairwise distance matrix, here using Euclidean distance:
disMat = sch.distance.pdist(points, 'euclidean')
# Perform hierarchical clustering:
Z = sch.linkage(disMat, method='average')
# Render the result as a dendrogram and save it as plot_dendrogram.png
P = sch.dendrogram(Z)
plt.savefig('plot_dendrogram.png')
# Derive flat cluster labels from the linkage matrix Z:
cluster = sch.fcluster(Z, t=1, criterion='inconsistent')

print "Original cluster by hierarchy clustering:\n", cluster

# 2. K-means clustering
# Normalize (whiten) the raw data
data = whiten(points)

# Cluster with kmeans: the first argument is the data, the second is the number of clusters k.
# If the final number of clusters is unknown, one option is to take it from the hierarchical
# clustering result above; a fixed value works just as well.
# kmeans returns a pair (centroids, distortion); we keep only the centroids, hence the [0].
centroid = kmeans(data, max(cluster))[0]

# Assign every observation to its nearest centroid with vq; vq also returns a pair, and [0] is the label array.
label = vq(data, centroid)[0]

print "Final clustering by k-means:\n", label