def LexicalFeatures():
    """Compute feature vectors for word and punctuation features."""
    num_chapters = len(chapters)
    fvs_lexical = np.zeros((len(chapters), 3), np.float64)
    fvs_punct = np.zeros((len(chapters), 3), np.float64)
    for e, ch_text in enumerate(chapters):
        # note: the nltk.word_tokenize includes punctuation
        tokens = nltk.word_tokenize(ch_text.lower())
        words = word_tokenizer.tokenize(ch_text.lower())
        sentences = sentence_tokenizer.tokenize(ch_text)
        vocab = set(words)
        words_per_sentence = np.array([len(word_tokenizer.tokenize(s))
                                       for s in sentences])

        # average number of words per sentence
        fvs_lexical[e, 0] = words_per_sentence.mean()
        # sentence length variation
        fvs_lexical[e, 1] = words_per_sentence.std()
        # Lexical diversity
        fvs_lexical[e, 2] = len(vocab) / float(len(words))

        # Commas per sentence
        fvs_punct[e, 0] = tokens.count(',') / float(len(sentences))
        # Semicolons per sentence
        fvs_punct[e, 1] = tokens.count(';') / float(len(sentences))
        # Colons per sentence
        fvs_punct[e, 2] = tokens.count(':') / float(len(sentences))

    # apply whitening to decorrelate the features
    fvs_lexical = whiten(fvs_lexical)
    fvs_punct = whiten(fvs_punct)

    return fvs_lexical, fvs_punct
def test1(self):
    print "TEST 1:----------------------------------------------------------------"
    features = np.array([[1.9, 2.3], [1.5, 2.5], [0.8, 0.6],
                         [0.4, 1.8], [0.1, 0.1], [0.2, 1.8],
                         [2.0, 0.5], [0.3, 1.5], [1.0, 1.0]])
    whitened = whiten(features)
    book = np.array((whitened[0], whitened[2]))
    numpy_result = kmeans(whitened, book)[0]
    print numpy_result
    print ""

    features2 = np.array([[1.9, 2.3, 0], [1.5, 2.5, 0], [0.8, 0.6, 0],
                          [0.4, 1.8, 0], [0.1, 0.1, 0], [0.2, 1.8, 0],
                          [2.0, 0.5, 0], [0.3, 1.5, 0], [1.0, 1.0, 0]])
    whitened2 = whiten(features2)
    # take the initial centroids from the whitened 3-column data so the
    # dimensions match the observations passed to k_means2
    book2 = [whitened2[0], whitened2[2]]
    our_result = np.array(KMeans.k_means2(whitened2.tolist(), 2, book2).centroids)[:, :-1]
    print our_result
def clust_scatter(samples, clusters, allocation_table, n):
    c = len(allocation_table[0])  # Columns
    r = len(allocation_table)     # Rows

    time_scat_square = 0
    mat_scatter = 0

    for j in range(0, c):  # clusters
        for t in range(0, 10):  # maturities
            for p in range(0, r):  # samples within a cluster
                index = allocation_table[p, j]
                if index != 0:
                    time_scat_square += samples.samples[index - 1].scatter_maturity[t].scatter
            mat_scatter += time_scat_square ** 2
            time_scat_square = 0
        clusters.clusters[j].scatter = np.sqrt(mat_scatter - 10 * clusters.clusters[j].mean ** 2)
        mat_scatter = 0

        if n == 0 or n == 4999:
            print('clust scatter : ' + str(clusters.clusters[j].scatter))

    # Normalize clusters' scatter
    vec = np.zeros(4)
    for j in range(0, c):
        vec[j] = clusters.clusters[j].scatter
    vec = whiten(vec)  # whiten returns the scaled array; it does not modify in place
    for j in range(0, c):
        clusters.clusters[j].scatter = vec[j]

    return clusters
def sent_integrate(sim_matrix, n_class):
    # make the variance uniform across dimensions (whiten returns a new array)
    sim_matrix = whiten(sim_matrix)

    centroid, distortion = kmeans(sim_matrix, n_class, iter=100, thresh=1e-05)
    labels, dist = vq(sim_matrix, centroid)
    return labels
def parse(data_file_name, predict_index, ignore_indices, **options):
    data_file = open(data_file_name, 'r')
    lines = data_file.read().splitlines()
    x = []
    y = []
    for i, line in enumerate(lines):
        if i == 0 or i == 1:
            continue
        datas = line.split()
        x_category = []
        for i, data in enumerate(datas):
            if i in ignore_indices:
                continue
            if i == predict_index:
                if data == 'T':
                    y.append(1.0)
                elif data == 'F':
                    y.append(0.0)
                else:
                    y.append(float(data))
                continue
            x_category.append(float(data))
        x.append(x_category)

    x = whiten(np.array(x)) if options.get('whiten_x') else np.array(x)
    y = whiten(np.array(y)) if options.get('whiten_y') else np.array(y)
    x = x - x.mean() if options.get('mean_center_x') else x
    y = y - y.mean() if options.get('mean_center_y') else y
    return (x, y)
def compute_bic(self, D, means, labels, K, R):
    """Computes the Bayesian Information Criterion."""
    D = vq.whiten(D)
    Rn = D.shape[0]
    M = D.shape[1]

    if R == K:
        return 1

    # Maximum likelihood estimate (MLE)
    mle_var = 0
    for k in xrange(len(means)):
        X = D[np.argwhere(labels == k)]
        X = X.reshape((X.shape[0], X.shape[-1]))
        for x in X:
            mle_var += distance.euclidean(x, means[k])
            #print x, means[k], mle_var
    mle_var /= (float(R - K))

    # Log-likelihood of the data
    l_D = - Rn/2. * np.log(2*np.pi) - (Rn * M)/2. * np.log(mle_var) - \
        (Rn - K) / 2. + Rn * np.log(Rn) - Rn * np.log(R)

    # Params of BIC
    p = (K-1) + M * K + mle_var

    #print "BIC:", l_D, p, R, K

    # Return the bic
    return l_D - p/2. * np.log(R)
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.threshold is not None and options.threshold < 0.0:
        raise ValueError('threshold must be >= 0')
    if options.tolerance is not None and options.tolerance < 0.0:
        raise ValueError('tolerance must be >= 0')
    if options.approximate and not options.threshold:
        raise ValueError('approximate only makes sense with a threshold')
    if options.approximate and options.metric != 'cosine':
        raise NotImplementedError('approximate only supported for cosine')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        # whitening should be implemented in wvlib to support together with
        # approximate similarity
        if options.approximate:
            raise NotImplementedError
        logging.info('normalize features to unit variance')
        vectors = whiten(vectors)

    return words, vectors, wv, options
def createdatabase():
    X_train = detectcompute(train1)

    print "Clustering the data with K-means"
    codebook, distortion = kmeans(whiten(X_train), k)
    print "Done.\n"

    imtrain = singledetect(test1)
    Pdatabase = bow(imtrain, codebook, k)  # Pseudo database with list structure

    # Writing to html table
    print "Converting the database into a HTML file"
    htmltable = open("table.htm", "r+")
    begin = "<html><body><table cellpadding=5><tr><th>Filename</th><th>Histogram</th></tr>"
    htmltable.write(begin)

    for i in range(len(Pdatabase)):
        middle = "<tr><td>%(filename)s</td><td>%(histogram)s</td></tr>" % {
            "filename": Pdatabase[i][0], "histogram": Pdatabase[i][-1]}
        htmltable.write(middle)

    end = "</table></body></html>"
    htmltable.write(end)
    htmltable.close()
    print "Done.\n"

    codebook_to_file(codebook)
def bow(images, codebook, clusters):
    out = images
    temp = []

    print "-" * 60
    print "Creating the pseudo database."

    for im in images:
        c = Counter()
        bag, dist = vq(whiten(im[1]), codebook)
        for word in bag:
            c[word] += 1

        # Creating histograms
        for i in range(clusters):
            if i in c.iterkeys():
                c[i] = c[i] / sum(c.values())
            if i not in c.iterkeys():
                c[i] = 0

        temp.append(c)

    for i in range(len(temp)):
        out[i].append(temp[i])

    print "Done.\n"
    return out
def generateCodebook(self, features): """ Generate codebook using extracted features """ codebook = None if self._codebookGenerateMethod == 'k-means': # # Codebook generation using scipy k-means # while run: # try: # # Set missing = 'raise' to raise exception # # when one of the clusters is empty # whitenedFeatures = whiten(features) # codebook, _ = kmeans2(whitenedFeatures, # self._codebookSize, # missing = 'raise') # # # No empty clusters # run = False # except ClusterError: # # If one of the clusters is empty, re-run k-means # run = True # Codebook generation using sklearn k-means whitenedFeatures = whiten(features) kmeans = MiniBatchKMeans(n_clusters = config.codebookSize) kmeans.fit(whitenedFeatures) codebook = kmeans.cluster_centers_ else: pass self._codebook = codebook
def kmeans(d, headers, K, metric, whiten=True, categories=None):
    '''Takes in a Data object, a set of headers, and the number of clusters to create.
    Computes and returns the codebook, codes and representation errors.
    If given an Nx1 matrix of categories, it uses the category labels
    to calculate the initial cluster means.
    '''
    # assign to A the result of getting the data given the headers
    try:
        A = d.get_data(headers)
    except AttributeError:
        A = d

    if whiten:
        W = vq.whiten(A)
    else:
        W = A

    codebook = kmeans_init(W, K, categories)

    # assign to codebook, codes, errors, the result of calling kmeans_algorithm with W and codebook
    codebook, codes, errors = kmeans_algorithm(W, codebook, metric)

    # return the codebook, codes, and representation error
    return codebook, codes, errors
def custom():
    _items = {}
    users = []
    for line in open('my_items_likehood.txt'):
        user, item = keys(line)
        users.append(user)
        if item in _items:
            _items[item].append(user)
        else:
            _items[item] = [user]

    sorted_users = sorted(users)
    l = len(sorted_users)

    items = {}
    count = 0
    features = []
    for item in _items:
        features.append(user_matrix(l, _items[item], sorted_users))
        if count == 100:
            break
        count += 1

    print 'whiten'
    whitened = whiten(array(features))
    print 'kmeans'
    print kmeans(whitened)

    print "%d items voted by %d users" % (len(items), len(users))
def normalize(data, mode="pca", n=10):
    """Normalize and reduce data by PCA."""
    if mode == "whiten":
        res = whiten(data)
    elif mode == "pca":
        v, P, res = pca_train(data, n, 0, 1)
        print v
        print "eigen ratio is ", v[n-1] / v[0]
    elif mode == "pca_whiten":
        v, P, proj = pca_train(data, n, 0, 1)
        res = whiten(proj)
    else:
        res = np.array(data)
    return res
def test_kmeans():
    obs = sp.random.uniform(0, 10, (1000, 2))
    # knum = 7
    obs = scvq.whiten(obs)

    # run kmeans with different numbers of clusters
    for knum in range(2, 8):
        codebook, dist = scvq.kmeans(obs, knum)
        ind, dist = scvq.vq(obs, codebook)

        # visualize
        # plt.ion()
        plt.ioff()
        plt.figure(knum)
        colors = ["b*", "g+", "ro", "yp", "ms", "ch", "wx"]

        for icluster in range(knum):
            x = (ind == icluster).nonzero()[0]
            plt.plot(obs[x, 0], obs[x, 1], colors[icluster])
            for iline in range(sp.size(x)):
                plt.plot([obs[x[iline], 0], codebook[icluster, 0]],
                         [obs[x[iline], 1], codebook[icluster, 1]], "k--")

        # the cluster centroids
        plt.plot(codebook[:, 0], codebook[:, 1], "ko")

        # the plot size
        plt.xlim((-0.3, 3.8))
        plt.ylim((-0.3, 3.8))
        plt.show()
def kmeans(dataset, n_cluster=625):
    from scipy.cluster.vq import kmeans2, whiten
    feature_matrix = numpy.asarray(dataset)
    whitened = whiten(feature_matrix)
    # use the n_cluster argument instead of a hard-coded 625
    _, cluster_labels = kmeans2(whitened, n_cluster, iter=100)
    return cluster_labels
def _get_jump(feat_array, max_cluster): if max_cluster < 2: max_cluster = self._determine_max_k(feat_array) whitened = whiten(feat_array) # first obtain the covariance matrix of the feature array gamma = np.cov(whitened.T) num_dim = whitened.shape[1] jump = {} distortions_dict = {0: 1} power_fact = -num_dim / 2.0 # Run k mean for all possible number of clusters for k in xrange(1, max_cluster + 1): codebook, _ = kmeans(whitened, k, iter=self.iter) code, _ = vq(whitened, codebook) clusters_dict = self._segment_to_clusters(whitened, code) mahalanobis_dist_list = [] for cid, cvals in clusters_dict.iteritems(): centroid = codebook[cid] cluster_mahalanobis_dist = map( lambda x: self._sq_mahalanobis(x, centroid, gamma), clusters_dict[cid].values) mahalanobis_dist_list.extend(cluster_mahalanobis_dist) this_distortion = np.mean(mahalanobis_dist_list) / num_dim distortions_dict[k] = this_distortion ** power_fact for k in xrange(1, max_cluster + 1): jump[k] = distortions_dict[k] - distortions_dict[k - 1] return jump
def clustering_scipy_kmeans(features, n_clust = 8): """ """ whitened = whiten(features) print whitened.shape initial = [kmeans(whitened,i) for i in np.arange(1,12)] plt.plot([var for (cent,var) in initial]) plt.show() #cent, var = initial[3] ##use vq() to get as assignment for each obs. #assignment,cdist = vq(whitened,cent) #plt.scatter(whitened[:,0], whitened[:,1], c=assignment) #plt.show() codebook, distortion = kmeans(whitened, n_clust) print codebook, distortion assigned_label, dist = vq(whitened, codebook) for ii in range(8): plt.subplot(4,2,ii+1) plt.plot(codebook[ii]) plt.show() centroid, label = kmeans2(whitened, n_clust, minit = 'points') print centroid, label for ii in range(8): plt.subplot(4,2,ii) plt.plot(centroid[ii]) plt.show()
def perform(self):
    print "Start KMeans"
    data = whiten(self.seeds)  # normalizing the data
    self.centro, self.sens = kmeans(data, self.k)
    self.matrix, _ = vq(data, self.centro)
    self.resp = self.centro[self.matrix]
    print "Sensibilidade: " + str(self.sens)
def _get_cluster(self, feat_array, k):
    # Normalise the feature array
    whitened = whiten(feat_array)
    codebook, _ = kmeans(whitened, k, iter=self.iter)
    code, _ = vq(whitened, codebook)
    return code
def _get_k_means_centroids(self):
    whitened_set = whiten(self.training_set)
    centroids, _ = kmeans(obs=whitened_set,
                          k_or_guess=self.class_number,
                          iter=self.max_iteration_number,
                          thresh=self.training_error)
    return centroids
def sparse_run(g, pos1): g2 = sparse_graph(g) # pos1 = nx.spring_layout(g) pos2 = nx.spring_layout(g2) features = [] for u in g2.nodes_iter(): # print type(u) # print u # print pos[u] features.append(pos2[u]) print "featurs:", len(features) features = ny.array(features) method = 2 if method == 1: whitened = whiten(features) book = ny.array((whitened[0],whitened[2])) km = kmeans(whitened, book) print km elif method == 2: n_digits = 4 km = KMeans(init='k-means++', n_clusters=n_digits, n_init=10) res = km.fit(features) print len(km.labels_), km.labels_ print res return km.labels_, g2
def kmeans1():
    features = array([[1.9, 2.3],
                      [1.5, 2.5],
                      [0.8, 0.6],
                      [0.4, 1.8],
                      [0.1, 0.1],
                      [0.2, 1.8],
                      [2.0, 0.5],
                      [0.3, 1.5],
                      [1.0, 1.0]])
    whitened = whiten(features)
    book = array((whitened[0], whitened[2]))
    kmeans(whitened, book)
    # expected result:
    # (array([[ 2.3110306 ,  2.86287398],
    #         [ 0.93218041,  1.24398691]]), 0.85684700941625547)
def kmeans2():
    features = locations()
    whitened = whiten(features)
    book = array((whitened[0], whitened[2]))
    kmeans(whitened, book)
    # expected result:
    # (array([[ 2.3110306 ,  2.86287398],
    #         [ 0.93218041,  1.24398691]]), 0.85684700941625547)
def k_means_cluster(data_list): if max(data_list[0])-min(data_list[0])>10 and max(data_list[1])-min(data_list[1])>10: array_diagnal=array([[data_list[0][x],data_list[1][x]] for x in range(len(data_list[0]))]) ks = range(1,min([5,len(data_list[0])+1])) KMeans = [cluster.KMeans(n_clusters = i, init="k-means++").fit(array_diagnal) for i in ks] KMeans_predict=[cluster.KMeans(n_clusters = i, init="k-means++").fit_predict(array_diagnal) for i in ks] BIC=[] BIC_rec=[] for x in ks: if KMeans_predict[x-1].max()<x-1: continue else: BIC_i=compute_bic(KMeans[x-1],array_diagnal) if abs(BIC_i)<10**8: BIC.append(BIC_i) BIC_rec.append(x) #BIC = [compute_bic(kmeansi,array_diagnal) for kmeansi in KMeans] #ks_picked=ks[BIC.index(max(BIC))] ks_picked=BIC_rec[BIC.index(max(BIC))] if ks_picked==1: return [data_list] else: out=[] std_rec=[scipy.std(data_list[0]),scipy.std(data_list[1])] whitened = whiten(array_diagnal) centroids, distortion=kmeans(whitened,ks_picked) idx,_= vq(whitened,centroids) for x in range(ks_picked): group1=[[int(i) for i in array_diagnal[idx==x,0]],[int(i) for i in array_diagnal[idx==x,1]]] out.append(group1) return out else: return [data_list]
def kmeansCluster(self, layer, distance, number): import scipy import scipy.cluster.hierarchy as sch from scipy.cluster.vq import vq,kmeans,whiten import numpy as np count = layer.featureCount() self.setProgressRange(count) points = [] for f in layer.getFeatures(): geom = f.geometry() x = geom.asPoint().x() y = geom.asPoint().y() point = [] point.append(x) point.append(y) points.append(point) self.updateProgress() distances = {0:'euclidean', 1:'cityblock', 2:'hamming'} disMat = sch.distance.pdist(points, distances.get(distance))#'euclidean''cityblock''hamming''cosine' Z=sch.linkage(disMat,method='average') P=sch.dendrogram(Z) cluster= sch.fcluster(Z, t=1, criterion='inconsistent') data=whiten(points) centroid=kmeans(data, number)[0] label=vq(data, centroid)[0] return centroid, label
def new_labelled_page(no_of_samples:int, window_size:int, page_scale:int or tuple, labelled_centroids:[tuple], page_paths:[str]): ### Duplication from above weighter = gaussian_weighter(window_size) windower = f.partial(win_centred_on, window=window_size) shifter = f.partial(point_shift, window=window_size) scaler = img_scaler(page_scale) make_observations = compose(prepare_features, real_fft, weighter, std_dev_contrast_stretch) img, label = open_image_label(*page_paths) img, label = scaler(img, label) f_img = prepare_fft_image(img, window_size) access_img = img_accessor(img, identity) access_label = img_accessor(label, identity) access_f_img = img_accessor(f_img, compose(windower, shifter)) ### End of duplication labels = [a[0] for a in labelled_centroids] centroids = np.asarray([a[1] for a in labelled_centroids]) new_label = np.zeros_like(label) for s in img_slices(new_label.shape, 80): unlabelled_samples = sample_all_in_area(s, applier(identity, compose(make_observations, access_f_img))) coords = [a[0] for a in unlabelled_samples] observations = vq.whiten(np.asarray([a[1] for a in unlabelled_samples])) codes, dist = vq.vq(observations, centroids) for i, code in zip(coords, codes): new_label[i] = labels[code] return new_label
def do_cluster(cluster_count, filename):
    """Use the scipy k-means clustering algorithms to cluster data.

    Return the item names for the smallest cluster.
    """
    input = Data(filename, -1)
    d = vq.whiten(input.data.transpose())
    codebook, avg_distortion = vq.kmeans(d, cluster_count, 150)
    codes, distortions = vq.vq(d, codebook)
    # codes is now a vector of cluster assignments
    # it is ordered the same as data elements in input

    c_sizes = {}
    small_i = 0
    if DEBUG: print "Cluster Sizes: ",
    for i in range(cluster_count):
        c_sizes[i] = count(codes, i)
        if DEBUG: print c_sizes[i],
    if DEBUG: print

    for i in range(cluster_count):
        if c_sizes[i] < c_sizes[small_i]:
            small_i = i

    if DEBUG: print "Smallest cluster size: " + str(c_sizes[small_i])

    return [input._names[i] for i in findall(codes, small_i)]
def cluster(self, graph): """ Take a graph and cluster using the method in "On spectral clusering: analysis and algorithm" by Ng et al., 2001. :param graph: the graph to cluster :type graph: :class:`apgl.graph.AbstractMatrixGraph` :returns: An array of size graph.getNumVertices() of cluster membership """ L = graph.normalisedLaplacianSym() omega, Q = numpy.linalg.eig(L) inds = numpy.argsort(omega) #First normalise rows, then columns standardiser = Standardiser() V = standardiser.normaliseArray(Q[:, inds[0:self.k]].T).T V = vq.whiten(V) #Using kmeans2 here seems to result in a high variance #in the quality of clustering. Therefore stick to kmeans centroids, clusters = vq.kmeans(V, self.k, iter=self.numIterKmeans) clusters, distortion = vq.vq(V, centroids) return clusters
def recognize(wavfn): samplerate, w = wavfile.read(open(wavfn)) mfcc = run_mfcc(samplerate, w, FRAME_SIZE, STEP, NUM_COEFFICIENTS) sample_length = mfcc.shape[0] whitened = vq.whiten(mfcc) def getfile(x): return os.path.join(DATADIR, x) sq_sum_candidates = [] cos_sim_candidates = [] for dirname in os.listdir(DATADIR): codebook_fn = os.path.join(DATADIR, dirname, CODEBOOK_FN) if not os.path.isfile(codebook_fn): continue codebook, dist_1 = numpy.load(open(codebook_fn, 'rb')) code, dist = vq.vq(whitened, codebook) sq_sum_candidates.append((sum(dist*dist)/sample_length, dirname)) cos_dist = [] for c, d, w in zip(code, dist, whitened): cdist = cosine_distance(codebook[c], w) cos_dist.append(cdist) cdista = numpy.array(cos_dist) cos_sim_candidates.append((sum(cdista)/sample_length, dirname)) #print 'Order by square-sum error ascending:' #for score, person in sorted(sq_sum_candidates): #print '\t', score, person print 'Cosine similarity' for score, person in sorted(cos_sim_candidates, reverse=True): print '\t', score, person
def manfredor(list_obj, rules, num_cluster=10): score_list = [] for obj in list_obj: score_list.append(obj.computeScore(rules)) #Normalize observations whitened = scv.whiten(score_list) #Compute Kmeans on the set of observations #centroids contains the center of each cluster centroids, _ = scv.kmeans(whitened, num_cluster) #Assign each sample to a cluster idx,_ = scv.vq(whitened, centroids) #Get index that will sort centroids rank = np.argsort(centroids) #Map a centroid to a rank rank_mapping = dict(zip([c for c in centroids], rank)) clustered = {} i = 0 for obj in list_obj: cluster_of_obs = idx[i] centroid = centroids[cluster_of_obs] #map url to rank clustered[obj.url] = rank_mapping[centroid] i += 1 sorted_cluster = sorted(clustered.iteritems(), key=operator.itemgetter(1)) return sorted_cluster
def initialize(self, poses, rest_pose, num_bones, iterations, mayaMesh=None, jointList=None): bones = [] num_verts = rest_pose.shape[0] # shape mean array scale num_poses = poses.shape[0] bone_transforms = np.empty( (num_bones, num_poses, 4, 3)) # [(R, T) for for each pose] for each bone # 3rd dim has 3 rows for R and 1 row for T # Use k-means to assign bones to vertices whitened = whiten(rest_pose) codebook, _ = kmeans(whitened, num_bones) rest_pose_corrected = np.empty( (num_bones, num_verts, 3)) # Rest pose - mean of vertices attached to each bone # confirm mode if mayaMesh: #rigid Skin vert_assignments, bones = self.manual_codebook(mayaMesh, jointList) boneArray = [] for i in bones: boneArray.append(cmds.xform(i, q=1, t=1, ws=1)) self.rest_bones_t = np.array(boneArray) #rest_bones_t = np.empty((num_bones , 3)) for bone in range(num_bones): #rest_bones_t[bone] = np.mean(rest_pose[vert_assignments == bone] , axis = 0) self.rest_bones_t[bone] = np.array(boneArray[bone]) rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone] for pose in range(num_poses): bone_transforms[bone, pose] = self.kabsch( rest_pose_corrected[bone, vert_assignments == bone], poses[pose, vert_assignments == bone]) else: # Compute initial random bone transformations vert_assignments, _ = vq( whitened, codebook) # Bone assignment for each vertex (|num_verts| x 1) self.rest_bones_t = np.empty( (num_bones, 3)) # Translations for bones at rest pose for bone in range(num_bones): self.rest_bones_t[bone] = np.mean( rest_pose[vert_assignments == bone], axis=0) rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone] for pose in range(num_poses): bone_transforms[bone, pose] = self.kabsch( rest_pose_corrected[bone, vert_assignments == bone], poses[pose, vert_assignments == bone]) for it in range(iterations): # Re-assign bones to vertices using smallest reconstruction error from all poses constructed = np.empty( (num_bones, num_poses, num_verts, 3)) # |num_bones| x |num_poses| x |num_verts| x 3 for bone in range(num_bones): Rp = bone_transforms[bone, :, :3, :].dot( (rest_pose - self.rest_bones_t[bone]).T).transpose( (0, 2, 1)) # |num_poses| x |num_verts| x 3 # R * p + T constructed[bone] = Rp + bone_transforms[bone, :, np.newaxis, 3, :] errs = np.linalg.norm(constructed - poses, axis=(1, 3)) # position value average vert_assignments = np.argmin(errs, axis=0) # For each bone, for each pose, compute new transform using kabsch for bone in range(num_bones): self.rest_bones_t[bone] = np.mean( rest_pose[vert_assignments == bone], axis=0) rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone] for pose in range(num_poses): P = rest_pose_corrected[bone, vert_assignments == bone] Q = poses[pose, vert_assignments == bone] if (P.size == 0 or Q.size == 0): print 'Skip Iteration' else: bone_transforms[bone, pose] = self.kabsch(P, Q) # jointList is correct Index Joint return bone_transforms, self.rest_bones_t, bones
def search(query, n=40, start=0): # retrieve top n results of query # default is 40 results per page dict_res = BossImageIndex().CallBoss(query, n, start) im_res = dict_res['ysearchresponse']['resultset_images'] res = [] for i in xrange(n): res.append((im_res[i]['thumbnail_url'], i)) #path_name = "/Library/WebServer/results/"+query path_name = "/Users/novi/my_image_search/results/" + query # create the folder (if does not exist) to save query results if os.path.isdir(path_name): shutil.rmtree(path_name) os.mkdir(path_name) else: os.mkdir(path_name) # download the image results image = urllib.URLopener() silentcounter = 1 imagefile = [] for counter in xrange(n): urltoberetrieved = res[counter][0] #print urltoberetrieved filename = '%s/%s.%s' % (path_name, silentcounter, 'jpg') #try: image.retrieve(urltoberetrieved, filename) imagefile.append(filename) silentcounter = silentcounter + 1 #except IOError: # print 'error at %s \n' % (urltoberetrieved) # pass # prepare the color image feature pref = numpy.array([[0, 0]]) # [image #,position #] ldesc = [] codes = 30 #number of k-means cluster ino = 5 jno = 8 # default grid: 5 by 8 2D grid show = ino * jno lim = show silentcounter = 1 for i_img in xrange(lim): fname = imagefile[i_img] try: im = cv.LoadImage(fname, 0) # loading with OpenCV (gray chanel only) silentcounter = silentcounter + 1 except: print 'image thumbnail can not be retrieved' sys.exit(0) #resizing the image #om = cv.CreateImage((psize,psize),im.depth,im.nChannels) #cv.Resize(im,om,cv.CV_INTER_CUBIC) storage = cv.CreateMemStorage(0) #generating the mask #mat = cv.CreateMat(psize,psize,cv.CV_8UC1) #extracting SURF feature #[keypoints,descriptors] = cv.ExtractSURF(om,mat,storage,(1,500,3,4)) [keypoints, descriptors] = cv.ExtractSURF(im, im, storage, (1, 500, 3, 4)) ldesc.append(descriptors) #perform vector quantization tarrdesc = [numpy.array(ldesc[i]) for i in range(show)] lendesc = [ldesc[i].__len__() for i in range(show)] arrdesc = numpy.concatenate([tarrdesc[i] for i in range(show)]) arrdesc = whiten(arrdesc) [codebook, distortion] = kmeans(arrdesc, codes) [code, dist] = vq(arrdesc, codebook) #generate the semantic feature imgdata = numpy.zeros((show, codebook.shape[0]), dtype=float) code_offset = 0 for i_img in xrange(show): code_index = range(code_offset, code_offset + lendesc[i_img]) for i_code in code_index: imgdata[i_img, code[i_code]] = imgdata[i_img, code[i_code]] + 1 code_offset = code_offset + lendesc[i_img] #normalize the semantic feature sumimgdata = numpy.sum(imgdata, axis=1) sumimgdata.shape = show, 1 imgdata = imgdata / sumimgdata griddata = numpy.zeros((2, ino * jno)) griddata[0, ] = numpy.kron(range(1, ino + 1), numpy.ones((1, jno))) griddata[1, ] = numpy.tile(range(1, jno + 1), (1, ino)) # do kernelized sorting procedure PI = KS(imgdata, griddata.T, pref) i_sorting = PI.argmax(axis=1) #creating the passed dictionary sorted_dict_res = {} sorted_dict_res['count'] = dict_res['ysearchresponse']['count'] sorted_dict_res['totalhits'] = dict_res['ysearchresponse']['totalhits'] sorted_dict_res['start'] = dict_res['ysearchresponse']['start'] sorted_dict_res['resultset_images'] = [ dict_res['ysearchresponse']['resultset_images'][i] for i in i_sorting ] return sorted_dict_res
topic_word = model.topic_word_ for i in range(len(doc_topic)): topic_most_pr = doc_topic[i].argsort() keywords = [topic_word[topic_most_pr[n]].argmax() for n in range(K)] ##话题中概率最大的词 print('*keywords {}'.format([word[n] for n in keywords])) ### cluster candidates words by topic/svd from scipy import spatial from scipy.cluster.vq import kmeans, vq, whiten word_topic = topic_word.transpose() # 词-话题向量 for n in range(len(doc_topic)): keywords = [] data = [(w, word.index(w)) for w in candidates[n] if w in word] cand_vec = word_topic[[w[1] for w in data], :] # 候选词-话题向量 centroids, _ = kmeans(whiten(cand_vec), K) for i in range(K): min_dist = 100 near_word = -1 for j in range(len(cand_vec)): a = np.dot(centroids[i, :], cand_vec[j, :]) if a <= min_dist and j not in keywords: min_dist = a near_word = j keywords.append(near_word) keywords = [data[w][0] for w in keywords] print('*keywords {}'.format(keywords))
'''FIFA 18 is a football video game that was released in 2017 for PC and consoles.
The dataset that you are about to work on contains data on the 1000 top individual
players in the game. You will explore various features of the data as we move ahead
in the course. In this exercise, you will work with two columns, eur_wage, the wage
of a player in Euros, and eur_value, their current transfer market value.

The data for this exercise is stored in a Pandas dataframe, fifa.
whiten from scipy.cluster.vq and matplotlib.pyplot as plt have been pre-loaded.'''

import pandas as pd
from scipy.cluster.vq import whiten
from matplotlib import pyplot as plt

fifa = pd.read_csv(
    '/Users/vaibhav/Desktop/Python Projects/DataCamp-Data Scientist with python/26-Cluster Analysis in Python /Introduction to Clustering /fifa.csv',
    index_col=0)

# Scale wage and value
fifa['scaled_wage'] = whiten(fifa['eur_wage'])
fifa['scaled_value'] = whiten(fifa['eur_value'])

# Plot the two columns in a scatter plot
fifa.plot(x='scaled_wage', y='scaled_value', kind='scatter')
plt.show()
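# A minimal added sketch (assuming the fifa dataframe loaded above) of what whiten does:
# it divides each feature by its standard deviation across all observations, so the
# scaled columns end up with roughly unit variance. Illustrative check only, not part
# of the original exercise.
import numpy as np

std_wage = fifa['eur_wage'].std(ddof=0)      # whiten uses the population std (ddof=0)
manual_scaled = fifa['eur_wage'] / std_wage
assert np.allclose(manual_scaled, fifa['scaled_wage'])
print(fifa['scaled_wage'].std(ddof=0))       # ~1.0 after whitening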
data = json.load(f) print(type(data)) i = 0 usr_prob = np.ndarray((1334, 5)) for usr in data: usr_prob[i] = usr["problems"] i += 1 # print(i) # print(usr_prob) # print(data) # usr_data = np.array(usr_prob) x, y = kmeans2(whiten(usr_prob), 5, iter=20) # y.dtype = np.int64 print(type(y)) y = y.tolist() # print(x) print(y[0]) print(len(y)) # # # print(data["coordinates"]["lat"]) # j = 0 locn_cluster = {"lat": [], "long": [], "cluster": []} # print(len(y))
import scipy.cluster.hierarchy as sch
from scipy.cluster.vq import vq, kmeans, whiten
import numpy as np
import matplotlib.pylab as plt

# Data points to be clustered: cancer.csv has 653 rows, each with 11 dimensions
dataset = np.loadtxt('cancer.csv', delimiter=",")

# numpy indexing starts at 0; column 0 is a row id and column 10 is the label,
# so the features are columns 1 to 9
points = dataset[:, 1:9]
cancer_label = dataset[:, 10]
print "points:\n", points
print "cancer_label:\n", cancer_label

# k-means clustering
# normalize the raw data
data = whiten(points)

# Cluster with the kmeans function: the first argument is the data, the second the number of clusters k.
# Sometimes we do not know in advance how many clusters there should be; one option is to initialize
# from a hierarchical clustering result, or simply pass a number directly.
# kmeans returns a pair (centroids, distortion); we only keep the centroids, hence the [0].
#centroid = kmeans(data,max(cluster))[0]
centroid = kmeans(data, 2)[0]
print centroid

# Use vq to assign every observation to a centroid; vq also returns a pair,
# and [0] is the label of each observation.
label = vq(data, centroid)[0]

num = [0, 0]
for i in label:
    if i == 0:
        num[0] = num[0] + 1
    else:
        num[1] = num[1] + 1
print 'num =', num
#and if our contour aspect ratio is "long" we draw the bounding box #note that this just checks the vert/horiz direction at present. x,y,w,h = cv2.boundingRect(c) vert_aspect_ratio = float(h)/w horiz_aspect_ratio = float(w)/h #print 'x ' + str(x) + ' y ' + str(y) + ' w ' + str(w) + ' h ' + str(h) #if horiz_aspect_ratio > 2 or vert_aspect_ratio > 2: #cv2.rectangle(img,(x,y),(x+w,y+h),(0,0,255),2) #draw in all contours to see how they fall #contour_sizes.append([float(x)*4,float(y)*4,max(float(w),float(h))])#,horiz_aspect_ratio,vert_aspect_ratio]) contour_sizes.append([cx*8.0,cy*8.0,max(float(w),float(h))/8.0])#,horiz_aspect_ratio,vert_aspect_ratio]) contour_lookup.append(c) #contour_sizes.append([float(x),float(w),float(h)])#,horiz_aspect_ratio]) #cv2.drawContours(img,[c],0,(0,255,0),1) whitened_contour_sizes = clustering.whiten(contour_sizes) #print str(contour_sizes) # let scipy do its magic (k==3 groups) centers,dist = clustering.kmeans(whitened_contour_sizes,75,iter=100) code, distance = clustering.vq(whitened_contour_sizes,centers) #print str(centroid) #print str(code) #print 'contours is ' + str(len(contour_sizes)) + ' and code is ' + str(len(code)) colors = [( int(random.uniform(0, 255)),int(random.uniform(0, 255)),int(random.uniform(0, 255))) for i in code ] #print str(colors) for i, label in enumerate(code): color = colors[label] x,y,w,h = cv2.boundingRect(contour_lookup[i])
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 14 02:02:38 2019

@author: js
"""
import numpy as np
from scipy.cluster.vq import vq, kmeans, whiten

list1 = [88.0, 74.0, 96.0, 85.0]
list2 = [92.0, 99.0, 95.0, 94.0]
list3 = [91.0, 87.0, 99.0, 95.0]
list4 = [78.0, 99.0, 97.0, 81.0]
list5 = [88.0, 78.0, 98.0, 84.0]
list6 = [100.0, 95.0, 100.0, 92.0]

data = np.array([list1, list2, list3, list4, list5, list6])
# store the result in its own variable to avoid shadowing the whiten function
whitened = whiten(data)

centroids, _ = kmeans(whitened, 2)
result, _ = vq(whitened, centroids)
print(result)
def analyze_color(input_image, transparency_threshold=50, plot_3d=False, plot_bar=True, n_cluster=None, max_cluster=10, ignore_pure_black=True, use_sample=True, return_colors=True): # Copy to prevent modification (useful but mechanism needs clarification) input_image = input_image.copy() # Check input shape assert (len(input_image.shape) == 3) assert (input_image.shape[-1] in {3, 4}) # Turn color info of pixels into dataframe, filter by transparency if RGBA image is passed if input_image.shape[-1] == 4: color_df = pd.DataFrame(input_image.reshape(-1, 4), columns=list('rgba')) # Get the rgb info of pixels in the non-transparent part of the image color_df = color_df[color_df['a'] >= transparency_threshold] if input_image.shape[-1] == 3: color_df = pd.DataFrame(input_image.reshape(-1, 3), columns=list('rgb')) if ignore_pure_black: color_df = color_df[~((color_df['r'] == 0) & (color_df['g'] == 0) & (color_df['b'] == 0))] # Handle large pixel color_df if not use_sample and len(color_df) > 1e5: sample_or_not = (input( 'Large image detected, would you like to sample the pixels in this image? (Y/N) ' )).lower()[0] == 'y' if sample_or_not: print( 'Sampled 100,000 pixels from the image, note that you can also resize the image before passing it to this function.' ) color_df = color_df.sample(n=int(1e5), random_state=0) else: print( 'Not sampling performed, but note that rendering 3D plot for the pixels may crash your session and K-means clustering will be slow.' ) # Get std for reverse-transform the kmeans results to a meaningful rgb palette r_std, g_std, b_std = color_df[list('rgb')].std() reverse_whiten_array = np.array((r_std, g_std, b_std)) # Normalize observations on a per feature basis, forcing features to have unit variance # Doc: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.vq.whiten.html for color in list('rgb'): color_df['scaled_' + color] = whiten(color_df[color]) ## 3D scatter plot showing color groups if plot_3d: trace = go.Scatter3d( x=color_df['r'], y=color_df['g'], z=color_df['b'], mode='markers', marker=dict(color=[ 'rgb({},{},{})'.format(r, g, b) for r, g, b in zip(color_df['r'].values, color_df['g'].values, color_df['b'].values) ], size=1, opacity=1)) layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0)) fig = go.Figure(data=[trace], layout=layout) fig.show() ## Use K-means to identify main colors cluster_centers_list = [] avg_distortion_list = [] if n_cluster != None: n_cluster_range = [n_cluster - 1] # note minus 1 to get exactly n else: n_cluster_range = range(max_cluster + 1) if plot_bar: # Initialize plt graph f, ax = plt.subplots(len(n_cluster_range), 1, figsize=(10, 10)) for n in n_cluster_range: ###### Train clusters ###### cluster_centers, avg_distortion = kmeans( color_df[['scaled_r', 'scaled_g', 'scaled_b']], n + 1) ###### Assign labels ###### labels, distortions = vq( color_df[['scaled_r', 'scaled_g', 'scaled_b']], cluster_centers) color_df['label'] = labels color_df['distortion'] = distortions ###### Build palette ###### # These parameter affects visual style only and can be exposed to user later height = 200 width = 1000 gap_size = 5 palette = np.zeros((height, width, 3), np.uint8) # Count how many pixels falls under which category, let this decides the color's relative width in the palette cluster_proportion = color_df['label'].value_counts().sort_index( ) / len(color_df) cluster_width_list = (cluster_proportion * width).to_list() cluster_width_list = [ int(x) for x in saferound(cluster_width_list, places=0) ] # Reorder clusters and widths 
according to the proportion, largest to smallest reordered_cluster_df = pd.DataFrame( zip(cluster_centers, cluster_width_list), columns=['cluster', 'width']).sort_values('width', ascending=False) cluster_centers = reordered_cluster_df['cluster'].tolist() cluster_width_list = reordered_cluster_df['width'].tolist() # Storing information cluster_centers_list.append(cluster_centers) avg_distortion_list.append(avg_distortion) if plot_bar: # Coloring the palette canvas based on color and width endpoints = list(np.cumsum(cluster_width_list)) startpoints = [0] + endpoints[:-1] for cluster_index in range(len(cluster_centers)): # Notice here we apply the reverse_whiten_array to get meaningful RGB colors palette[:, startpoints[cluster_index] + gap_size: endpoints[cluster_index], :] = cluster_centers[ cluster_index] * reverse_whiten_array palette[:, startpoints[cluster_index]:startpoints[cluster_index] + gap_size, :] = (255, 255, 255) # Displaying the palette when performing K-means with parameter n if n_cluster != None: ax.imshow(palette) ax.axis('off') else: ax[n].imshow(palette) ax[n].axis('off') if plot_bar: ### Show the entire palette f.tight_layout() plt.show() ### Show the elbow plot for choosing best n_cluster parameter for K-means fig = plt.figure() plt.scatter(x=n_cluster_range, y=avg_distortion_list) fig.suptitle('Elbow Plot for K-means') plt.xlabel('Number of Clusters') plt.ylabel('Average Distortion') print() if return_colors: if n_cluster != None: return (cluster_centers_list[0] * reverse_whiten_array).astype( np.uint8) else: return [(cluster_centers * reverse_whiten_array).astype(np.uint8) for cluster_centers in cluster_centers_list]
def clusterSounds(targetDir, nCluster=-1, descInput=[]): """ This function clusters all the sounds in targetDir using kmeans clustering. Input: targetDir (string): Directory where sound descriptors are stored (all the sounds in this directory will be used for clustering) nCluster (int): Number of clusters to be used for kmeans clustering. descInput (list) : List of indices of the descriptors to be used for similarity/distance computation (see descriptorMapping) Output: Prints the class of each cluster (computed by a majority vote), number of sounds in each cluster and information (sound-id, sound-class and classification decision) of the sounds in each cluster. Optionally, you can uncomment the return statement to return the same data. """ dataDetails = fetchDataDetails(targetDir) ftrArr = [] infoArr = [] if nCluster == -1: nCluster = len(dataDetails.keys()) for cname in dataDetails.keys(): #iterating over sounds for sname in dataDetails[cname].keys(): ftrArr.append( convFtrDict2List( dataDetails[cname][sname]['feature'])[descInput]) infoArr.append([sname, cname]) ftrArr = np.array(ftrArr) infoArr = np.array(infoArr) ftrArrWhite = whiten(ftrArr) centroids, distortion = kmeans(ftrArrWhite, nCluster) clusResults = -1 * np.ones(ftrArrWhite.shape[0]) for ii in range(ftrArrWhite.shape[0]): diff = centroids - ftrArrWhite[ii, :] diff = np.sum(np.power(diff, 2), axis=1) indMin = np.argmin(diff) clusResults[ii] = indMin ClusterOut = [] classCluster = [] globalDecisions = [] for ii in range(nCluster): ind = np.where(clusResults == ii)[0] freqCnt = [] for elem in infoArr[ind, 1]: freqCnt.append(infoArr[ind, 1].tolist().count(elem)) indMax = np.argmax(freqCnt) classCluster.append(infoArr[ind, 1][indMax]) print( "\n(Cluster: " + str(ii) + ") Using majority voting as a criterion this cluster belongs to " + "class: " + classCluster[-1]) print("Number of sounds in this cluster are: " + str(len(ind))) decisions = [] for jj in ind: if infoArr[jj, 1] == classCluster[-1]: decisions.append(1) else: decisions.append(0) globalDecisions.extend(decisions) print("sound-id, sound-class, classification decision") ClusterOut.append(np.hstack((infoArr[ind], np.array([decisions]).T))) print(ClusterOut[-1]) globalDecisions = np.array(globalDecisions) totalSounds = len(globalDecisions) nIncorrectClassified = len(np.where(globalDecisions == 0)[0]) print( "Out of %d sounds, %d sounds are incorrectly classified considering that one cluster should " "ideally contain sounds from only a single class" % (totalSounds, nIncorrectClassified)) print( "You obtain a classification (based on obtained clusters and majority voting) accuracy " "of %.2f percentage" % round( float(100.0 * float(totalSounds - nIncorrectClassified) / totalSounds), 2))
k = 64 #24 for ko in range(numpat): kxOn = conv.kxPos(ko, nx, ny, nf) kyOn = conv.kyPos(ko, nx, ny, nf) p = w.next_patch() if marginstart < kxOn < marginend: if marginstart < kyOn < marginend: acount = acount + 1 if kxOn == margin + 1 and kyOn == margin + 1: d = p else: d = np.vstack((d,p)) wd = sp.whiten(d) result = sp.kmeans2(wd, k) cluster = result[1] k2 = k / 2 nx_im = 2 * (nxp + space) + space ny_im = k2 * (nyp + space) + space im = np.zeros((nx_im, ny_im)) im[:,:] = (w.max - w.min) / 2. nx_im2 = nx * (nxp + space) + space ny_im2 = ny * (nyp + space) + space
from scipy.cluster.vq import kmeans, vq, whiten
from numpy import vstack, array
from numpy.random import rand

# data generation with three features
data = vstack((rand(100, 3) + array([.5, .5, .5]), rand(100, 3)))
print(data)

# whitening of data
data = whiten(data)
print(data)

# computing K-Means with K = 3 (3 clusters)
print("-------------computing K-Means with K = 3 (3 clusters)--------------")
centroids, _ = kmeans(data, 3)
print(centroids)

# assign each sample to a cluster
clx, _ = vq(data, centroids)

# check clusters of observation
print(clx)
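# Follow-up sketch (an addition, not part of the original example): centroids returned
# by kmeans on whitened data live in the scaled space. Multiplying them by the
# per-feature standard deviation of the original data recovers centroids in the
# original units (the same idea as the reverse_whiten_array used elsewhere in these
# examples). Uses the imports from the snippet above.
raw = vstack((rand(100, 3) + array([.5, .5, .5]), rand(100, 3)))
std_devs = raw.std(axis=0)                    # the factors whiten divides by
centroids_scaled, _ = kmeans(whiten(raw), 3)
centroids_original = centroids_scaled * std_devs
print(centroids_original)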
print('Processed ' + str(fileCount) + ' files') print('Preprocessing...') # preprocess data allWords = list() for i in range(0, len(docs)): for j in range(0, len(docs[i])): docs[i][j] = wmd.preproc(docs[i][j]) allWords += docs[i][j] dict = Dictionary(documents=[allWords]) # create nbow vectors print('Computing vectors...') vectors = list() for i in range(0, len(docs)): for j in range(0, len(docs[i])): vector = whiten(KeyedVectors.nbow( docs[i][j], dict)) # whitening increases accuracy # vector = KeyedVectors.nbow(docs[i][j], dict) vector = np.append(vector, i) # add class column vectors.append(vector) df = pd.DataFrame(vectors) # print('DATA FRAME:\n' + str(df)) # split data to train and test sets random_indices = permutation(df.index) test_cutoff = math.floor(len(df) / 5) # 80:20 ratio test_set = df.loc[random_indices[1:test_cutoff]] train_set = df.loc[random_indices[test_cutoff:]] # choose x and y sets (x - vectors columns; y - class) x_columns = list(range(0, len(vector) - 1))
def KMEANS(self): # clusters K = 3 data_arr = [] meal_name_arr = [] with open('./NewDataSet/Cluster_dataset/clusterisbnrate.csv', 'rb') as f: reader = csv.reader(f) for row in reader: if reader.line_num != 1: '''for x in row[2:]: print x''' data_arr.append([float(x) for x in row[1:]]) meal_name_arr.append([row[0]]) data = vstack(data_arr) print "data :" print data meal_name = vstack(meal_name_arr) # normalization data = whiten(data) # computing K-Means with K (clusters) centroids, distortion = kmeans(data, 3) print "distortion = " + str(distortion) # assign each sample to a cluster cntr = [] print("Centroids:") print centroids cntr = centroids print("Cntr :") print cntr print "---------------------------------------------------------" print("Centroids after sort:") #centroids=cntr.sort() #print centroids.sort() print "---------------------------------------------------------" idx, _ = vq(data, centroids) print "idx:" print idx print "-----------------------------------------------------------" '''# some plotting using numpy's logical indexing plot(data[idx==0,0], data[idx==0,1],'ob', data[idx==1,0], data[idx==1,1],'or', data[idx==2,0], data[idx==2,1],'og')''' print meal_name print data for i in range(K): print centroids[i] * 3 #print round(centroids[i]) print "max value:" max1 = max(centroids) print "min value:" min1 = min(centroids) toprated = [] lowrated = [] medrated = [] for i in range(K): result_names = meal_name[idx == i, 0] print "=================================" vv = round(centroids[i]) print vv name = "" print "Cluster " + str(i + 1) for name1 in result_names: name = name1 print name1 '''if(i== 0) : f1.write(name) elif (i==1): f2.write(name) elif (i==2): f3.write(name)''' if (centroids[i] == max1): #for name1 in result_names: toprated.append(name) name = name1 + "\n" f1.write(name) elif (centroids[i] == min1): lowrated.append(name) name = name1 + "\n" f3.write(name) else: medrated.append(name) name = name1 + "\n" f2.write(name) print "--------------------------------------------------------------------------" print "toprated:" print toprated print "--------------------------------------------------------------------------" print "medrated:" print medrated print "--------------------------------------------------------------------------" print "lowrated:" print lowrated print "--------------------------------------------------------------------------" '''plot(centroids[:,0], centroids[:,1], 'sg',markersize=8)''' show()
xycoords='data', arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.5', color='k', alpha=0.8)) plt.show() # In[4]: # N is the number of observations to group into k clusters N = len(coordinates) # normalize the coordinate data with the whiten function # each feature is divided by its standard deviation across all observations to give it unit variance. w = whiten(coordinates) # k is the number of clusters to form k = 100 # i is the number of iterations to perform i = 50 # In[5]: # performs k-means on a set of observation vectors forming k clusters # returns a k-length array of cluster centroid coordinates, and the final distortion cluster_centroids1, distortion = kmeans(w, k, iter=i) # plot the cluster centroids plt.figure(figsize=(10, 6), dpi=100)
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer converts the words in the texts into a term-frequency matrix;
# element a[i][j] is the frequency of word j in document i
vectorizer = CountVectorizer(min_df=20, stop_words='english')
# TfidfTransformer computes the tf-idf weight of every word
transformer = TfidfTransformer()
# the outer fit_transform computes tf-idf, the inner one builds the term-frequency matrix
tfidf = transformer.fit_transform(vectorizer.fit_transform(documents_words))
# get all words in the bag-of-words model
word = vectorizer.get_feature_names()
# extract the tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i
features = tfidf.toarray()

target = [c for (d, c) in documents]

data = whiten(features)
centroids, _ = kmeans(data, 2)
idx, _ = vq(data, centroids)

target1 = [1 if x == 'pos' else 0 for x in target]
a = sum(target1 == idx) / len(target1)
print('scipy_eu=', max(a, 1 - a))

from nltk.cluster import KMeansClusterer, cosine_distance
clus = KMeansClusterer(2, cosine_distance)
results = clus.cluster(data, True, trace=False)
a = sum(np.array(target1) == results) / len(target1)
print('nltk_cosdis=', max(a, 1 - a))

from Bio.Cluster import kcluster
test_data = pd.read_csv("data/test.csv") data = pd.concat((train_data, test_data), axis=0, ignore_index=True) data.drop(['ID', 'v22'], axis=1, inplace=True) data.fillna(0, inplace=True) catagorical_features = [] numeric_features = [] for f in data.columns: if data[f].dtype == 'object': catagorical_features.append(f) else: numeric_features.append(f) data_num = whiten(data[numeric_features]) data_cat = pd.get_dummies(data[catagorical_features], columns=catagorical_features) trlen = train_data.shape[0] train = np.hstack((data_num[:trlen], data_cat[:trlen])) test = np.hstack((data_num[trlen:], data_cat[trlen:])) labels = label_data.astype(int) xtrain, xtest, ytrain, ytest = train_test_split(train, labels, train_size=0.7) model = skflow.TensorFlowDNNClassifier(hidden_units=[128, 128, 128], learning_rate=0.01, n_classes=2, batch_size=128, steps=10000)
# Make sure we have the primefac-fork try: import primefac # pylint: disable=W0611 # NOQA except ImportError: raise ImportError('Need to install fork of primefac: ' 'https://github.com/elliptic-shiho/' 'primefac-fork') # Radially sampled Shepp-Logan N, spokes, nc = 288, 72, 8 kx, ky = radial(N, spokes) kx = np.reshape(kx, (N, spokes), 'F').flatten() ky = np.reshape(ky, (N, spokes), 'F').flatten() k = kspace_shepp_logan(kx, ky, ncoil=nc) k = whiten(k) # whitening seems to help conditioning of Gx, Gy # Put in correct shape for radialgrappaop k = np.reshape(k, (N, spokes, nc)) kx = np.reshape(kx, (N, spokes)) ky = np.reshape(ky, (N, spokes)) # Get the GRAPPA operators! t0 = time() Gx, Gy = radialgrappaop(kx, ky, k) print('Gx, Gy computed in %g seconds' % (time() - t0)) # Put in correct order for GROG kx = kx.flatten() ky = ky.flatten() k = np.reshape(k, (-1, nc))
def train_categorical_feature(feature_input, outcome, limit, number_of_clusters): input = feature_input.values if len(pd.unique(input)) == 2: vocabulary = np.unique(input) p = np.array([0, 1]) d = np.zeros(len(input), dtype=np.int) d[input == vocabulary[1]] = 1 output = dict(zip(["d", "vocabulary", "p"], [d, vocabulary, p])) print output return output vocabulary_t = pd.unique(input) count_1 = np.zeros(len(vocabulary_t), dtype=int) count_0 = np.copy(count_1) outcome_1 = outcome.values == 1 outcome_0 = outcome.values == 0 for index, item in enumerate(vocabulary_t): if pd.notnull(item): count_1[index] = sum((input == item) * (outcome_1)) count_0[index] = sum((input == item) * (outcome_0)) else: count_1[index] = sum(pd.isnull(input) * (outcome_1)) count_0[index] = sum(pd.isnull(input) * (outcome_0)) condition = (count_0 + count_1) >= limit condition[pd.isnull(vocabulary_t)] = True # n = sum(condition) # vocabulary = np.zeros(n, dtype = str) # p = np.zeros(n) def log_ratio(count_1, count_0): if count_1 == 0: return log(1 / (2 * float(count_0))) elif count_0 == 0: return log(2 * count_1) else: return log(count_1 / float(count_0)) v_log_ratio = np.vectorize(log_ratio) vocabulary = vocabulary_t[condition] p = v_log_ratio(count_1[condition], count_0[condition]) # index = 0 # for i in range(len(vocabulary_t)): # if (condition[i]): # vocabulary[index] = str(vocabulary_t[index]) # p[index] = log_ratio(count_1[index], count_0[index]) # index = index + 1 # if (count_1[index] == 0): # p[index] = log(1./(2*count_0[index])) # elif (count_0[index] == 0): # p[index] = log(2*count_1[index]) # else: # p[index] = log(count_1[index]./count_0[index]) # print "sum(condition == 0) is {0}".format(sum(condition == 0)) if sum(condition == 0) <= 1: if sum(condition == 0) == 1: p = np.append( p, log_ratio(count_1[condition == 0][0], count_0[condition == 0][0])) # if (count_1[condition == 0][0] == 0): # p[condition == 0] = log(1./(2*count_0[condition == 0][0])) # elif (count_0[condition == 0] == 0): # p[condition == 0] = log(2*count_1[condition == 0][0]) # else: # p[condition == 0] = log(count_1[condition == 0][0]./count_0[condtion == 0][0]) vocabulary = np.append(vocabulary, vocabulary_t[condition == 0]) else: # print "number of clusters {0}".format(number_of_clusters) cl = min(number_of_clusters, sum(condition == 0) - 1) # why is it -1 here? 
# cl_vocabulary = pd.DataFrame() # print "cl {0}".format(cl) residual_1 = count_1[condition == 0] residual_0 = count_0[condition == 0] # print "length of the residual_1 {0}".format(len(residual_1)) # s = np.zeros(len(residual_1)) s = v_log_ratio(residual_1, residual_0).reshape([len(residual_1), 1]) whitened = whiten(s) codebook = kmeans(whitened, cl)[0] code = vq(whitened, codebook)[0] # print "length of code {0}".format(len(code)) s1 = pd.Series(data=vocabulary_t[condition == 0]) # .astype(str) s2 = pd.Series(data=code) cl_vocabulary = pd.DataFrame.from_dict({ "cat_feature_input": s1, "cluster_id": s2 }) #print cl_vocabulary.axes cl_p = np.zeros(cl, dtype=float) # print cl_p, len(cl_p) for i in range(cl): # print i c1 = residual_1[code == i] c0 = residual_0[code == i] cl_p[i] = log_ratio(sum(c1), sum(c0)) # print "Hey" d = np.zeros(len(input)) d[pd.isnull(input)] = p[pd.isnull(vocabulary)] for i in range(len(vocabulary)): d[input == vocabulary[i]] = p[i] vocabulary = vocabulary.astype(str) if 'cl_vocabulary' in locals(): print "cl_vocabulary in locals()" for i in range(len(cl_vocabulary)): d[input == cl_vocabulary.loc[i, "cat_feature_input"]] = cl_p[ cl_vocabulary.loc[i, "cluster_id"]] #print cl_vocabulary.axes cl_vocabulary.loc[:, "cat_feature_input"] = cl_vocabulary[ "cat_feature_input"].astype(str) # print cl_vocabulary["cat_feature_input"].apply(type) output = dict( zip(["d", "vocabulary", "cl_vocabulary", "p", "cl_p"], [d, vocabulary, cl_vocabulary, p, cl_p])) else: output = dict(zip(["d", "vocabulary", "p"], [d, vocabulary, p])) #print output return output
line = segfile.readline() index = 0 while line != '': tokens = line.split(',') nums = map(float, tokens) nums = nums[2:len(line)] # Omit recid and segid if index == 0: segfeatures = nums else: segfeatures = np.vstack((segfeatures, nums)) line = segfile.readline() index += 1 segfeatures = whiten(segfeatures) kmeans1 = cluster.KMeans(n_clusters=k, init='k-means++', n_init=50, max_iter=300, random_state=rseed) kmeans2 = cluster.KMeans(n_clusters=kextra, init='k-means++', n_init=50, max_iter=300, random_state=rseed) clusters1 = kmeans1.fit_predict(segfeatures) clusters2 = kmeans2.fit_predict(segfeatures)
infiles = glob.glob('image_vectors/*.npz') # build ann index #t = AnnoyIndex(dims) for file_index, i in enumerate(infiles): file_vector = np.loadtxt(i) file_name = os.path.basename(i).split('.')[0] file_index_to_file_name[file_index] = file_name file_index_to_file_vector[file_index] = file_vector #whitened = whiten(file_vector) #t.add_item(file_index, file_vector) #t.build(trees) whitened = whiten(features) codes = 3 result = kmeans(whitened, codes) ''' # create a nearest neighbors json file for each input if not os.path.exists('nearest_neighbors'): os.makedirs('nearest_neighbors') for i in file_index_to_file_name.keys(): master_file_name = file_index_to_file_name[i] master_vector = file_index_to_file_vector[i] named_nearest_neighbors = [] nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors) for j in nearest_neighbors:
def displayResult(): noOfCluster=0 #Get Radio button input to check user choice chart = request.form['radio'] #If user choice is cluster if chart == 'cluster': noOfCluster =long(request.form['cluster']) data_arr = [] meal_name_arr = [] #Url of data csv url='https://storage.googleapis.com/cloudbucket786/imptry4.csv' response=urllib2.urlopen(url) reader = csv.reader(response) for row in reader: if row[5] is None: row[5]=0 if row[5]=='': row[5]=0 if "," in row[6] : rowVal=row[6].split(",") row[6]=rowVal[0]+''+rowVal[1] row[6]=float(row[6]) if row[6]=='': row[6]=0 if row[6]=='N' : row[6]=0 if "," in row[7] : rowVal=row[7].split(",") row[7]=rowVal[0]+''+rowVal[1] row[7]=float(row[6]) if row[7]=='': row[7]=0 if row[7]=='N' : row[7]=0 data_arr.append([float(x) for x in row[5:]])#adding data to data_array meal_name_arr.append([row[0]])#adding ids to second array #print data_arr fig = plt.figure() ax = fig.add_subplot(111, projection='3d')#We are using 3D projection as we are plotting 3D data data = vstack( data_arr ) meal_name = vstack(meal_name_arr) # normalization data = whiten(data)#Before running k-means, it is beneficial to rescale each feature dimension of the observation set with whitening. #Each feature is divided by its standard deviation across all observations to give it unit variance. # computing K-Means with K (clusters) centroids, distortion = kmeans(data,noOfCluster) # assign each sample to a cluster idx,_ = vq(data,centroids) # some plotting using numpy's logical indexing listOfColor=['ob','or','og','oc','om','ok','oy'] for index in range(noOfCluster): plot(data[idx==index,0], data[idx==index,1],data[idx==index,2],listOfColor[index])# using 3 objects for 3D projection for index in range(noOfCluster): result_names = meal_name[idx==index, 0] print "=================================" print "Cluster " + str(index+1) for name in result_names: print name plot(centroids[:,0], centroids[:,1], centroids[:,2], 'oy',markersize=8) #saving file to temp image #Assigning labels to axis ax.set_xlabel('X Label') ax.set_ylabel('Y Label') ax.set_zlabel('Z Label') pylab.savefig('temp.jpg') pylab.clf() image="https://www.pythonanywhere.com/user/abhitej/files/home/abhitej/temp.jpg" #Overwrites the image on pythonanywhere.com return render_template('home.html',image=image,display='display:block;') else: list=[] words=request.form['words'] list=words.split(",") list1=[] for s in list: list1.append(s.encode('ascii','ignore')) return render_template('home.html',list1=list1,display='display:none;')# Assigning display none for cluster if user chooce wordcloud
num_bins = len(use_features) * 11 all_data_orig = np.hstack( [all_data_orig_master[0, :, 0:3]] + [all_data_orig_master[AS_i, :, 3:] for AS_i in use_features]) # classifiers NN_classify = np.zeros(2) # 1st col mice | 2nd MDs LR_classify = np.zeros(2) GNB_classify = np.zeros(2) RF_classify = np.zeros(2) data = all_data_orig[:, 3:] labels = all_data_orig[:, 0:3] data = whiten(data) # "Z-score" train, labels_train, test, labels_test = split_data_in_half_randomly( data, labels) mice_train = day_to_mouse_average(train, labels_train) mice_test = day_to_mouse_average(test, labels_test) # NN vanilla classification strain_centers = mouse_to_strain_average(mice_train[:, 2:], mice_train[:, 0:2]) tot_cor = 0 for cnt, ms in enumerate(mice_test): min_dist = np.inf for k in range(strain_centers.shape[0]): distance = np.sqrt(((strain_centers[k] - ms[2:])**2).sum())
tokens = nltk.word_tokenize(ch_text.lower()) words = word_tokenizer.tokenize(ch_text.lower()) sentences = sentence_tokenizer.tokenize(ch_text) vocab = set(words) words_per_sentence = np.array( [len(word_tokenizer.tokenize(s)) for s in sentences]) fvs_lexical[e, 0] = words_per_sentence.mean() fvs_lexical[e, 1] = words_per_sentence.std() fvs_lexical[e, 2] = len(vocab) / float(len(words)) fvs_punct[e, 0] = tokens.count(',') / float(len(sentences)) fvs_punct[e, 1] = tokens.count(';') / float(len(sentences)) fvs_punct[e, 2] = tokens.count(':') / float(len(sentences)) fvs_lexical = whiten(fvs_lexical) fvs_punct = whiten(fvs_punct) NUM_TOP_WORDS = 10 all_tokens = nltk.word_tokenize(all_text) fdist = nltk.FreqDist(all_tokens) vocab = fdist.keys()[:NUM_TOP_WORDS] vectorizer = CountVectorizer(vocabulary=vocab, tokenizer=nltk.word_tokenize) fvs_bow = vectorizer.fit_transform(chapters).toarray().astype(np.float64) fvs_bow /= np.c_[np.apply_along_axis(np.linalg.norm, 1, fvs_bow)] def token_to_pos(ch): tokens = nltk.word_tokenize(ch) return [p[1] for p in nltk.pos_tag(tokens)]
def apply_kmeans(box_dict, k):
    # For every object class in box_dict, reduce the list of boxes to the
    # clustered boxes with k-means and return the new dictionary.
    kmeans_dict = dict()
    for obj_class in box_dict:
        print obj_class
        boxes = box_dict[obj_class]
        if len(boxes) > k:
            # write a representation for each proposal box as a vector
            def box_to_vec(pbox):
                # Metrics whose Euclidean distance we want k-means to operate on:
                # the centroid, the centroid weighted by 1/area (the centroid
                # matters less as box area increases), and the individual box
                # coordinates.  Keeping the raw coordinates in the vector lets us
                # recover the original box representation after clustering.  The
                # score is also included, scaled down because similar scores are
                # not by themselves a good reason to merge boxes.
                metrics = [
                    pbox.centroid()[0], pbox.centroid()[1],
                    pbox.centroid()[0] / pbox.area(),
                    pbox.centroid()[1] / pbox.area(),
                    pbox.x1, pbox.y1, pbox.x2, pbox.y2,
                    0.00001 * pbox.score
                ]
                return metrics

            # append the columns together and then take the transpose
            # so that each row is a box with n features (here n = 9)
            first_col = box_to_vec(boxes[0])
            # keep the first box's raw values for rescaling later
            oldx1, oldy1, oldx2, oldy2, oldscore = \
                first_col[4], first_col[5], first_col[6], first_col[7], first_col[8]
            box_mat = np.array(first_col).T
            for i in range(1, len(boxes)):
                new_col = np.array(box_to_vec(boxes[i])).T
                box_mat = np.c_[box_mat, new_col]
            box_mat = box_mat.T
            box_mat = box_mat.astype('float')

            # whiten: each feature is divided by its standard deviation
            box_mat = whiten(box_mat)

            # the coordinates must be rescaled when boxes are recovered from the
            # whitened representation vectors
            if len(np.shape(box_mat)) > 1:
                newx1, newy1, newx2, newy2, newscore = \
                    box_mat[0][4], box_mat[0][5], box_mat[0][6], box_mat[0][7], box_mat[0][8]
            else:
                newx1, newy1, newx2, newy2, newscore = \
                    box_mat[4], box_mat[5], box_mat[6], box_mat[7], box_mat[8]
            scalex1 = oldx1 / (0. + newx1)
            scaley1 = oldy1 / (0. + newy1)
            scalex2 = oldx2 / (0. + newx2)
            scaley2 = oldy2 / (0. + newy2)
            scalescore = oldscore / (0. + newscore)

            # use k-means
            codebook, distortion = kmeans(box_mat, k)
            centroid_boxes = []
            for i in range(np.shape(codebook)[0]):
                # indices 4 onwards are (pbox.x1, pbox.y1, pbox.x2, pbox.y2, pbox.score);
                # this is the direct inverse of box_to_vec, multiplying each
                # coordinate back by its scale factor (the feature's standard deviation).
                thebox = box(scalex1 * codebook[i][4], scaley1 * codebook[i][5],
                             scalex2 * codebook[i][6], scaley2 * codebook[i][7],
                             scalescore * codebook[i][8])
                centroid_boxes.append(thebox)
            print "# of centroids: " + str(len(centroid_boxes))
            print centroid_boxes[0]
            print centroid_boxes[1]
            print centroid_boxes[2]
            kmeans_dict[obj_class] = centroid_boxes
        else:
            kmeans_dict[obj_class] = box_dict[obj_class]
        print "==================================="
    return kmeans_dict
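# Illustrative aside (not part of apply_kmeans above): since whiten() divides
# each column by its standard deviation, an alternative to the ratio-based
# rescaling used above is to keep the per-feature stds and multiply the
# codebook by them to return to the original units.  `box_mat_demo` is a
# made-up feature matrix.
import numpy as np
from scipy.cluster.vq import whiten, kmeans

box_mat_demo = np.random.rand(50, 9) * 100.0
stds = box_mat_demo.std(axis=0)               # exactly what whiten() divides by
codebook, _ = kmeans(whiten(box_mat_demo), 3)
codebook_original_units = codebook * stds     # undo the whitening per feature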
def limb_track():
    global frame_n
    cv.namedWindow("Dots")
    fps = 30
    frame_dt = 0  # 1.0 / fps
    mv_i = 0
    pause = False
    while True:
        print("Frame:", mv_i)
        if frame_n >= contour_data.shape[0]:
            # mv_i = 0
            print("Frames completed:", frame_n)
            f_write.save(write_dict)
            break
        t = time.clock()
        ret, im = cap.read()
        for x, y in fs:
            cv.circle(im, (x, y), 2, (255, 0, 0), -1)
        n = n_contours[mv_i]
        if (n > 0):
            c_points = contour_data[mv_i, :n]
            limb_distances = np.empty((num_limbs, n))
            for i in range(num_limbs):
                limb_x, limb_y = fs[i]
                for j in range(n):
                    x, y = c_points[j]
                    dx = limb_x - x
                    dy = limb_y - y
                    distance = dx * dx + dy * dy
                    limb_distances[i, j] = distance
                limb_distances[i] = np.sort(limb_distances[i])
            threshold = 1500
            needed_limbs = np.where(limb_distances[:, 0] < threshold)[0]
            whitened = whiten(c_points)
            x_scale = c_points[0, 0] / whitened[0, 0]
            y_scale = c_points[0, 1] / whitened[0, 1]
            if (needed_limbs.shape[0] > 0):
                max_k = 6
                costs = np.empty(max_k - needed_limbs.shape[0])
                all_kmean_points = []
                for k in range(needed_limbs.shape[0], max_k):
                    points, distortion = kmeans(whitened, k)
                    points[:, 0] *= x_scale
                    points[:, 1] *= y_scale
                    points = points.astype('int32')
                    all_kmean_points.append(points)
                    costs[k - needed_limbs.shape[0]] = cost(points, needed_limbs)
                best_ind = np.argmin(costs)
                best_points = all_kmean_points[best_ind]
                for i, (x, y) in enumerate(best_points):
                    cv.circle(im, (x, y), 2, (0, 0, 255), -1)
                distances = np.empty((needed_limbs.shape[0], best_points.shape[0]))
                indices = np.empty((needed_limbs.shape[0], best_points.shape[0], 2),
                                   dtype='uint8')
                for i in range(needed_limbs.shape[0]):
                    limb_x, limb_y = fs[needed_limbs[i]]
                    for j in range(best_points.shape[0]):
                        x, y = best_points[j]
                        dx = x - limb_x
                        dy = y - limb_y
                        distance = dx * dx + dy * dy
                        distances[i, j] = distance
                        indices[i, j, 0] = needed_limbs[i]
                        indices[i, j, 1] = j
                for i in range(needed_limbs.shape[0]):
                    i, j = np.unravel_index(np.nanargmin(distances), distances.shape)
                    limb_ind = indices[i, j, 0]
                    point_ind = indices[i, j, 1]
                    new_limb_pos = (best_points[point_ind, 0], best_points[point_ind, 1])
                    cv.line(im, fs[limb_ind], new_limb_pos, (255, 255, 255), 1)
                    fs[limb_ind] = new_limb_pos
                    distances[i] = np.NaN
                    distances[:, j] = np.NaN
        for i in range(num_limbs):
            name = names[i]
            x, y = fs[i]
            write_dict[name][mv_i, 0] = x
            write_dict[name][mv_i, 1] = y
        cv.putText(im, str(frame_n), (5, 25), cv.FONT_HERSHEY_SIMPLEX, 1.0,
                   (255, 255, 255))
        cv.imshow("Dots", im)
        if pause:
            k = cv.waitKey(0)
        else:
            dt = frame_dt - (time.clock() - t)
            dt_mili = int(dt * 1000)
            if (dt_mili < 1):
                dt_mili = 1
            k = cv.waitKey(dt_mili)
        mv_i += 1
        frame_n += 1
        if k == 27:  # esc key
            print("Frames completed:", frame_n)
            f_write.save(write_dict)
            break
        elif k == 32:  # space key
            pause = not (pause)
        elif k == 63235 and pause:  # right arrow
            mv_i += 1
            frame_n += 1
            print(stds[frame_n])
        elif k == 63234 and pause:  # left arrow
            mv_i -= 1
            frame_n -= 1
            print(stds[frame_n])
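# Rough standalone sketch (made-up data): limb_track() above picks k by
# minimising a custom cost over a small range of candidate values.  The same
# pattern using scipy's own distortion (mean distance of samples to their
# nearest centroid) looks like this:
import numpy as np
from scipy.cluster.vq import whiten, kmeans

points = np.random.rand(200, 2) * 640.0        # fake contour points
w = whiten(points)
candidates = range(2, 6)
distortions = [kmeans(w, k)[1] for k in candidates]
# distortion generally shrinks as k grows, so in practice one looks for an
# "elbow" in this curve (or uses a task-specific cost, as limb_track() does)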
from numpy import vstack, array
from numpy.random import rand
# from scipy.cluster.vq import whiten
import scipy.cluster.vq as vec

# data generation with three features
data = vstack((rand(100, 3) + array([.5, .5, .5]), rand(100, 3)))

# whitening of data
data = vec.whiten(data)

# computing K-Means with K = 3 clusters
centroids, _ = vec.kmeans(data, 3)

# assign each sample to a cluster
clx, _ = vec.vq(data, centroids)

print(data)
print(centroids)
print(clx)
def doKMeans(filename): ''' with open(filename) as data_file: data = json.load(data_file) ''' data = json.loads(filename) #print(data["USERS"]) #data = filename #print(data["USERS"]) vertex_matrix = [] UID = [] print(data) for user in data["USERS"]: arr = [] for VID, V_ANS in user["V_ANS"]["iphone6s"].items(): arr.append(V_ANS) vertex_matrix.append(arr) UID.append(user["UID"]) #print(arr) if (len(vertex_matrix) == 1): vertex_matrix.append(vertex_matrix[0]) print(vertex_matrix) whitened = whiten(vertex_matrix) k = math.floor(math.sqrt(len(vertex_matrix))) cluster = kmeans2(whitened, k, 99, 'points') print cluster centroid = [] sorted_centroid = [] sorted_vertex = [] i = 0 for cen in cluster[0]: x = sum(cen) j = 0 centroid.append(dict()) sorted_vertex.append(list()) sorted_centroid.append(list()) for y in cen: j += 1 centroid[i][str(j)] = y / x sorted_centroid[i] = sorted(centroid[i].items(), key=operator.itemgetter(1)) for c_list in sorted_centroid[i]: sorted_vertex[i].append(c_list[0]) i += 1 #print arr print sorted_centroid print sorted_vertex circled_vertex = [] i = 0 for arr in sorted_vertex: circled_vertex.append(list()) j = 0 for v in arr: if (j % 2 == 0): circled_vertex[i] = circled_vertex[i] + [v] else: circled_vertex[i] = [v] + circled_vertex[i] j += 1 i += 1 print circled_vertex pivot = circled_vertex[0][0] for i in range(1, len(circled_vertex)): if (circled_vertex[i].index(pivot) != 0): circled_vertex[i] = circled_vertex[i][circled_vertex[i].index( pivot):] + circled_vertex[i][0:circled_vertex[i].index(pivot)] print circled_vertex final_vertex = [] for i in range(0, len(circled_vertex[0])): vote = dict() for j in range(0, len(circled_vertex[0])): vote[str(j + 1)] = 0 for j in range(0, len(circled_vertex)): try: vote[circled_vertex[j][i]] += 1 except: vote[circled_vertex[j][i]] = 1 sorted_vote = sorted(vote.items(), key=operator.itemgetter(1)) #sorted_vote = sorted_vote[::-1] for v in reversed(sorted_vote): if (v[0] not in final_vertex): final_vertex.append(v[0]) break print(final_vertex) sphere_vertex_weight = [] for v in final_vertex: sphere_vertex_weight.append(0) for c in cluster[1]: for v in final_vertex: sphere_vertex_weight[int(v) - 1] += cluster[0][c][int(v) - 1] total_weight = sum(sphere_vertex_weight) for i in range(0, len(sphere_vertex_weight)): sphere_vertex_weight[i] = sphere_vertex_weight[i] / total_weight x = sphere_vertex_weight[i] #sphere_vertex_weight[i] = math.pow(math.sine(0.5*3.14*x),(2/3)) if (math.isnan(sphere_vertex_weight[i])): sphere_vertex_weight[i] = -1 sphere_vertex_weight = normalize(sphere_vertex_weight) print(sphere_vertex_weight) planetList = [] for i in range(0, len(cluster[0])): planetList.append({"children": []}) planetList[i]["vertex_weight"] = [] planetList[i]["name"] = "" for v in final_vertex: if (math.isnan(centroid[i][v])): centroid[i][v] = -1 planetList[i]["vertex_weight"].append(centroid[i][v]) planetList[i]["vertex_weight"] = normalize( planetList[i]["vertex_weight"]) ''' for j in range(0,len(planetList[i]["vertex_weight"])): x = planetList[i]["vertex_weight"][j] planetList[i]["vertex_weight"][j] = math.pow(math.sine(0.5*3.14*x),(2/3)) ''' for i in range(0, len(cluster[1])): planetList[cluster[1][i]]["children"].append({"name": UID[i]}) #planetList[i]["users"].append({"userId": UID[cluster[1][j]]}) print(planetList) result = { "name": "sphereList", "vertex": final_vertex, "vertex_weight": sphere_vertex_weight, "children": planetList } print(json.dumps(result)) saveJson = {"userData": [], "userCluster": []} for i in range(0, 
len(cluster[1])): arr = [] #arr.append(UID[j]) for j in range(0, len(vertex_matrix[i])): arr.append(vertex_matrix[i][j]) #arr.append(cluster[1][i]) saveJson["userData"].append(arr) saveJson["userCluster"].append(cluster[1][i]) with open('data/userCluster.json', 'w') as data_file: data_file.write(json.dumps(saveJson)) return result
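# Quick standalone reference for the kmeans2 call used in doKMeans() above.
# scipy.cluster.vq.kmeans2(data, k, iter=10, thresh=1e-5, minit='random', ...)
# takes the initialisation method as its fifth parameter, so it is safest to
# pass it by keyword.  The data below are made up.
import numpy as np
from scipy.cluster.vq import whiten, kmeans2

answers = np.random.rand(12, 8)               # e.g. one row of vertex answers per user
whitened = whiten(answers)
centroids, labels = kmeans2(whitened, 3, iter=99, minit='points')
print(labels)                                 # cluster index for each row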
bart_time = time() - t0

# Check it out
plt.figure()
plt.imshow(sos(bart_imspace))
plt.title('BART NUFFT')
plt.xlabel('Recon: %g sec' % bart_time)
plt.show(block=False)

# The phantominator module also supports arbitrary kspace
# sampling for multiple coils:
kx, ky = radial(sx, spokes)
kx = np.reshape(kx, (sx, spokes), 'F').flatten()
ky = np.reshape(ky, (sx, spokes), 'F').flatten()
k = kspace_shepp_logan(kx, ky, ncoil=nc)
k = whiten(k)

# We will prefer a gridding approach to keep things simple.  The
# helper function gridder wraps scipy.interpolate.griddata():
t0 = time()
grid_imspace = gridder(kx, ky, k, sx, sx, os=os, method=method)
grid_time = time() - t0

# Take a gander
plt.figure()
plt.imshow(sos(grid_imspace))
plt.title('scipy.interpolate.griddata')
plt.xlabel('Recon: %g sec' % grid_time)
plt.show(block=False)

# We could also use GROG to grid
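# For reference, sos() above is assumed to be a sum-of-squares coil combine;
# a minimal sketch of such a helper (assuming the coil axis is last) would be:
import numpy as np

def sos_demo(imspace, coil_axis=-1):
    """Combine multi-coil image data into a single magnitude image."""
    return np.sqrt(np.sum(np.abs(imspace) ** 2, axis=coil_axis))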
def main(): # parse options from the command line parser = argparse.ArgumentParser( prog='PROG', formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent('''\ ------------------------------------------------------------------------------------------------------------- This is a deep neural network architecture for training sparse filters. Example uses: $ python test.py $ python test.py -m GroupSF -v 1 -g 3 -s 1 $ python test.py -m ConvolutionalSF -d 16 1 8 8 -v 1 -w y -c y -f CIFAR_data.mat -i 100 $ python test.py -m ConvolutionalSF ConvolutionalSF -d 16 1 6 6 16 16 4 4 -w y -c y -f CIFAR_data.mat -i 100 150 -t y -v 1 ------------------------------------------------------------------------------------------------------------- ''') ) parser.add_argument("-m", "--model", default=['SparseFilter'], nargs='+', help="the model type") parser.add_argument("-c", "--convolution", default="n", help="convolution, yes or no") parser.add_argument("-f", "--filename", default="patches.mat", help="the data filename") parser.add_argument("-d", "--dimensions", type=int, nargs='+', default=([100, 256]), help="the dimensions of the model: [neurons, input size] or [neurons, length, width]") parser.add_argument("-p", "--pool", type=int, nargs='+', default=None, help="pooling dimensions") parser.add_argument("-g", "--group", type=int, default=None, help="group size") parser.add_argument("-s", "--step", type=int, default=None, help="step size") parser.add_argument("-l", "--learn_rate", type=float, default=.001, help="learning rate") parser.add_argument("-i", "--iterations", type=int, nargs='+', default=[100], help="number of iterations") parser.add_argument("-v", "--verbosity", type=int, default=0, help="verbosity: 0 no plot; 1 plots") parser.add_argument("-o", "--opt", default="GD", help="optimization method: GD or L-BFGS") parser.add_argument("-w", "--whitening", default='n', help="whitening: 'y' or 'n'") parser.add_argument("-t", "--test", default='n', help="test classification performance: 'y' or 'n'") parser.add_argument("-a", "--channels", type=int, default=1, help="number of channels in data") parser.add_argument("-e", "--examples", type=int, default=None, help="number of training examples") parser.add_argument("-b", "--batch_size", type=int, default=1000, help="number of examples in [mini]batch") parser.add_argument("-z", "--aws", default='n', help="run on aws: 'y' or 'n'") parser.add_argument("-r", "--random", default='n', help="type of batches: random = 'y'") args = parser.parse_args() args.dimensions = parse_dims(args) args.iterations = parse_iter(args) ''' =================================== Load in the data =================================== ''' # load in data print "loading data..." base_path = os.path.dirname(__file__) file_path = os.path.join(base_path, "data", args.filename) data = loadmat(file_path)['X'] # reshape and preprocess data print "pre-processing data ..." 
video = None if args.filename == 'patches_video.mat': video = data data = data.reshape(data.shape[0] * data.shape[1], data.shape[2]).T if args.convolution == 'n': if args.whitening == 'y': data -= data.mean(axis=0) data = whiten(data.T).T elif args.whitening == 'n' and args.channels == 1: data -= data.mean(axis=0) # elif args.whitening == 'n' and args.channels == 3: # data = np.float32(data) data = np.float32(data.T) elif args.convolution == 'y': if args.filename == 'kyotoData.mat': data = np.float32(data.reshape(-1, 1, int(np.sqrt(data.shape[1])), int(np.sqrt(data.shape[1])))) data = scaling.LCNinput(data, kernel_shape=9) elif args.filename == 'CIFAR_data.mat': data = np.float32(data.reshape(-1, 1, int(np.sqrt(data.shape[1])), int(np.sqrt(data.shape[1])))) data = scaling.LCNinput(data, kernel_shape=5) data = data[0:args.examples, :, :, :] elif args.filename == 'STL_10.mat' or args.filename == 'Lenna.mat': data = np.float32(data.reshape(-1, 3, int(np.sqrt(data.shape[1] / 3)), int(np.sqrt(data.shape[1] / 3)))) data = data[0:args.examples, :, :, :] args.channels = data.shape[1] for channel in range(args.channels): data[:, channel, :, :] = np.reshape(scaling.LCNinput(data[:, channel, :, :]. reshape((data.shape[0], 1, data.shape[2], data.shape[3])), kernel_shape=9), ( data.shape[0], data.shape[2], data.shape[3])) # assert that batch size is valid and get number of batches n_batches, rem = divmod(data.shape[0], args.batch_size) assert rem == 0 # other assertions assert len(args.model) == len(args.iterations) if args.model[0] == 'GroupSF' or args.model[0] == 'GroupConvolutionalSF': assert args.group is not None assert args.step is not None # assert that the number of neurons in each layer is a perfect square for layer in xrange(len(args.dimensions)): assert np.sqrt(args.dimensions[layer][0]) % np.floor(np.sqrt(args.dimensions[layer][0])) == 0 ''' ============================= Build and train the network ============================= ''' # construct the network print "building model..." model = sf.Network( model_type=args.model, weight_dims=args.dimensions, p=args.pool, group_size=args.group, step=args.step, lr=args.learn_rate, opt=args.opt, c=args.convolution, test=args.test, batch_size=args.batch_size, random=args.random, weights=None ) # TODO: custom learning rates for each layer # compile the training, output, and test functions for the network print "compiling theano functions..." train, outputs, test = model.training_functions(data) # train the sparse filtering network print "training network..." 
t = time.time() cost = {} weights = {} for l in xrange(model.n_layers): cost_layer = [] w = None # iterate over training epochs if args.opt == 'GD': for epoch in xrange(args.iterations[l]): # go though [mini]batches for batch_index in xrange(n_batches): c, w = train[l](index=batch_index) cost_layer.append(c) print("Layer %i cost at epoch %i and batch %i: %f" % (l + 1, epoch, batch_index, c)) elif args.opt == 'L-BFGS': w = minimize(train[l], model.layers[l].w.eval().flatten(), method='L-BFGS-B', jac=True, options={'maxiter': args.iterations[l], 'disp': True}) if args.convolution == 'n': w = w.x.reshape(args.dimensions[0][0], args.dimensions[0][1]) elif args.convolution == 'y': w = w.x.reshape(args.dimensions[0][0], args.dimensions[0][1], args.dimensions[0][2], args.dimensions[0][3]) # add layer cost and weights to the dictionaries cost['layer' + str(l)] = cost_layer weights['layer' + str(l)] = w # calculate and display elapsed training time elapsed = time.time() - t print('Elapsed training time: %f' % elapsed) # create sub-folder for saved model if args.aws == 'n': directory_format = "./saved/%4d-%02d-%02d_%02dh%02dm%02ds" directory_name = directory_format % time.localtime()[0:6] os.mkdir(directory_name) elif args.aws == 'y': import boto from boto.s3.key import Key s3 = boto.connect_s3() my_bucket = 'dlacombejr.bucket' bucket = s3.get_bucket(my_bucket) k = Key(bucket) directory_format = "./saved/%4d-%02d-%02d_%02dh%02dm%02ds" directory_name = directory_format % time.localtime()[0:6] os.mkdir(directory_name) # save the model for later use full_path = directory_name + '/model.pkl' pickle.dump(model, open(full_path, 'w'), pickle.HIGHEST_PROTOCOL) if args.aws == 'y': k.key = full_path k.set_contents_from_filename(full_path) os.remove(full_path) # save weights separately savemat(directory_name + '/weights.mat', weights) if args.aws == 'y': k.key = directory_name + '/weights.mat' k.set_contents_from_filename(directory_name + '/weights.mat') os.remove(directory_name + '/weights.mat') # save the cost functions savemat(directory_name + '/cost.mat', cost) if args.aws == 'y': k.key = directory_name + '/cost.mat' k.set_contents_from_filename(directory_name + '/cost.mat') os.remove(directory_name + '/cost.mat') # create log file log_file = open(directory_name + "/log.txt", "wb") # todo: create log file by looping through args # for arg in args: # log_file.write( # args. 
# ) for m in range(len(args.model)): log_file.write( "Model layer %d: \n model:%s \n dimensions:%4s \n iterations:%3d \n" % (m, args.model[m], args.dimensions[m], args.iterations[m]) ) if args.model == 'GroupSF' or args.model == 'GroupConvolutionalSF': log_file.write( " Groups: %d \n Step: %d" % (args.group, args.step) ) ex = data.shape[0] if args.examples is not None: ex = args.examples log_file.write( " Data-set: %s \n Examples: %6d \n Whitened: %s" % (args.filename, ex, args.whitening) ) log_file.write('\nElapsed training time: %f' % elapsed) log_file.close() if args.aws == 'y': k.key = directory_name + "/log.txt" k.set_contents_from_filename(directory_name + "/log.txt") os.remove(directory_name + "/log.txt") ''' =============================== Verbosity Options ===================================== ''' # get variables and saves if args.verbosity >= 1: # # get variables of interest # activations_norm = {} # activations_raw = {} # activations_shuffled = {} # reconstruction = {} # error_recon = {} # pooled = {} # for l in xrange(len(args.dimensions)): # activations_norm['layer' + str(l)] = {} # activations_raw['layer' + str(l)] = {} # activations_shuffled['layer' + str(l)] = {} # reconstruction['layer' + str(l)] = {} # error_recon['layer' + str(l)] = {} # pooled['layer' + str(l)] = {} for batch in xrange(n_batches): # get variables of interest activations_norm = {} activations_raw = {} activations_shuffled = {} reconstruction = {} error_recon = {} pooled = {} # f_hat, rec, err, f_hat_shuffled, f, p = outputs[l]() begin = batch * args.batch_size end = begin + args.batch_size f_hat, rec, err, f_hat_shuffled, f, p = outputs[model.n_layers - 1](data[begin:end]) # activations_norm['layer' + str(l)]['batch' + str(batch)] = f_hat # activations_raw['layer' + str(l)]['batch' + str(batch)] = f # activations_shuffled['layer' + str(l)]['batch' + str(batch)] = f_hat_shuffled # reconstruction['layer' + str(l)]['batch' + str(batch)] = err # error_recon['layer' + str(l)]['batch' + str(batch)] = rec # pooled['layer' + str(l)]['batch' + str(batch)] = p # define [mini]batch title batch_title = 'layer' + str(l) + '_batch' + '%03d' % batch # define norm and raw file names norm_file_name = directory_name + '/activations_norm_' + batch_title + '.mat' raw_file_name = directory_name + '/activation_raw_' + batch_title + '.mat' activations_norm[batch_title] = f_hat activations_raw[batch_title] = f activations_shuffled[batch_title] = f_hat_shuffled reconstruction[batch_title] = err error_recon[batch_title] = rec pooled[batch_title] = p # save model as well as weights and activations separately savemat(norm_file_name, activations_norm) # savemat(raw_file_name, activations_raw) if args.aws == 'y': k.key = norm_file_name k.set_contents_from_filename(norm_file_name) os.remove(norm_file_name) # k.key = raw_file_name # k.set_contents_from_filename(raw_file_name) # os.remove(raw_file_name) # savemat(directory_name + '/weights.mat', weights) # if args.aws == 'y': # k.key = directory_name + '/weights.mat' # k.set_contents_from_filename(directory_name + '/weights.mat') # os.remove(directory_name + '/weights.mat') # # f_hat, rec, err, f_hat_shuffled, f, p = outputs[l]() # f_hat, rec, err, f_hat_shuffled, f, p = outputs[l](data[0:args.batch_size]) # # activations_norm['layer' + str(l)] = f_hat # activations_raw['layer' + str(l)] = f # activations_shuffled['layer' + str(l)] = f_hat_shuffled # reconstruction['layer' + str(l)] = err # error_recon['layer' + str(l)] = rec # pooled['layer' + str(l)] = p # # # save model as well as 
weights and activations separately # savemat(directory_name + '/weights.mat', weights) # savemat(directory_name + '/activations_norm.mat', activations_norm) # savemat(directory_name + '/activation_raw.mat', activations_raw) # output helper file for concatenating activations helper = {'batches': n_batches, 'output_size': f_hat.shape} helper_file_name = directory_name + '/helper.mat' savemat(helper_file_name, helper) if args.aws == 'y': k.key = helper_file_name k.set_contents_from_filename(helper_file_name) os.remove(helper_file_name) # get data if not on AWS if args.aws == 'n': f_hat, rec, err, f_hat_shuffled, f, p = outputs[model.n_layers - 1](data) activations_norm = {"layer0": f_hat} # display figures if args.verbosity == 2: # if GD, plot the cost function over time if args.opt == 'GD': visualize.plotCost(cost) # visualize the receptive fields of the first layer visualize.drawplots(weights['layer0'].T, color='gray', convolution=args.convolution, pad=0, examples=None, channels=args.channels) # visualize the distribution of lifetime and population sparseness for l in xrange(len(args.dimensions)): layer = 'layer' + str(l) if args.convolution == 'n': visualize.dispSparseHist(activations_norm[layer], l) elif args.convolution == 'y': visualize.dispSparseHist(activations_shuffled[layer].reshape(args.dimensions[l][0], data.shape[0] * activations_shuffled[layer].shape[2] * activations_shuffled[layer].shape[3]), layer=l) # visualize the distribution of activity across the "cortical sheet" and reconstruction if args.filename == 'patches_video.mat': f_hat = activations_norm['layer0'].T.reshape(video.shape[0], video.shape[1], args.dimensions[0][0]) visualize.videoCortex(f_hat[0:100, :, :], 'y', args.convolution, 1) else: visualize.drawplots(activations_norm['layer0'], color='gray', convolution=args.convolution, pad=1, examples=100) # # visualize reconstruction capabilities # if args.convolution == 'n': # visualize.drawReconstruction(data[:, 0:100], error_recon['layer0'][:, 0:100], 'y', args.convolution, 1) # elif args.convolution == 'y': # visualize.convolutional_reconstruction(data[0, :, :, :], activations_raw['layer0'], weights['layer0'], # color='gray', convolution=args.convolution) # print('Reconstructed error: %e' % reconstruction['layer0']) # additional visualizations for convolutional network if args.convolution == 'y': dim = activations_raw['layer0'].shape[2] # visualize an example of a convolved image visualize.visualize_convolved_image(activations_raw['layer0'], dim=dim) # print activations_raw['layer0'] # visualize max-pooled activations and LCN output visualize.visualize_convolved_image(pooled['layer0'][0, :, :, :].reshape(1, pooled['layer0'].shape[1], pooled['layer0'].shape[2], pooled['layer0'].shape[3]), dim=dim / 2) # visualize an example of a LCNed convolved image after max pooling # temp = activations_raw['layer0'] #[0, :, :, :] temp = pooled['layer0'] #[0, :, :, :] # print temp.shape for i in range(temp.shape[1]): temp[0, i, :, :] = scaling.LCNinput(temp[0, i, :, :].reshape((1, 1, dim / 2, dim / 2)), kernel_shape=5) # temp = scaling.LCNinput(temp, kernel_shape=5) visualize.visualize_convolved_image(temp, dim=dim / 2) # print temp ''' ================================ Test the Model ======================================= ''' # test the model if evaluating classification performance if args.test == 'y': from sklearn import svm from sklearn.metrics import confusion_matrix train_labels = loadmat(file_path)['y'] file_path = os.path.join(base_path, "data", "CIFAR_test.mat") test_data = 
loadmat(file_path)['X'] test_labels = loadmat(file_path)['y'] # reshape and normalize the data if args.convolution == 'y': test_data = np.float32(test_data.reshape(-1, 1, int(np.sqrt(test_data.shape[1])), int(np.sqrt(test_data.shape[1])))) test_data = scaling.LCNinput(test_data, kernel_shape=5) test_data = test_data[0:args.examples, :, :, :] # get SVM test results for pixels to last layer train_input = None for layer in range(model.n_layers + 1): # pixel inputs if layer == 0: test_input = test_data.reshape(test_data.shape[0], test_data.shape[1] * test_data.shape[2] * test_data.shape[3]) train_input = data.reshape(data.shape[0], data.shape[1] * data.shape[2] * data.shape[3]) # hidden layers elif layer > 0: # get the output of the current layer in the model given the training / test data and then reshape # TODO: use raw output as training and testing data? test_input = test[layer - 1](test_data[0:args.batch_size]) test_input = test_input[0].reshape(test_input[0].shape[0], test_input[0].shape[1] * test_input[0].shape[2] * test_input[0].shape[3]) train_input = activations_norm['layer' + str(layer - 1)] train_input = train_input.reshape(train_input.shape[0], train_input.shape[1] * train_input.shape[2] * train_input.shape[3]) # train linear support vector machine clf = svm.SVC(kernel="linear").fit(train_input, np.ravel(train_labels[0:args.examples])) # get predictions from SVM and calculate accuracy predictions = clf.predict(test_input) accuracy = clf.score(test_input, test_labels[0:args.examples]) # display results and log them print("Accuracy of the classifier at layer %1d: %0.4f" % (layer, accuracy)) cm = confusion_matrix(test_labels[0:args.examples], predictions) log_file = open(directory_name + "/log.txt", "a") log_file.write( "\nAccuracy of the classifier at layer %1d: %0.4f" % (layer, accuracy) ) log_file.close() # visualize the confusion matrix if args.test == 'y' and args.verbosity == 2: import pylab as pl pl.imshow(cm, interpolation='nearest') pl.title('Confusion Matrix for Network') pl.colorbar() pl.ylabel('True Label') pl.xlabel('Predicted Label') pl.show()
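# A compact, standalone version of the evaluation pattern used above (toy data,
# hypothetical shapes): fit a linear SVM on one feature representation, then
# score it and build a confusion matrix on held-out examples.
import numpy as np
from sklearn import svm
from sklearn.metrics import confusion_matrix

train_x = np.random.rand(100, 64)             # e.g. flattened layer activations
train_y = np.random.randint(0, 10, 100)
test_x = np.random.rand(20, 64)
test_y = np.random.randint(0, 10, 20)

clf = svm.SVC(kernel="linear").fit(train_x, train_y)
accuracy = clf.score(test_x, test_y)
cm = confusion_matrix(test_y, clf.predict(test_x))
print("accuracy: %0.4f" % accuracy)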