Example #1
	def _kmeanspp(self, X, random_state):
		# Based on: https://en.wikipedia.org/wiki/K-means%2B%2B
		Xp = type(X)(X, shape=X.shape, dtype=X.dtype, copy=True) if sparse.issparse(X) else np.copy(X)

		idx = random_state.randint(X.shape[0], size=(1,))[0]  # works for both sparse and dense input (indptr only exists on sparse matrices)

		centroids = Xp[[idx]]  # list index keeps a 2-D (1 x n_features) row for both sparse and dense Xp
		Xp = self.delete_row_csr(Xp, idx) if sparse.issparse(Xp) else np.delete(Xp, idx, axis=0)

		while (centroids.shape[0] < self.n_clusters):
			clustering, distances = pairwise_distances_argmin_min(X=Xp, Y=centroids, metric='cosine')

			# Calculate weighted probability distribution
			d = np.power(distances, 2)
			p = d / d.sum()

			dist = rv_discrete(values=(np.arange(Xp.shape[0]), p), seed=random_state)

			# Choose next centroid
			idx = dist.rvs()
			centroids = sparse.vstack((centroids, Xp[idx])) if sparse.issparse(Xp) else np.concatenate((centroids, Xp[idx].reshape(1, -1)), axis=0)

			# Delete center from `Xp`
			Xp = self.delete_row_csr(Xp, idx) if sparse.issparse(Xp) else np.delete(Xp, idx, axis=0)

		return centroids
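The squared-distance weighting in the loop above is the heart of k-means++ seeding. A minimal, self-contained sketch of the same idea on dense data (Euclidean instead of cosine distance; the helper name kmeanspp_seed is ours, not from the source) could look like this:

import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min

def kmeanspp_seed(X, n_clusters, rng):
    # first centroid: a uniformly random point
    centroids = X[[rng.randint(X.shape[0])]]
    while centroids.shape[0] < n_clusters:
        # distance from every point to its nearest chosen centroid
        _, dist = pairwise_distances_argmin_min(X, centroids)
        # sample the next centroid with probability proportional to distance**2
        p = dist ** 2
        p /= p.sum()
        idx = rng.choice(X.shape[0], p=p)
        centroids = np.vstack([centroids, X[idx]])
    return centroids

rng = np.random.RandomState(0)
print(kmeanspp_seed(rng.rand(50, 3), n_clusters=4, rng=rng))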
Example #2
    def closest_image(self, pixel):
        # `pixel` is expected to be 2-D (shape (1, n_channels)); pairwise_distances_argmin_min requires 2-D inputs
        index, _ = pairwise_distances_argmin_min(pixel, self.rgb_means)

        img_path = self.file_paths[index[0]]
        img = io.imread(img_path)

        return img
Example #3
def compute_data_labels(fname, dfilec, dfile, sensorref, sensor):
    """
    Computes the labels of the data using the centroids of the cluster in the file
    the labels are relabeled acording to the matching with the reference sensor
    :param dfile:
    :param sensor:
    :return:
    """
    f = h5py.File(fname + '.hdf5', 'r')

    d = f[dfilec + '/' + sensor + '/Clustering/' + 'Centers']
    centers = d[()]
    d = f[dfile + '/' + sensor + '/' + 'PeaksResamplePCA']
    data = d[()]
    d = f[dfilec + '/' + sensorref + '/Clustering/' + 'Centers']
    centersref = d[()]
    f.close()

    # clabels, _ = pairwise_distances_argmin_min(centers, centersref)
    #
    # m = Munkres()
    # dist = euclidean_distances(centers, centersref)
    # indexes = m.compute(dist)
    # print indexes
    # print clabels
    labels, _ = pairwise_distances_argmin_min(data, centers)
    return labels #[indexes[i][1] for i in labels]
Example #4
def get_only_nug2_ruptures(scores):
    # get the indices, sorting the true
    sort_idx = np.argsort(scores.idx_true)
    true = np.array([scores.idx_true[i] for i in sort_idx]).reshape(-1,1)
    pred = np.array(scores.idx_predicted).reshape(-1,1)
    # not interested in anything with just two ruptures
    if (len(true) < 3 or len(pred) < 3):
        return scores
    # POST: something to do; at least 3 ruptures
    # pairwise_distances_argmin_min:
    # for each row in X (true), the index of the row of Y (pred) which
    # is closest (according to the specified distance).
    idx_closest_pred_to_true,_ = metrics.pairwise_distances_argmin_min(X=true,
                                                                       Y=pred)
    # only interested from the second to the next to last, since the 6 ruptures 
    # are: alpha3D, NUG2 (4 of these), biotin/streptavidin
    logical_fec_slice = slice(1,-1,None)
    slice_true = sort_idx[logical_fec_slice]
    idx_we_want = idx_closest_pred_to_true[logical_fec_slice]
    pred_slice = lambda x: [x[i] for i in idx_we_want]
    true_slice = lambda x: [x[i] for i in slice_true]
    scores.ruptures_true = true_slice(scores.ruptures_true)
    scores.ruptures_predicted = pred_slice(scores.ruptures_predicted)
    # also update all the indices and such
    scores.true_x = true_slice(scores.true_x)
    scores.pred_x = pred_slice(scores.pred_x)
    scores.idx_true = true_slice(scores.idx_true)
    scores.idx_predicted = pred_slice(scores.idx_predicted)
    return scores
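The comment inside get_only_nug2_ruptures above spells out what pairwise_distances_argmin_min returns. A tiny standalone check of that behaviour, with made-up 1-D rupture positions:

import numpy as np
from sklearn import metrics

true = np.array([[1.0], [5.0], [9.0]])
pred = np.array([[1.2], [8.5]])
# for each row of `true`: the index of the closest row of `pred`, and that distance
idx, dist = metrics.pairwise_distances_argmin_min(X=true, Y=pred)
print(idx)   # -> [0 1 1]
print(dist)  # -> approx. [0.2, 3.5, 0.5]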
def finding_correct_centers(list_underlyings, scenarios, cluster_centers):
    
#The indices in `scenarios` are the same as in `list_underlyings`
#In `scenarios_bis` several rows are dropped, but the indices are unchanged -> the rows no longer match the indices!
#`cluster_problem` lists every cluster (between 0 and nclusters) whose center has to be changed

    scenarios_bis = scenarios
    nclusters = len(cluster_centers)
    cluster_problem = range(nclusters)
    
    #Retrieve the list of centers in list_underlyings AND scenarios
    list_cluster_index,_ = pairwise_distances_argmin_min(cluster_centers, scenarios_bis.drop(['Underlying'],axis=1),metric='l2')


    while(len(cluster_problem)>0):
        
        #Look for the centers that can be priced
        cluster_problem = test_pricing_centers(scenarios,
                                                 list_underlyings,
                                                 list_cluster_index,
                                                 cluster_problem)
        
        
        #If problems remain
        if(len(cluster_problem)>0):
            
            #Start by removing the underlyings that cause a problem
            for ind in cluster_problem:
                index_centre = list_cluster_index[ind]
                name = str(list_underlyings['EliotName'][index_centre])
                scenarios_bis = scenarios_bis.drop([name])
            
            
            #Find the underlyings closest to the centers of the affected clusters
            list_cluster_index,_ = pairwise_distances_argmin_min(cluster_centers, scenarios_bis.drop(['Underlying'],axis=1),metric='l2')
            
            list_cluster_index = new_cluster_index(scenarios,scenarios_bis,list_cluster_index)

#--------------------------------------------------------------------------
    
    for i in range(nclusters):
        ind = list_cluster_index[i]
        index_centre_cluster = list_underlyings['Label'][ind]   
        print("Le centre pricable du cluster {} est dans le cluster {}".format(i,index_centre_cluster))
    
    
    return list_cluster_index
Example #6
 def euclidean_rupture_spectrum_distance(self):
     safe_log = lambda x : np.log10(x) if x > 0 else -10
     spectrum_tuple = lambda x: [safe_log(x.loading_rate*1e12),
                                 x.rupture_force*1e12]
     all_tuples = lambda list_v: np.array([spectrum_tuple(x) 
                                           for x in list_v])
     X = all_tuples(self.ruptures_true)
     Y = all_tuples(self.ruptures_predicted) 
     # get the distances from x to y and from y to x
     if (len(Y) == 0 or len(X) == 0):
         dist_1 = []
         dist_2 = [sum(x**2) for x in X]
     else:
         _,dist_1 = metrics.pairwise_distances_argmin_min(X=X,Y=Y)
         _,dist_2 = metrics.pairwise_distances_argmin_min(X=Y,Y=X)
     all_distances = list(dist_1) + list(dist_2)
     return all_distances
def chunked(X, Y, axis=1, metric="euclidean", batch_size=500, **kwargs):
    """Return argmin on the selected axis.
    axis 0 is along X
    axis 1 is along Y
    """
    return pairwise_distances_argmin_min(X, Y, axis=axis,
                                         batch_size=batch_size,
                                         metric=metric, **kwargs)
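The axis argument decides which side the argmin runs over: the default axis=1 gives one result per row of X (searching over Y), while axis=0 gives one result per row of Y (searching over X). A quick sketch with arbitrary 1-D points:

import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min

X = np.array([[0.0], [10.0]])
Y = np.array([[1.0], [2.0], [9.0]])
print(pairwise_distances_argmin_min(X, Y, axis=1))  # indices [0, 2]: one per row of X
print(pairwise_distances_argmin_min(X, Y, axis=0))  # indices [0, 0, 1]: one per row of Y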
Example #8
def kmeans(X):
  num_clusters = int(sys.argv[2])
  kmeans_model = KMeans(n_clusters = num_clusters)
  kmeans_model.fit(X)

  if sys.argv[3] == 'c':
    print(kmeans_model.cluster_centers_)
  else:
    closest, _ = pairwise_distances_argmin_min(kmeans_model.cluster_centers_, X)
    for point in closest:
      print(X[point])
Example #9
 def get_Kmeans(self):
     ''' Set up Kmeans algorithm with arbitrary clusters'''
     k = 100
     vect = TfidfVectorizer(ngram_range=(1,2), stop_words='english')
     X = vect.fit_transform(self.corpus)
     model = KMeans(k)
     model.fit(X)
     order_centroids = model.cluster_centers_.argsort()[:, ::-1]
     terms = vect.get_feature_names()
     self.centroids = order_centroids
     self.model = model
     self.vect = vect
     return model, pairwise_distances_argmin_min(model.cluster_centers_, X, metric='cosine')
Example #10
 def __init__(self,IdxFrom,IdxTo):
     _,distActualToPred= pairwise_distances_argmin_min(IdxFrom,IdxTo)
     self.MeanToLabel = np.mean(distActualToPred)
     self.MedianToLabel = np.median(distActualToPred)
     self.MaxToLabel = np.max(distActualToPred)
     self.MinToLabel = np.min(distActualToPred)
     maxV = max(distActualToPred)
     cond = (np.abs(distActualToPred) > 0.5)
     numWrongByAtLeastOne = sum(cond)
     nBins = 10
     bins = np.linspace(start=0,stop=maxV,num=nBins,endpoint=True)
     self.histZeros = np.histogram(distActualToPred,bins=bins)
     nonZeroDistance = distActualToPred[np.where(cond)]
     self.histNoZeros =np.histogram(nonZeroDistance,bins=bins)
Example #11
def compute_data_labels(dfilec, dfile, sensor):
    """
    Computes the labels of the data using the centroids of the clusters in the file.
    :param dfile:
    :param sensor:
    :return:
    """
    f = h5py.File(datainfo.dpath + datainfo.name + ext + '.hdf5', 'r')

    d = f[dfilec + '/' + sensor + '/Clustering/' + 'Centers']
    centers = d[()]
    d = f[dfile + '/' + sensor + '/' + 'PeaksResamplePCA']
    data = d[()]
    labels, _ = pairwise_distances_argmin_min(data, centers)
    f.close()
    return labels
Example #12
def compute_data_labels(fname, dfilec, dfile, sensor):
    """
    Computes the labels of the data using the centroids of the clusters in the file.
    The labels would be relabeled according to the matching with the reference sensor,
    but the association using the Hungarian algorithm is disabled, so the cluster
    indices are the original ones.

    :param dfile:
    :param sensor:
    :return:
    """
    f = h5py.File(datainfo.dpath + '/' + fname + '/' + fname + '.hdf5', 'r')

    d = f[dfilec + '/' + sensor + '/Clustering/' + 'Centers']
    centers = d[()]
    d = f[dfile + '/' + sensor + '/' + 'PeaksResamplePCA']
    data = d[()]
    f.close()

    labels, _ = pairwise_distances_argmin_min(data, centers)
    return labels
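The d[()] indexing used in these compute_data_labels variants reads the whole HDF5 dataset into memory. An equivalent pattern using a context manager (the file and group names below are made up to mirror the layout above) might look like:

import h5py
from sklearn.metrics import pairwise_distances_argmin_min

# hypothetical file and group names, mirroring the layout used above
with h5py.File('experiment.hdf5', 'r') as f:
    centers = f['fileA/sensor1/Clustering/Centers'][()]
    data = f['fileB/sensor1/PeaksResamplePCA'][()]

labels, _ = pairwise_distances_argmin_min(data, centers)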
Example #13
    def compute_peaks_labels(self, f, dfile, sensor, nclusters, globalc=False, distances=False):
        """
        Computes the labels of the data using the centroids of the clusters in the first file.
        :param nclusters:
        :param dfile:
        :param sensor:
        :return:
        """
        if globalc:
            d = f["All/" + sensor + "/Clustering/" + str(nclusters) + "/Centers"]
        else:
            d = f[self.datafiles[0] + "/" + sensor + "/Clustering/" + str(nclusters) + "/Centers"]

        centers = d[()]
        d = f[dfile + "/" + sensor + "/" + "PeaksResamplePCA"]
        data = d[()]
        labels, dist = pairwise_distances_argmin_min(data, centers)

        if distances:
            return labels, dist
        else:
            return labels
Example #14
	def fit(self, X, y=None):
		random_state = check_random_state(self.random_state)
		X = self._check_fit_data(X)

		# Init CosineMeans
		if (isinstance(self.init, np.ndarray)):
			self.cluster_centers_ = self.init
		elif (self.init == 'random'):
			idx = random_state.randint(X.shape[0], size=(self.n_clusters,))
			self.cluster_centers_ = X[idx].A if sparse.issparse(X) else X[idx]
		elif (self.init == 'k-means++'):
			self.cluster_centers_ = self._kmeanspp(X=X, random_state=random_state)
		else:
			raise ValueError('Unknown param passed to `init`: {}. Allowed values are "random", "k-means++" or an ndarray'.format(self.init))

		# Run CosineMeans
		centroids = np.zeros((self.n_clusters, X.shape[1]))#sparse.csr_matrix((self.n_clusters, X.shape[1]))
		for _ in range(self.max_iter):
			clustering, distances = pairwise_distances_argmin_min(X=X, Y=self.cluster_centers_, metric='cosine')
			# http://stackoverflow.com/questions/29629821/sum-over-rows-in-scipy-sparse-csr-matrix

			# Todo: This really needs improvement
			for yi in np.unique(clustering):
				row_idx = np.where(clustering==yi)[0]

				if (sparse.issparse(X)):
					centroids[yi] = np.asarray(X[row_idx].multiply(1/len(row_idx)).sum(axis=0))
				else:
					centroids[yi] = np.multiply(X[row_idx], 1/len(row_idx)).sum(axis=0)

			# Convergence check
			if (np.all(np.abs(self.cluster_centers_-centroids) < self.tol)):
				break
			self.cluster_centers_ = centroids
		self.cluster_centers_ = centroids
		self.labels_ = clustering

		return self
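The per-cluster mean loop marked "# Todo: This really needs improvement" above can, for dense input, be replaced by a single unbuffered scatter-add. A sketch under that assumption (dense arrays only; the helper name dense_centroid_update is ours, not from the source):

import numpy as np

def dense_centroid_update(X, clustering, n_clusters):
    # sum the rows of X into their assigned cluster, then divide by the cluster size
    centroids = np.zeros((n_clusters, X.shape[1]))
    np.add.at(centroids, clustering, X)
    counts = np.bincount(clustering, minlength=n_clusters)
    nonempty = counts > 0
    centroids[nonempty] /= counts[nonempty][:, None]
    return centroids

rng = np.random.RandomState(0)
X_demo = rng.rand(8, 3)
assignments = rng.randint(0, 3, size=8)
print(dense_centroid_update(X_demo, assignments, n_clusters=3))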
Example #15
def upsample(test_indices, training_set_cluster_IDs, data, 
             method = 'k-means', usecols = None):

    N_samples = test_indices.size + training_set_cluster_IDs.size

    assert N_samples == data.shape[0]

    full_set_cluster_IDs = np.zeros(N_samples, dtype = int)

    training_indices = np.setdiff1d(np.arange(N_samples), test_indices, True)
    full_set_cluster_IDs[training_indices] = training_set_cluster_IDs

    if usecols is not None:
        usecols = list(usecols)
        data = np.take(data, usecols, 1)    

    training_data = np.delete(data, test_indices, axis = 0)
    
    max_ID = np.amax(training_set_cluster_IDs)
    centroids = np.zeros((max_ID + 1, data.shape[1]), dtype = float)

    for cluster in range(max_ID + 1):
        samples_in_cluster = np.where(training_set_cluster_IDs == cluster)[0]
        if method == 'hierarchical':
            centroids[cluster] = np.median(training_data[samples_in_cluster], 
                                           axis = 0)
        else:
            centroids[cluster] = training_data[samples_in_cluster].mean(axis = 0)

    test_data = np.take(data, test_indices, axis = 0)
    test_set_cluster_IDs, _ = pairwise_distances_argmin_min(test_data, centroids, 
                metric = 'manhattan' if method == 'hierarchical' else 'euclidean')

    full_set_cluster_IDs[test_indices] = test_set_cluster_IDs

    return full_set_cluster_IDs
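A quick toy call clarifies the expected argument shapes of upsample: test_indices indexes the rows of data that were held out, while training_set_cluster_IDs is ordered like the remaining rows. The values below are arbitrary, and the sketch assumes the imports upsample itself relies on (numpy and pairwise_distances_argmin_min) are already in scope.

import numpy as np

rng = np.random.RandomState(0)
data = rng.rand(6, 3)                 # 6 samples in total
test_idx = np.array([1, 4])           # rows 1 and 4 were held out
train_ids = np.array([0, 1, 0, 1])    # cluster IDs of the other 4 rows, in row order
print(upsample(test_idx, train_ids, data, method='k-means'))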
Example #16
def geneticlabels(dataframe,centers):
    return pairwise_distances_argmin_min(dataframe,centers,metric='minkowski')
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
#X = np.random.randn(10, 4) # generate a 10 row, 4 column random number matrix
X = np.array([[1, 1, 1, 2], [5, 4, 5, 6], [2, 1, 1, 2], [6, 7, 6, 4],
              [8, 10, 9, 8], [10, 8, 9, 8], [1, 2, 3, 2], [3, 1, 2, 1],
              [9, 10, 7, 9], [9, 9, 7, 7]])
# clustering
print("X is: \n", X)

km = KMeans(n_clusters=3).fit(X)  # create 3 clusters
closest, _ = pairwise_distances_argmin_min(km.cluster_centers_, X)
closest  # gives out the closest data point to the center points, an array, the first is the closest to the first cluster center, the second to the second cluster, etc.
print("closest to each cluster: ", closest)

# sort and output the closest data points
km.cluster_centers_  # the center points
centers = np.array(km.cluster_centers_)
num_closest = 4  # number of closest points to cluster center
num_clusters = 3

print("\n...clustering into 3 clusters...")
dist = km.transform(X)
print("distance matrix: \n", dist)
print("\n")
for i in range(0, num_clusters):
    print("cluster ", i, ", center: ", centers[i])
    d = dist[:, i]
    print("d to cluster center", i, ":", d)
    ind = np.argsort(d)[:num_closest]  # indices of the num_closest points nearest this center
    print("closest", num_closest, "points to cluster", i, ":\n", X[ind])
Example #18
File: server.py  Project: light-weaver/misc
                ref_point = np.squeeze(eval(d["DATA"]))
                print(f"Ref point: {ref_point}")
                pref.response = pd.DataFrame(
                    np.atleast_2d(ref_point),
                    columns=pref.content["dimensions_data"].columns,
                )
                color_point = pref.response.values
                _, pref = evolver.iterate(pref)

            objectives_ = evolver.population.objectives

            ### KMEANS
            # fit to n_clusters and find the closest solutions to each cluster's centroid
            kmeans = KMeans(n_clusters=n_clusters, verbose=0)
            kmeans.fit(objectives_)
            closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                                       objectives_)
            labels_kmeans = kmeans.labels_
            print(labels_kmeans)

            labelled_objectives = []
            labelled_variables = []
            for label_n in range(n_clusters):
                labelled_objectives.append(
                    objectives_[labels_kmeans == label_n])
                labelled_variables.append(
                    evolver.population.individuals[labels_kmeans == label_n])

            objectives = objectives_[closest]
            variables = evolver.population.individuals[closest]

            ### DBSCAN
Example #19
print('='*40)
print('KMMCDUE-based ALGO')
print('='*40)
for al_iters in range(al_steps):
    t = time.time()
    # 1) get MCDUEs
    print('Starting AL iteration #', al_iters)
    mcdues = get_mcdues(X_pool_current)
    print('AL iteration #', al_iters, ': got MCDUEs')
    # 2) pick n_pick samples with top mcdues
    km_model = KMeans(n_clusters = sample_each_step, verbose=2)
    inds = np.argsort(mcdues)[::-1]  # descending: highest MCDUE first
    km_model.fit(X_pool_current[inds[:int(0.1*X_train_current.shape[0])]]) # KMeans on top 10%
    print('Fitted KMeans with', sample_each_step, 'clusters')
    inds, _ = pairwise_distances_argmin_min(km_model.cluster_centers_, X_pool_current)
    print(sample_each_step, 'samples picked')
    # 3) add them to the training set
    X_train_current = np.concatenate([X_train_current, X_pool_current[inds, :]])
    y_train_current = np.concatenate([y_train_current, y_pool_current[inds, :]])
    print('Added to training set, new sizes:', X_train_current.shape, y_train_current.shape)
    # 4) remove them from the pool
    X_pool_current = np.delete(X_pool_current, inds, axis = 0)
    y_pool_current = np.delete(y_pool_current, inds, axis = 0)
    print('Deleted from pool set, new sizes:', X_pool_current.shape, y_pool_current.shape)
    # 5) uptrain the NN
    prev_test_error = 1e+10
    sample_selection_time = time.time() - t
    t_big = time.time()
    t = time.time()
    for cnt in range(uptrain_epochs):
Example #20
    book = [sentence for sentence in book
            if len(sentence) > 20][100:200]  # truncate length
    book_sequences = batch_sequence(book, dictionary, maxlen=maxlen)  # padding
    encoded, attention = sess.run([model.get_thought, model.attention],
                                  feed_dict={model.INPUT: book_sequences})

    n_clusters = 10
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans = kmeans.fit(encoded)

    avg = []
    closest = []

    for j in range(n_clusters):
        idx = np.where(kmeans.labels_ == j)[0]
        avg.append(np.mean(idx))

    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                               encoded)
    ordering = sorted(range(n_clusters), key=lambda k: avg[k])

    print('. '.join([book[closest[idx]] for idx in ordering]))

    print("*" * 100)

    indices = np.argsort(attention.mean(axis=0))[::-1]

    rev_dictionary = {v: k for k, v in dictionary.items()}

    print([rev_dictionary[i] for i in indices[:10]])
Example #21
    def K_Means_Clustering(self):
        data = self.KNN_ser.iloc[:, 0:10]
        data['Mode'] = self.KNN_ser.iloc[:, -1].values
        print(data)
        car = data['Mode'] == 'Car'
        metro = data['Mode'] == 'Metro'
        bus = data['Mode'] == 'Bus'
        walking = data['Mode'] == 'Walking'
        still = data['Mode'] == 'Still'
        # print(data[still].iloc[0:900, 0:10])
        car = data[bus].iloc[:, 0:10]
        # print(car.iloc[:, 0:1].values)
        print(car[('Acc','f_1')].values)
        df = pd.DataFrame({
            'f1': car[('Acc','f_1')].values,
            'f2': car[('Acc','f_2')].values,
            'f3': car[('Acc','f_3')].values,
            'f4': car[('Acc','f_4')].values,
            'f5': car[('Acc','f_5')].values,
            'f6': car[('Acc','f_6')].values,
            'f7': car[('Acc','f_7')].values,
            'f8': car[('Acc','f_8')].values,
            'f9': car[('Acc','f_9')].values,
            'f10': car[('Acc','f_10')].values,
            
            })
        print(df)
        num_clusters = 1
        kmeans = KMeans(n_clusters=1).fit(df)
        centers = np.array(kmeans.cluster_centers_)
        m_clusters = kmeans.labels_.tolist()
        print(centers)

        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, df)
        print(closest)

        # print(car.loc[901].values)

        # print(self.car_fft)

        # N = 501
        # T = 10.0 / 900.0
        # x = np.linspace(0.0, N*T, N)
        # xf = fftfreq(N, T)
        # xf = fftshift(xf)
        # yplot = fftshift(self.bus_fft[closest[0]])
        # yplot1 = fftshift(self.walking_fft[closest[1]])
        # plt.plot(xf, 1.0/N * np.abs(yplot))
        # plt.plot(xf, 1.0/N * np.abs(yplot1))
        # plt.show()

        closest_data = []
        for i in range(num_clusters):
            center_vec = centers[i]
            data_idx_within_i_cluster = [ idx for idx, clu_num in enumerate(m_clusters) if clu_num == i ]

            one_cluster_tf_matrix = np.zeros( (  len(data_idx_within_i_cluster) , centers.shape[1] ) )
            for row_num, data_idx in enumerate(data_idx_within_i_cluster):
                # the feature matrix the clustering was fit on (here `df.values`)
                one_row = df.values[data_idx]
                one_cluster_tf_matrix[row_num] = one_row

            closest, _ = pairwise_distances_argmin_min(center_vec.reshape(1, -1), one_cluster_tf_matrix)
            closest_idx_in_one_cluster_tf_matrix = closest[0]
            closest_data_row_num = data_idx_within_i_cluster[closest_idx_in_one_cluster_tf_matrix]
            data_id = df.index[closest_data_row_num]  # no separate id list is defined here; use the frame index

            closest_data.append(data_id)

        closest_data = list(set(closest_data))

        assert len(closest_data) == num_clusters

        d = kmeans.transform(car['Acc'])[:, 1]
        ind = np.argsort(df)[::-1][:50]
        print(ind)
        plt.scatter(df['car'], df['metro'], c= kmeans.labels_.astype(float), s=50, alpha=0.5)
        plt.scatter(centers[:, 0], centers[:, 1], c='red', s=50)
        plt.show()
Example #22
print(n_clusters)
kmeans = KMeans(n_clusters=n_clusters)
kmeans = kmeans.fit(encoded)

# Step-6: Summarization
# The candidate sentence is chosen to be the sentence whose vector representation is closest to the cluster center.

from sklearn.metrics import pairwise_distances_argmin_min

avg = []
for j in range(n_clusters):
    idx = np.where(kmeans.labels_ == j)[0]
    avg.append(np.mean(idx))
    print('INSIDE THE FOR LOOP')
    print(idx)
    print(np.mean(idx))
closest, _ = pairwise_distances_argmin_min(
    kmeans.cluster_centers_, encoded)  # computes the smallest distance to each center
ordering = sorted(range(n_clusters), key=lambda k: avg[k])
summary = ' '.join([sentences[closest[idx]] for idx in ordering])
print([email[closest[1]]])  # leftover debug line; `email` is not defined in this snippet

print('ordering')
print(ordering)
print('closest')
print(closest)
print('closest')
print(closest, )

print(summary)
Example #23
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics.pairwise import pairwise_distances_argmin

data_x = data_2020[[
    'Logged GDP per capita', 'Social support', 'Healthy life expectancy',
    'Freedom to make life choices', 'Generosity', 'Perceptions of corruption'
]]
# need to decide n_clusters
# Elbow method
WSS = {}
for i in range(2, 30):
    k_means = KMeans(init='k-means++', n_clusters=i, n_init=10)
    k_means.fit(data_x)
    k_means_labels, k_means_distance = metrics.pairwise_distances_argmin_min(
        data_x, k_means.cluster_centers_)
    WSS[i] = np.sum(np.sqrt(k_means_distance))
WSS_pd = pd.DataFrame(WSS.values(), index=WSS.keys(), columns=['WSS'])
WSS_pd.plot()
# no clean answer
from sklearn.metrics import silhouette_score

sil = {}
for i in range(2, 30):
    k_means = KMeans(init='k-means++', n_clusters=i, n_init=10)
    k_means.fit(data_x)
    k_means_labels, k_means_distance = metrics.pairwise_distances_argmin_min(
        data_x, k_means.cluster_centers_)
    sil[i] = silhouette_score(data_x, k_means_labels, metric='euclidean')
sil_pd = pd.DataFrame(sil.values(), index=sil.keys(), columns=['silhouette'])
sil_pd.plot()
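WSS is conventionally the sum of squared distances to the nearest center, which KMeans already exposes as inertia_. A shorter version of the elbow loop above using that attribute (same data_x as above) could be:

wss = {}
for k in range(2, 30):
    km_model = KMeans(init='k-means++', n_clusters=k, n_init=10).fit(data_x)
    wss[k] = km_model.inertia_  # within-cluster sum of squared distances
pd.DataFrame(list(wss.values()), index=list(wss.keys()), columns=['WSS']).plot()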
Example #24
else:
    how_many_summaries = 500
summary = [None] * how_many_summaries

for rv in range(how_many_summaries):
    review = df['sent_tokens'].iloc[rv]
    enc_email = get_sent_embedding(review)
    if (len(enc_email) > 0):
        n_clusters = int(np.ceil(len(enc_email)**0.5))
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        kmeans = kmeans.fit(enc_email)
        avg = []
        closest = []
        for j in range(n_clusters):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,\
                                                   enc_email)
        ordering = sorted(range(n_clusters), key=lambda k: avg[k])
        summary[rv] = ' '.join([review[closest[idx]] for idx in ordering])
    else:
        print("This is not a valid review")

if (cmdline):
    print(f'{summary}')
else:
    df_500 = df.iloc[:how_many_summaries]
    print(df_500.head())
    df_500['PredictedSummary'] = summary
    df_500[['Text', 'PredictedSummary']].to_csv('top_500_summary.csv')
def active_cluster_svm_margin(foldname):

    twenty_train_data = getattr(prepare_data, foldname + '_train_data')
    twenty_train_target = getattr(prepare_data, foldname + '_train_target')
    twenty_test_data = getattr(prepare_data, foldname + '_test_data')
    twenty_test_target = getattr(prepare_data, foldname + '_test_target')

    #baseline active learning solution
    alpha = 20 #initial training set
    betha = int(len(twenty_train_data) / alpha) - 2 #number of iterations
    gamma = 20 #sampling volume

    tfidf_transformer = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer())
    ])

    #try to implement silhouette analysis for number of clusters
    #cluster = AgglomerativeClustering(n_clusters=20,affinity='cosine', linkage='complete')
    cluster = KMeans(n_clusters=20)

    unlabeled_train_data = twenty_train_data
    unlabeled_train_target = twenty_train_target

    #print 'start transforming'
    unlabeled_matrix = tfidf_transformer.fit_transform(unlabeled_train_data)

    #print 'start fitting'
    #print datetime.now()
    res = cluster.fit_predict(unlabeled_matrix)
    #print datetime.now()

    #print 'clustering result'
    #print OrderedDict(Counter(res))
    #print res.shape

    closest, _ = pairwise_distances_argmin_min(cluster.cluster_centers_, unlabeled_matrix, metric='cosine')

    #print closest

    '''
    results = defaultdict(list)
    for idx, val in enumerate(res):
        results[val].append(idx)

    take_idx = []
    for cluster_num in range(0, 20):
        idxset = results[cluster_num]
    '''



    #create labeled and unlabeled training set
    #labeled_train_data = twenty_train_data[: alpha]
    #labeled_train_target = twenty_train_target[: alpha]
    #unlabeled_train_data = twenty_train_data[alpha:]
    #unlabeled_train_target = twenty_train_target[alpha:]
    labeled_train_data = []
    labeled_train_target = []
    labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target = diploma_range_sampling(labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target, closest)
    #print labeled_train_data.shape
    baseline_active_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC())
    ])

    baseline_active_clf.fit(labeled_train_data, labeled_train_target)
    predicted = baseline_active_clf.predict(twenty_test_data)
    score = f1_score(twenty_test_target, predicted, average='macro')
    #print 'active cluster svm margin solution'
    scores = baseline_active_clf.decision_function(unlabeled_train_data)
    prob = np.divide(1, np.add(1, np.exp(np.multiply(np.array(scores), -1))))
    diploma_res_print(foldname, len(labeled_train_data), score, np.amax(prob))
    for t in range(1, betha):
        #to do use labeled dataset to train sigmoid

        #f1 for labeled set
        #pred_lab = baseline_active_clf.predict(labeled_train_data)
        #print 'f1 score for labeled:', f1_score(labeled_train_target, pred_lab, average='macro')




        #count p1 p2 p3 p4
        '''
        def count_p(arr):
            p1 = arr.min()
            p4 = arr.max()
            sorted_arr = sorted(arr)
            a1 = [i for i in sorted_arr if i < 0]
            a2 = [i for i in sorted_arr if i > 0]
            p2 = -100500
            p3 = +100500
            if len(a1) > 0:
                p2 = max(a1)
            if len(a2) > 0:
                p3 = min(a2)
            return [p1, p2, p3, p4]

        #prom_arr = []

        norm_scores = LA.norm(scores)
        n_scores = np.divide(scores, norm_scores)

        '''
        '''
        plus_norm = 0
        min_norm = 0
        for line in scores:
            for elem in line:
                if (elem > 0):
                    plus_norm += elem ** 2
                else:
                    min_norm += elem ** 2
        plus_norm = math.sqrt(plus_norm)
        min_norm = math.sqrt(min_norm)
        n_scores = np.array(scores)
        for i in range(0, len(n_scores)):
            for j in range(0, len(n_scores[i])):
                if (n_scores[i][j] > 0):
                    n_scores[i][j] = n_scores[i][j] / plus_norm
                else:
                    n_scores[i][j] = n_scores[i][j] / min_norm
        '''
        '''
        #print n_scores
        prom_arr = []
        for lin in range(0, len(n_scores)):
            prom_arr.append(count_p(n_scores[lin]))

        t_prom_arr = np.transpose(np.array(prom_arr))
        #print t_prom_arr
        #p1 = np.amin(t_prom_arr[0])
        #p2 = np.amax(t_prom_arr[1])
        #p3 = np.amin(t_prom_arr[2])
        #p4 = np.amax(t_prom_arr[3])
        #print 'p1:', p1, 'p2:', p2, 'p3:', p3, 'p4:', p4
        '''



        #prob = np.divide(1, np.add(1, np.exp(np.multiply(np.array(n_scores), -1))))
        #print 'norm matrix min proba:', np.amin(prob), 'norm matrix max proba:', np.amax(prob)

        doc_score = {}
        for i in range(0, len(unlabeled_train_data)):
            last_elems = (sorted(scores[i]))[-2:]
            doc_score[i] = np.abs(last_elems[0] - last_elems[1])

        sorted_doc_score = sorted(doc_score.items(), key=operator.itemgetter(1))


        #print 'sorted doc score minimum active cluster svm margin', sorted_doc_score[0]

        sample_numbers = []
        for i in range(0, gamma):
            sample_numbers = sample_numbers + [sorted_doc_score[i][0]]

        labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target = diploma_range_sampling(labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target, sample_numbers)
        baseline_active_clf.fit(labeled_train_data, labeled_train_target)
        predicted = baseline_active_clf.predict(twenty_test_data)
        score = f1_score(twenty_test_target, predicted, average='macro')

        scores = baseline_active_clf.decision_function(unlabeled_train_data)
        prob = np.divide(1, np.add(1, np.exp(np.multiply(np.array(scores), -1))))
        #print 'min proba:', np.amin(prob), 'max proba:', np.amax(prob)


        diploma_res_print(foldname, len(labeled_train_data), score, np.amax(prob))
Example #26
        print(fila["LSQN"], fila["Grupo"])
    if fila["Grupo"] == 6:
        print(fila["LSQN"], fila["Grupo"])
#Generating table with # of clients and their respective group
labels = kmeans.predict(datos_reelevantes)
colores = ['red', 'green', 'blue', 'cyan', 'yellow', 'pink', 'black']
copia = pd.DataFrame()
copia['LSQN'] = datos_cargados['LSQN'].values
copia['OCLTV'] = datos_cargados['OCLTV'].values
copia['label'] = labels
cantidadGrupo = pd.DataFrame()
cantidadGrupo['color'] = colores
cantidadGrupo['cantidad'] = copia.groupby('label').size()
#cantidadGrupo #Show the table (in a notebook)
#The representative of each group is the client closest to its centroid
cercanos, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                            datos_reelevantes)
cercanos
#We look for the id of the closest clients
usuarios = datos_cargados['LSQN'].values
for fila in cercanos:
    print(usuarios[fila])
#Now associate each centroid with its id, passing the list of centroids as a parameter
labels = kmeans.predict(datos_reelevantes)
colores = ['red', 'green', 'blue', 'cyan', 'yellow', 'pink', 'black']
copia = pd.DataFrame()
copia['LSQN'] = datos_cargados['LSQN'].values
copia['OCLTV'] = datos_cargados['OCLTV'].values
copia['Grupo'] = labels
cantidadGrupo = pd.DataFrame()
cantidadGrupo['color'] = colores
cantidadGrupo['cantidad'] = copia.groupby('Grupo').size()
Example #27
            sentence_vec += w2v.wv[word]
    X.append(sentence_vec)

## ======== clustering Kmean =========
n_clusters_kmean = Number_line
kmeans = KMeans(n_clusters=n_clusters_kmean)
kmeans = kmeans.fit(X)
print(kmeans.labels_)

# ======== Determining the closest point to the center
avg = []
for j in range(n_clusters_kmean):
    idx = np.where(kmeans.labels_ == j)[0]
    avg.append(np.mean(idx))

closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
ordering = sorted(range(n_clusters_kmean), key=lambda k: avg[k])
summary = ' '.join([sentences[closest[idx]] for idx in ordering])

print("\n*** Using Kmean clustering:\n")
print(summary)

## ======== hierarchical clustering =========
n_clusters_hierarchy = Number_line  # number of regions
ward = AgglomerativeClustering(n_clusters=n_clusters_hierarchy)
ward = ward.fit(X)
print(ward.labels_)

# ======== Determining the closest point to the center of each cluster
X_Clusters = []
idx_cluster = []
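The snippet breaks off here; since AgglomerativeClustering exposes no cluster_centers_, the closest point to each hierarchical cluster has to be measured against per-cluster means computed by hand. A sketch of that step, reusing X, ward, n_clusters_hierarchy and sentences from above (the *_h names are ours), might be:

import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min

X_arr = np.asarray(X)
# mean vector of each hierarchical cluster stands in for a "center"
centers_h = np.vstack([X_arr[ward.labels_ == j].mean(axis=0)
                       for j in range(n_clusters_hierarchy)])
closest_h, _ = pairwise_distances_argmin_min(centers_h, X_arr)
summary_h = ' '.join([sentences[i] for i in closest_h])
print("\n*** Using hierarchical clustering:\n")
print(summary_h)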
Example #28
def run_umap(X=None,
             y=None,
             method='unsupervised',
             scaler=None,
             neighbor=10,
             dist=0.1,
             metric='correlation',
             color_code=None,
             annotate_names=None,
             annotate=False,
             test_set=True,
             title=None,
             savefig_path=False,
             X_test=None,
             y_test=None,
             color_code_test=None,
             plot=True):

    reducer = umap.UMAP(
        n_components=dimension,
        n_neighbors=neighbor,
        min_dist=dist,
        metric=metric,
        random_state=seed_value
    )  #, TSNE(n_components=k, random_state=seed_value), PCA(n_components=k, random_state=seed_value)]
    reducer_name = 'umap'  #, 'tsne', 'pca']

    pipeline = Pipeline([
        ('normalization', scaler),
        ('reducer', reducer),
    ])

    y_encoded = LabelEncoder().fit_transform(y)
    if method == 'supervised':
        X_reduced = pipeline.fit_transform(X, y_encoded)
    elif method == 'metric_learning':
        X_reduced = pipeline.fit_transform(X, y_encoded)
        X_reduced_test = pipeline.transform(X_test)

    elif method == 'unsupervised':
        X_reduced = pipeline.fit_transform(X)

    print('running kmeans...')
    # Set k to amount of subreddits
    k = len(np.unique(y))
    # Fit kmeans
    km = KMeans(n_clusters=k, random_state=seed_value).fit(X_reduced)
    # Obtain euclidean distance between centroids
    centers = km.cluster_centers_
    # find centroid labels
    closest, _ = pairwise_distances_argmin_min(centers, X_reduced)
    data = pd.DataFrame(X_reduced, columns=['x1', 'x2'])
    data['label'] = y
    centers_labels = list(data.loc[closest].label)

    # Plot in 2D
    if plot:
        assert dimension == 2
        if method == 'metric_learning':
            # train: first time point
            scatter_plot(X_reduced,
                         y,
                         color_code,
                         method,
                         annotate=annotate,
                         title='First time step (train set)',
                         savefig_path=savefig_path)
            # test: next time points
            scatter_plot(X_reduced_test,
                         y_test,
                         color_code_test,
                         method,
                         annotate=annotate,
                         title=title,
                         savefig_path=savefig_path)

        else:
            scatter_plot(X_reduced,
                         y,
                         color_code,
                         method,
                         annotate=annotate,
                         title=title,
                         savefig_path=savefig_path,
                         centers=centers)
    if method == 'metric_learning':
        return X_reduced, X_reduced_test
    else:
        return X_reduced, centers, centers_labels
def active_cluster_svm_margin_cluster():
    #baseline active learning solution
    alpha = 20 #initial training set
    betha = 600 #number of iterations
    gamma = 20 #sampling volume

    tfidf_transformer = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer())
    ])

    #try to implement silhouette analysis for number of clusters
    #cluster = AgglomerativeClustering(n_clusters=20,affinity='cosine', linkage='complete')
    cluster = KMeans(n_clusters=20)

    unlabeled_train_data = twenty_train_data
    unlabeled_train_target = twenty_train_target

    print('start transforming')
    unlabeled_matrix = tfidf_transformer.fit_transform(unlabeled_train_data)

    print('start fitting')
    print(datetime.now())
    res = cluster.fit_predict(unlabeled_matrix)
    print(datetime.now())

    print('clustering result')
    print(OrderedDict(Counter(res)))
    print(res.shape)

    closest, _ = pairwise_distances_argmin_min(cluster.cluster_centers_, unlabeled_matrix, metric='cosine')

    print(closest)

    '''
    results = defaultdict(list)
    for idx, val in enumerate(res):
        results[val].append(idx)

    take_idx = []
    for cluster_num in range(0, 20):
        idxset = results[cluster_num]
    '''



    #create labeled and unlabeled training set
    #labeled_train_data = twenty_train_data[: alpha]
    #labeled_train_target = twenty_train_target[: alpha]
    #unlabeled_train_data = twenty_train_data[alpha:]
    #unlabeled_train_target = twenty_train_target[alpha:]
    labeled_train_data = []
    labeled_train_target = []
    labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target = diploma_range_sampling(labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target, closest)
    print(labeled_train_data.shape)
    baseline_active_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC())
    ])

    baseline_active_clf.fit(labeled_train_data, labeled_train_target)
    predicted = baseline_active_clf.predict(twenty_test_data)
    score = f1_score(twenty_test_target, predicted, average='macro')
    print('active cluster svm margin cluster solution')
    diploma_res_print(len(labeled_train_data), score)
    for t in range(1, betha):
        sample_numbers = np.array([])
        #to do use labeled dataset to train sigmoid

        scores = baseline_active_clf.decision_function(unlabeled_train_data)
        doc_score = {}
        for i in range(0, len(unlabeled_train_data)):
            last_elems = (sorted(scores[i]))[-2:]
            doc_score[i] = np.abs(last_elems[0] - last_elems[1])
        sorted_doc_score = sorted(doc_score.items(), key=operator.itemgetter(1))
        print('sorted doc score minimum active cluster svm margin cluster', sorted_doc_score[0])
        if (t % 2) == 0:
            sample_numbers = np.array([]) #to add
            for i in range(0, gamma):
                sample_numbers = np.append(sample_numbers, sorted_doc_score[i][0])
        else:
            unlabeled_matrix = tfidf_transformer.fit_transform(unlabeled_train_data)
            print(datetime.now())
            res = cluster.fit_predict(unlabeled_matrix)
            print(datetime.now())
            sample_numbers, _ = pairwise_distances_argmin_min(cluster.cluster_centers_, unlabeled_matrix, metric='cosine')
            print(sample_numbers)

        labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target = diploma_range_sampling(labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target, sample_numbers)
        baseline_active_clf.fit(labeled_train_data, labeled_train_target)
        predicted = baseline_active_clf.predict(twenty_test_data)
        score = f1_score(twenty_test_target, predicted, average='macro')
        diploma_res_print(len(labeled_train_data), score)
Example #30
	def predict(self, X):
		clustering, _ = pairwise_distances_argmin_min(X=X, Y=self.cluster_centers_, metric='cosine')

		self.labels_ = clustering

		return self.labels_
Example #31
def closest_n_index(X, n_c=10):
    kmeans = KMeans(n_clusters=n_c, random_state=0).fit(X)
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
    return closest, kmeans.labels_
    def execute(self, context):
        try:
            mesh = bpy.data.objects[self.currentobject]
        except:
            mesh = context.active_object

        if (mesh is not None):
            only_gisif_colors, k, gisif_name = getGISIFColorsInner(
                context, mesh)
            only_gisif_colors = only_gisif_colors.reshape(
                only_gisif_colors.shape[0], 1)
            #Normalize the gisif colors
            only_gisif_colors = only_gisif_colors / np.sqrt(
                np.sum(only_gisif_colors**2))

            #             k1_list, k2_list, sx, p1_list, p2_list, mean_list, gaussian_list, normals = need_curvatures(mesh);
            #             features = np.hstack((normals, k1_list.reshape(k1_list.shape[0],1), k2_list.reshape(k2_list.shape[0],1), p1_list, p2_list, only_gisif_colors.reshape(only_gisif_colors.shape[0],1)));
            #             mu, transformedFeatures = pcaTransform(context, mesh, features, K=12);

            gisif_colors = only_gisif_colors

            gisif_colors = StandardScaler().fit_transform(gisif_colors)
            count_n = mesh.gisif_markers_n

            gmm = GaussianMixture(n_components=count_n,
                                  covariance_type='full').fit(gisif_colors)
            labels_gmm = gmm.predict(gisif_colors)
            labels_gmm.shape = (labels_gmm.shape[0], 1)

            #             gmm_sorted_indices = np.argsort(gmm.means_.T).flatten();
            #             gmm_sorted_values = np.sort(gmm.means_.T).flatten();

            gmm_sorted_indices = np.array([i for i in range(count_n)])
            gmm_sorted_values = gmm.means_

            print(gmm.means_, gmm_sorted_indices)

            keyindices = []
            print('=' * 40)
            for i in range(count_n):
                gmm_label_index = gmm_sorted_indices[i]
                gmm_value = gmm_sorted_values[gmm_label_index]
                gmm_subset, __ = np.where(labels_gmm == gmm_label_index)
                cluster_values = gisif_colors[gmm_subset]
                print(gmm_value, gmm_value.shape, cluster_values.shape)
                closest, __ = pairwise_distances_argmin_min(
                    gmm_value.reshape(1, -1), cluster_values)
                closest_index = gmm_subset[closest[0]]
                closest_value = gisif_colors[closest_index]
                keyindices.append(closest_index)
                print('-----------------')
#                 print('GMM VALUES (Mean: %f, Closest: %f, Closest Index: %d, In Subset Value: %f, In Subset Index: %d) ::: '%(gmm_value, closest_value, closest_index, cluster_values[closest[0]], closest[0]));

            faces = getMeshFaces(mesh)
            for vid in keyindices:
                uvw = [0.0, 0.0, 0.0]
                faces_rows, faces_column = np.where(faces == vid)
                face_row_index, face_column_index = faces_rows[
                    0], faces_column[0]
                face_row = faces[face_row_index]
                uvw[face_column_index] = 1.0
                vid1, vid2, vid3 = face_row.tolist()
                print(vid1, vid2, vid3)
                co = mesh.data.vertices[face_row[face_column_index]].co
                addConstraint(context,
                              mesh,
                              uvw, [vid1, vid2, vid3],
                              co,
                              faceindex=face_row_index,
                              create_visual_landmarks=False)

            if (mesh.gisif_symmetries):
                print('~' * 40)
                for o_vid in keyindices:
                    #Equation 10 in the paper for finding the symmetry points, where the euclidean distance will be zero for symmetric points
                    delta_gisif_colors = np.sqrt(
                        (only_gisif_colors[o_vid] - only_gisif_colors)**2)
                    #                     delta_gisif_colors[o_vid] = np.finfo(float).max;
                    vidrows, __ = np.where(delta_gisif_colors == 0.0)

                    print(delta_gisif_colors[vidrows])
                    print(vidrows)

                    filtered_vid_values = delta_gisif_colors[vidrows]
                    vid = vidrows[filtered_vid_values.argmin()]
                    print(o_vid, vid)

                    uvw = [0.0, 0.0, 0.0]
                    faces_rows, faces_column = np.where(faces == vid)
                    face_row_index, face_column_index = faces_rows[
                        0], faces_column[0]
                    face_row = faces[face_row_index]
                    uvw[face_column_index] = 1.0
                    vid1, vid2, vid3 = face_row.tolist()
                    print(vid1, vid2, vid3)
                    co = mesh.data.vertices[face_row[face_column_index]].co
                    addConstraint(context,
                                  mesh,
                                  uvw, [vid1, vid2, vid3],
                                  co,
                                  faceindex=face_row_index,
                                  create_visual_landmarks=False)

#             bpy.ops.genericlandmarks.createlandmarks('EXEC_DEFAULT', currentobject=mesh.name, updatepositions = True);
            bpy.ops.genericlandmarks.changelandmarks('EXEC_DEFAULT',
                                                     currentobject=mesh.name)

        return {'FINISHED'}
Example #33
def __innerQmc(outPath='./',
               path='Models/',
               FILE_DOT_OUT='analysis.out',
               CSV_NAME='model.csv',
               MAX_CLUSTERS=10,
               PER_CONNECT=0.5,
               SIL_DAMPING=0.1,
               NORM_METHOD='StandardScaler',
               clustering_names=[
                   'AffinityPropagation', 'DBSCAN', 'KMeans', 'MeanShift',
                   'SpectralClustering', 'Ward'
               ],
               modellerScores=['molpdf', 'DOPE', 'DOPEHR', 'GA341', 'NDOPE'],
               molprobityScores=['outlier', 'allowed'],
               theColors='bgrcmykbgrcmykbgrcmykbgrcmyk',
               saveFig=False,
               molprobity=False):
    '''
    The Quality-Models Clusterizer private method: it performs the analysis,
    calls the other methods, and evaluates the dataset.
    
    PARAMETERS
    ----------
    outPath : string (Default = ./ )
        The path to save the csv and data analysis.
    path : string (Default = ./Models/ )
        The path of the Molprobity PDFs. (All files must be in the same folder and
        their names MUST be in the Modeller output file!).
    FILE_DOT_OUT : string (default = analysis.out)
        Name of output file.
    CSV_NAME : string (default = model.csv)
        Name of the .csv file with data from the Modeller and Molprobity outputs
    MAX_CLUSTERS : int (default = 10)
        Maximum number of clusters for k-dependent methods.
    PER_CONNECT : double (default = 0.5)
        Percentage of the data size used as number of neighbors for Ward.
    SIL_DAMPING : double (default = 0.1)
        Minimum percentage of the silhouette number to be considered an actual increase.
    NORM_METHOD : string (default = StandardScaler)
        Method for normalizing the data. Options : {'StandardScaler', 'MinMax'}
    saveFig : Boolean (default = False)
        Save a figure of all cluster results Yes (True)/No (False).
    clustering_names : List[string] (default = ['AffinityPropagation', 'DBSCAN', 'KMeans',
                                                'MeanShift', 'SpectralClustering', 'Ward'])
        List of Method names. Supported methods are: KMeans, AffinityPropagation, MeanShift,
        SpectralClustering, Ward, DBSCAN.
    modellerScores: List[string] (default = ['molpdf', 'DOPE', 'DOPEHR', 'GA341', 'NDOPE'])
        List of Modeller attributes to evaluate.
        Options : {'molpdf', 'DOPE', 'DOPEHR', 'GA341', 'NDOPE'}
    molprobityScores: List[string] (default = ['outlier', 'allowed'])
        List of Molprobity attributes to evaluate.
        Options : {'outlier', 'allowed'}
    theColors : string (default = bgrcmykbgrcmykbgrcmykbgrcmyk)
        A string in which each letter is a matplotlib color. (b : blue; g : green; r : red;
        c : cyan; m : magenta; y : yellow; k : black; w : white)
    
    RETURNS
    -------
        
    '''

    ##########################################  PREPARING DATA  ################################################
    log.info('\n\n\t\tQuality-Models Clusterizer\n\n')

    if not modellerScores or not any(
            x in ['molpdf', 'DOPE', 'DOPEHR', 'GA341', 'NDOPE']
            for x in modellerScores):
        log.error(
            "modellerScores list has no valid value or its empty.\nValid values are: molpdf, DOPE, DOPEHR, GA341, NDOPE\n\nABORTING EXECUTION"
        )
        exit()

    if not molprobityScores or not any(x in ['outlier', 'allowed']
                                       for x in molprobityScores):
        log.error(
            "molprobityScores list has no valid value or its empty.\nValid values are: outlier, allowed\n\nABORTING EXECUTION"
        )
        exit()

    if molprobity:
        os.system('mkdir Modelos')

    log.info('#######  Preparing data...')
    t0 = time.time()

    clustering_names.sort()

    # colors used later on the plot
    colors = np.array([x for x in theColors])
    colors = np.hstack([colors] * 20)

    plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
    plt.subplots_adjust(left=.05,
                        right=.98,
                        bottom=.1,
                        top=.96,
                        wspace=.2,
                        hspace=.2)

    plot_num = 1

    D = []

    with open(FILE_DOT_OUT, 'r') as content:
        c = content.read()
        c_list = c.split('>>')[1].split('--------')[-1].strip().split('\n')

        for line in c_list:
            v = line.split()
            pdb, var = v[0], v[1::]
            rt = pdb.split('.pdb')[0]

            if bool(re.match('^[-]+$', rt)):
                continue

            pdf = path + rt + '.pdf'
            var = [float(i) for i in var]

            #print(pdf)

            # This code should be uncommented when you have not already generated the 'MolProbity Ramachandran analysis' for the pdb files to be analyzed.
            # It's necessary to install molprobity to run it.
            if molprobity:
                os.system(
                    'java -Xmx256m -cp /home/medina/Documentos/Paper_Agrupamento_Proteinas/molprobity/lib/chiropraxis.jar chiropraxis.rotarama.Ramalyze -pdf '
                    + pdb + ' ' + pdf)
                os.system('mv *.pdf ./Modelos')
            aux_path = './Modelos/' + rt + '.pdf'

            d = dict()

            gen = do_grep(aux_path, 'allowed')
            outputs = [output for output in gen]

            if 'allowed' in molprobityScores:
                try:
                    d['allowed'] = float(
                        re.sub(
                            ',', '.',
                            outputs[0].split('%')[0].split('\'')[1].strip()))
                except:
                    d['allowed'] = 0
            #s = os.popen('pdfgrep allowed  '+pdf).read()
            #p = float(re.sub(',','.',s.split('%')[0].strip()))

            #s = os.popen('pdfgrep outliers  '+pdf).read()
            gen = do_grep(aux_path, 'outliers')
            outputs = [output for output in gen]

            if 'outlier' in molprobityScores:
                try:
                    d['outlier'] = int(outputs[0].split('outliers')[0].split(
                        'were')[-1].strip())
                except:
                    d['outlier'] = 0

            d['pdb'] = rt

            if 'molpdf' in modellerScores:
                d['molpdf'] = var[0]

            if 'DOPE' in modellerScores:
                d['DOPE'] = var[1]

            if 'DOPEHR' in modellerScores:
                d['DOPEHR'] = var[2]

            #if 'GA341' in modellerScores:
            #    d['GA341'   ] = var[3]

            if 'NDOPE' in modellerScores:
                d['NDOPE'] = var[4]

            D.append(d)

    D = pd.DataFrame(D)

    # Find uniform columns
    #    nunique = D.apply(pd.Series.nunique)
    #    cols_to_drop = nunique[nunique == 1].index
    #    D.drop(cols_to_drop, axis=1)

    # Create a csv with data
    D.to_csv(path + CSV_NAME, index=False)

    # Create a csv with data
    #aux =  pd.read_csv(path + CSV_NAME)

    # Concatenate scores
    listOfAtrr = modellerScores + molprobityScores

    allowedScores = ['molpdf', 'DOPE', 'DOPEHR', 'NDOPE', 'outlier', 'allowed']

    # Remove uniform columns
    #    for dropThis in cols_to_drop:
    #        #print(dropThis)
    #        listOfAtrr.remove(dropThis)
    #        allowedScores.remove(dropThis)

    #print(allowedScores)
    # Remove not allowed values
    listOfAtrr = list(filter(lambda i: i in allowedScores, listOfAtrr))
    #print(listOfAtrr)
    X = D[listOfAtrr]

    #print(X)
    pdb_names = D['pdb']

    dt = np.asarray(X)

    #print(dt)

    if NORM_METHOD == 'MinMax':
        # normalize the data in the space of 0 to 1
        for i in range(len(dt[0])):
            # If column is uniform discard it
            #if np.all(dt[0:i] == dt[:i], axis=1):
            #    dt = np.delete(dt, i, axis=1)
            #    #dt = np.delete(dt, i, axis=2)
            #    continue

            if sum(dt[:, i]) != 0:
                #print("\n\nCOLUNA MM: " + str(i))
                #print("\nDIVISOR DO MINMAX: " + str(abs(dt[:, i]).max()))
                dt[:, i] = (dt[:, i] / abs(dt[:, i]).max())**2

                #print(dt[:, i])

    else:
        if NORM_METHOD != 'StandardScaler':
            log.warn(
                "NORM_METHOD must be either MinMax or StandardScaler, running as StandardScaler, since it is the default method"
            )
        # normalize the data to mean 0 and std 1
        for i in range(len(dt[0])):
            mean_c = np.mean(dt[:, i])
            std_c = np.std(dt[:, i])

            #print("\n\nCOLUNA SS: " + str(i))
            #print("\nMEDIA CALC: " + str(mean_c))
            #print("\nDESVIO CALC: " + str(std_c))

            if std_c < 1e-4:
                std_c = 1
            dt[:, i] = ((dt[:, i] - mean_c) / std_c)

            #print(dt[:, i])

    #print(dt)
    # run PCA for the normalized data
    pca = PCA(n_components=2)
    print("\nAntes do PCA\n")
    #print(X)
    print(D[listOfAtrr])
    X = pca.fit(dt).transform(dt)
    print("\nDepois do PCA\n")
    print(X)
    # PCA process results
    results = pca.components_
    print("\nResultados PCA: " + str(results))
    covm = pca.explained_variance_ratio_
    print("\nVariance PCA: " + str(covm))

    if not os.path.exists('./../' + NORM_METHOD + '_pca_results.txt'):
        f = open('./../' + NORM_METHOD + '_pca_results.txt', 'w')

        head_line = 'pbd'
        for c in range(2):
            for at in allowedScores:
                head_line = head_line + ', ' + at + '_coor' + str(c + 1)
        head_line = head_line + ', coef_var_coor1, coef_var_coor2\n'
        print("HEAD LINE PCA: " + head_line)
        f.write(head_line)
        f.close()

    f = open('./../' + NORM_METHOD + '_pca_results.txt', 'a+')
    f.write(
        rt.split('.')[0] + ', ' +
        str([*results[0], *results[1], *covm])[1:-1] + '\n')
    f.close()

    #f = open('./../' + NORM_METHOD + '_corr_mtx.txt', 'a+')
    corr_mtx = pd.DataFrame(X).corr()
    corr_mtxd = pd.DataFrame(dt).corr()
    print("\nCorrelation Matriz: \n")
    print(corr_mtx)
    print(corr_mtxd)
    #f.close()

    # connectivity matrix for structured Ward
    n_neig = int(len(X) * PER_CONNECT)
    connectivity = kneighbors_graph(X, n_neighbors=n_neig, include_self=True)

    # make connectivity symmetric
    affinity = 'euclidean'
    connectivity = 0.5 * (connectivity + connectivity.T)
    connectivity, n_components = cluster.hierarchical._fix_connectivity(
        X, connectivity, affinity)

    # define cutoff for DBSCAN
    if NORM_METHOD == 'MinMax': n_eps = 0.1
    else:
        if NORM_METHOD != 'StandardScaler':
            log.warn(
                "NORM_METHOD must be either MinMax or StandardScaler, running as StandardScaler, since it is the default method"
            )
        n_eps = 2 * 2.57 * 0.05

    t1 = time.time()
    log.info('\tTime spent (preparing data): %f s' % (t1 - t0))

    ##########################################  METHODS DEFINITION ##############################################

    #clustering_names = ['AffinityPropagation', 'DBSCAN', 'KMeans', 'MeanShift', 'SpectralClustering', 'Ward']

    log.info('\n#######  Defining clustering methods...')
    t0 = time.time()

    # create clustering estimators

    clustering_algorithms = []

    if 'AffinityPropagation' in clustering_names:
        try:
            affinity_propagation = cluster.AffinityPropagation(
                damping=0.9)  #,preference=-1)
            clustering_algorithms.append(affinity_propagation)
        except Exception as e:
            log.warn(
                "Problems were found while running Affinity Propagation clustering algorithm for "
                + NORM_METHOD +
                " normalization, skipping its execution.\nProblem: " + str(e))
            print(
                "Problems were found while running Affinity Propagation clustering algorithm for "
                + NORM_METHOD +
                " normalization, skipping its execution.\nProblem: " + str(e))

    if 'DBSCAN' in clustering_names:
        try:
            dbscan = cluster.DBSCAN(eps=n_eps,
                                    min_samples=5,
                                    algorithm='kd_tree',
                                    metric='euclidean')
            clustering_algorithms.append(dbscan)
        except Exception as e:
            log.warn(
                "Problems were found while running DBSCAN clustering algorithm for "
                + NORM_METHOD +
                " normalization, skipping its execution.\nProblem: " + str(e))
            print(
                "Problems were found while running DBSCAN clustering algorithm for "
                + NORM_METHOD +
                " normalization, skipping its execution.\nProblem: " + str(e))

    if 'KMeans' in clustering_names:
        log.info('\n\t(K-means) Searching best k-number... ')
        try:
            k, _, _ = __best_k_of_clusters('KMeans', X, MAX_CLUSTERS)
            log.info('\tk_best = ' + str(k))
            two_means = cluster.KMeans(n_clusters=k, init='k-means++')
            clustering_algorithms.append(two_means)
        except Exception as e:
            log.warn(
                "Problems were found while running KMeans clustering algorithm for "
                + NORM_METHOD +
                " normalization, skipping its execution.\nProblem: " + str(e))
            print(
                "Problems were found while running KMeans clustering algorithm for "
                + NORM_METHOD +
                " normalization, skipping its execution.\nProblem: " + str(e))

    if 'MeanShift' in clustering_names:
        try:
            ms = cluster.MeanShift()
            clustering_algorithms.append(ms)
        except Exception as e:
            log.warn(
                "Problems were found while running MeanShift clustering algorithm for "
                + NORM_METHOD +
                " normalization, skipping its execution.\nProblem: " + str(e))
            print(
                "Problems were found while running MeanShift clustering algorithm for "
                + NORM_METHOD +
                " normalization, skipping its execution.\nProblem: " + str(e))

    if 'SpectralClustering' in clustering_names:
        log.info('\n\t(Spectral) Searching best k-number... ')
        try:
            k, _, _ = __best_k_of_clusters('SpectralClustering', X,
                                           MAX_CLUSTERS)
            #print(k)
            log.info('\tk_best = ' + str(k))
            #spectral = cluster.SpectralClustering(n_clusters=k, eigen_solver='arpack', affinity='nearest_neighbors')
            spectral = cluster.SpectralClustering(n_clusters=k,
                                                  eigen_solver=None,
                                                  random_state=None,
                                                  n_init=10,
                                                  gamma=1.,
                                                  affinity='rbf',
                                                  n_neighbors=10,
                                                  eigen_tol=0.0,
                                                  degree=3,
                                                  coef0=1,
                                                  kernel_params=None)
            clustering_algorithms.append(spectral)
        except Exception as e:
            log.warn(
                "Problems were found while running Spectral clustering algorithm for "
                + NORM_METHOD +
                " normalization, skipping its execution.\nProblem: " + str(e))
            print(
                "Problems were found while running Spectral clustering algorithm for "
                + NORM_METHOD +
                " normalization, skipping its execution.\nProblem: " + str(e))

    if 'Ward' in clustering_names:
        log.info('\n\t(Ward) Searching best k-number... ')
        try:
            k, _, _ = __best_k_of_clusters('Ward',
                                           X,
                                           MAX_CLUSTERS,
                                           connectivity=connectivity)
            log.info('\tk_best = ' + str(k))
            ward = cluster.AgglomerativeClustering(n_clusters=k,
                                                   linkage='ward',
                                                   connectivity=connectivity)
            clustering_algorithms.append(ward)
        except Exception as e:
            log.warn(
                "Problems were found while running Ward clustering algorithm for "
                + NORM_METHOD +
                " normalization, skipping its execution.\nProblem: " + str(e))
            print(
                "Problems were found while running Ward clustering algorithm for "
                + NORM_METHOD +
                " normalization, skipping its execution.\nProblem: " + str(e))

    #clustering_algorithms = [two_means, affinity_propagation, ms, spectral, ward, dbscan]

    t1 = time.time()
    log.info('\n\tTime spent (defining clustering methods): %f s' %
             (t1 - t0))

    ##########################################  CLUSTERS & PLOTS ###############################################

    log.info('\n####### Cluster & Dispersion graphs...')
    t0 = time.time()

    for name, algorithm in zip(clustering_names, clustering_algorithms):
        # predict cluster memberships
        t_fit = time.time()
        algorithm.fit(X)

        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)

        # plot
        plt.subplot(2, (len(clustering_algorithms) + 1) // 2, plot_num)
        plt.title(name, size=18)
        plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)

        if hasattr(algorithm, 'cluster_centers_'):
            centers = algorithm.cluster_centers_
            center_colors = colors[:len(centers)]
            plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)

        plt.text(.99,
                 .01, ('%.2fs' % (time.time() - t_fit)).lstrip('0'),
                 transform=plt.gca().transAxes,
                 size=15,
                 horizontalalignment='right')
        plot_num += 1

    t1 = time.time()
    log.info('\tTime spent (cluster dispersion graphs): %f s' % (t1 - t0))

    ##########################################  OUTPUT FILES ###################################################

    log.info('\n####### Generating output files...')
    t0 = time.time()

    # File containing clusters data
    cluster_date_dir = 'Clusters_Data_' + NORM_METHOD
    if not os.path.isdir(outPath + cluster_date_dir):
        os.makedirs(outPath + cluster_date_dir)

    for name, algorithm in zip(clustering_names, clustering_algorithms):

        # Read labels of the algorithm
        X_labels = algorithm.labels_

        # Try to write the representative model for each cluster to analysis.out
        try:
            # Append results to the modeller file analysis.out
            with open(FILE_DOT_OUT, 'a') as arq:
                if clustering_names[0] == name:
                    arq.writelines(
                        '\n\n##############################################################################################'
                    )
                arq.writelines(
                    '\n>>Clustering results - Representative structure - ' +
                    name)
                arq.writelines('\nCluster\t\tFile_Name\n')

                Vec = []

                # If the clustering method exposes cluster_centers_ and is neither KMeans nor MeanShift
                # (for those two the medoid differs from the centroid) - in this set of methods, AffinityPropagation
                if hasattr(algorithm, 'cluster_centers_') and (
                        name != 'KMeans') and (name != 'MeanShift'):
                    centers = algorithm.cluster_centers_[:]
                    r = int(centers[:, 0].size)

                    for j in range(r):

                        m = __aprx_medoid(X, centers[j, :])
                        nm = __medoid_name(X, pdb_names, [0, 1], [m[0], m[1]])

                        arq.write(str(j) + '\t\t')
                        arq.write(nm + '\n')

                        x_aux = dict()
                        x_aux['Nome_pdb'] = nm  #str(c)
                        x_aux['Cluster'] = j
                        Vec.append(x_aux)

                else:
                    algorithm.cluster_centers_ = []
                    for lb in set(algorithm.labels_):
                        labels = algorithm.labels_
                        data_frame = pd.DataFrame(X)
                        algorithm.cluster_centers_.append(
                            data_frame[labels == lb].mean(axis=0).values)

                    medians, _ = metrics.pairwise_distances_argmin_min(
                        algorithm.cluster_centers_, data_frame.values)

                    j = 0

                    # find medoids
                    for m in medians:
                        nm = __medoid_name(X, pdb_names, [0, 1],
                                           [X[m, 0], X[m, 1]])

                        arq.write(str(j) + '\t\t')
                        arq.write(str(nm) + '.pdb\n')

                        x_aux = dict()
                        c = 'MEDOID:\t' + str(nm) + '.pdb'
                        x_aux['Cluster'] = j
                        x_aux['\tFilename'] = str(c)
                        Vec.append(x_aux)

                        j = j + 1

                if clustering_names[-1] == name:
                    arq.writelines(
                        '##############################################################################################'
                    )

                # create results vector for the clustering method
                for i in range(pdb_names.size):
                    x_aux = dict()
                    c = '\t' + pdb_names[i] + '.pdb'

                    x_aux['Cluster'] = X_labels[i]
                    x_aux['\tFilename'] = str(c)
                    Vec.append(x_aux)

                # sort results vector by n-cluster
                Vec = sorted(Vec, key=lambda k: k['Cluster'])
                Vec = pd.DataFrame(Vec)

                # cluster == -1 marks outlier points (for DBSCAN)
                Vec.loc[Vec.Cluster == -1, ['Cluster']] = 'Outlier'

                # Write .csv results
                Vec.to_csv(outPath + cluster_date_dir + '/' + name +
                           '_Data.csv',
                           index=False)

        except Exception as ex:
            log.error('Error 1: {0}'.format(ex))

    t1 = time.time()
    log.info('\tTime spent (generating output files): %f s' % (t1 - t0))

    log.info('\n\n\t\tThat\'s it!\n\n\n')

    if saveFig:
        plt.savefig(NORM_METHOD + '_dispersion_graph.png')

    plt.show()
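    # Hedged sketch (an assumption, not the project's actual __aprx_medoid /
    # __medoid_name helpers, which are defined elsewhere in the source): one
    # plausible way to map each cluster center to its nearest data point
    # ("approximate medoid") and recover the matching pdb name, given `X` and
    # `pdb_names` as above.
    #
    # def aprx_medoid_names(X, centers, pdb_names):
    #     closest_idx, _ = metrics.pairwise_distances_argmin_min(centers, X)
    #     return [pdb_names.iloc[i] for i in closest_idx]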
예제 #34
0
    def interpret(self,
                  image_paths,
                  num_samples=1000,
                  batch_size=50,
                  save_path='normlime_weights.npy'):
        """
        Main function of the interpreter.

        Args:
            image_paths (list of strs): A list of image filepaths.
            num_samples (int, optional): LIME sampling numbers. Larger number of samples usually gives more accurate interpretation. Default: 1000
            batch_size (int, optional): Number of samples to forward each time. Default: 50
            save_path (str, optional): The .npy path to save the normlime weights. It is a dictionary where the key is label and value is segmentation ids with their importance. Default: 'normlime_weights.npy'

        :return: NormLIME weights: {label_i: weights on features}
        :rtype: dict

        Example::

            def paddle_model(image_input):
                import paddle.fluid as fluid
                class_num = 1000
                model = ResNet50()
                logits = model.net(input=image_input, class_dim=class_num)
                probs = fluid.layers.softmax(logits, axis=-1)
                return probs

            # The model can be downloaded from
            # http://paddle-imagenet-models-name.bj.bcebos.com/ResNet101_pretrained.tar
            # More pretrained models can be found in
            # https://github.com/PaddlePaddle/models/tree/release/1.8/PaddleCV/image_classification

            # 10 images are used here for example, but more images should be used.
            dataset_dir = "assets"
            image_paths = sorted(glob.glob(dataset_dir + "/*.png"))
            image_paths = image_paths[:10]

            normlime = it.NormLIMECVInterpreter(paddle_model,
                                                "assets/ResNet50_pretrained")

            # this can be very slow.
            normlime.interpret(image_paths, num_samples=2000, batch_size=50)


        """
        _, h_pre_models_kmeans = get_pre_models()
        kmeans_model = load_pickle_file(h_pre_models_kmeans)

        # compute lime weights and put in self.all_lime_weights
        for i in tqdm(range(len(image_paths))):
            image_path = image_paths[i]
            self._get_lime_weights(image_path,
                                   num_samples,
                                   batch_size,
                                   auto_save=(i % 10 == 0))

        np.savez(self.filepath_to_save, **self.all_lime_weights)

        # convert superpixel indexes to cluster indexes.
        normlime_weights_all_labels = {}
        for i, image_path in enumerate(image_paths):
            temp = self.all_lime_weights[image_path]
            if isinstance(temp, np.ndarray):
                temp = temp.item()

            fextractor = FeatureExtractor()
            f = fextractor.forward(temp['input'][np.newaxis, ...]).transpose(
                (1, 2, 0))

            X = extract_superpixel_features(f, temp['segmentation'])
            try:
                cluster_labels = kmeans_model.predict(
                    X)  # a list. len = number of sp.
            except AttributeError:
                from sklearn.metrics import pairwise_distances_argmin_min
                cluster_labels, _ = pairwise_distances_argmin_min(
                    X, kmeans_model.cluster_centers_)
            lime_weights = temp['lime_weights']
            pred_labels = lime_weights.keys()
            for y in pred_labels:
                normlime_weights_label_y = normlime_weights_all_labels.get(
                    y, {})
                w_f_y = [abs(w[1]) for w in lime_weights[y]]
                w_f_y_l1norm = sum(w_f_y)

                for w in lime_weights[y]:
                    seg_label = w[0]
                    weight = w[1] * w[1] / w_f_y_l1norm
                    tmp = normlime_weights_label_y.get(
                        cluster_labels[seg_label], [])
                    tmp.append(weight)
                    normlime_weights_label_y[cluster_labels[seg_label]] = tmp

                normlime_weights_all_labels[y] = normlime_weights_label_y
        # compute normlime weights.
        for y in normlime_weights_all_labels:
            normlime_weights = normlime_weights_all_labels.get(y, {})
            for k in normlime_weights:
                normlime_weights[k] = sum(normlime_weights[k]) / len(
                    normlime_weights[k])

        # check normlime
        if len(normlime_weights_all_labels.keys()) < max(
                normlime_weights_all_labels.keys()) + 1:
            print(
                "\n" + \
                "Warning: !!! \n" + \
                "There are at least {} classes, ".format(max(normlime_weights_all_labels.keys()) + 1) + \
                "but the NormLIME has results of only {} classes. \n".format(len(normlime_weights_all_labels.keys())) + \
                "It may have cause unstable results in the later computation" + \
                " but can be improved by computing more test samples." + \
                "\n"
            )

        if os.path.exists(save_path):
            n = 0
            tmp = save_path.split('.npy')[0]
            while os.path.exists(f'{tmp}-{n}.npy'):
                n += 1

            np.save(f'{tmp}-{n}.npy', normlime_weights_all_labels)
        else:
            np.save(save_path, normlime_weights_all_labels)

        return normlime_weights_all_labels
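    # Hedged usage sketch (assumed, not part of the original class): read the
    # saved weights back and list the highest-weighted feature clusters per label.
    #
    # import numpy as np
    # weights = np.load('normlime_weights.npy', allow_pickle=True).item()
    # for label, cluster_weights in weights.items():
    #     top5 = sorted(cluster_weights.items(), key=lambda kv: kv[1], reverse=True)[:5]
    #     print(label, top5)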
예제 #35
0
    st.subheader("Disambiguation Parameter")
    hasil_disambiguation = pd.DataFrame(disambiguation_df)
    st.dataframe(hasil_disambiguation)
    vector = hasil_disambiguation
    SUMMARY_SIZE = st.sidebar.slider("Berapa Jumlah Cluster?", 1,
                                     len(sentences),
                                     len(sentences) // 3)
    n = SUMMARY_SIZE
    avg = []
    n_clusters = len(sentences) // n
    modelkm = KMeans(n_clusters=n_clusters, init='k-means++')
    modelkm = modelkm.fit(vector)
    for j in range(n_clusters):
        idx = np.where(modelkm.labels_ == j)[0]
        avg.append(np.mean(idx))
    closest, _ = pairwise_distances_argmin_min(modelkm.cluster_centers_,
                                               vector)
    ordering = sorted(range(n_clusters), key=lambda k: avg[k])
    col5, col6 = st.beta_columns([1, 1])
    col5.subheader("Closest Cluster")
    col5.dataframe(closest)
    col6.subheader("Ordering Cluster")
    col6.dataframe(ordering)

    st.subheader("Summary Result")
    #     summary = itemgetter(*ordering)(sentences)
    #     hasilRingkasan = []
    #     for sent in summary:
    #         a = ' '.join(sent)
    #         hasilRingkasan.append(a)
    #     st.write(hasilRingkasan)
    #     summary = ' '.join([list_sentences[closest[idx]] for idx in ordering])
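    # Hedged sketch (assumed completion, mirroring the commented-out lines above):
    # take the sentence closest to each cluster centroid, order clusters by their
    # average sentence position, and join them into the extractive summary.
    # Assumes `sentences` still holds the original sentences of the document.
    #
    # summary = ' '.join(str(sentences[closest[idx]]) for idx in ordering)
    # st.write(summary)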
예제 #36
0
#!/usr/bin/env python
# coding: utf-8

# In[ ]:

from sklearn.neighbors import NearestCentroid
from sklearn.metrics import pairwise_distances_argmin_min, pairwise_distances

closest, min_dist = pairwise_distances_argmin_min(X,
                                                  clusterer.cluster_centers_)

# Distance of each point to the 9 clusters
X_np = np.array(X)
centroids = clusterer.cluster_centers_
distance = []
for i in X_np:
    data_point = np.array(i).reshape(1, -1)
    distance_to_point = pairwise_distances(data_point, centroids)
    distance.extend(distance_to_point)

distance_all_centroids_df = pd.DataFrame(distance)
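# Hedged alternative (an assumption, not part of the original snippet): the loop
# above can be replaced by a single pairwise_distances call, which returns the
# full point-to-centroid distance matrix in one shot.
# distance_all_centroids_df = pd.DataFrame(pairwise_distances(X_np, centroids))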
예제 #37
0
# /home/genian/SEQUOIA/branches/CURRENT/src/devops/ml/sklearn/clustering/A595940_F8803.csv
df_pre = pd.read_csv('./train.csv')

dataset = df_pre.values
Hash_Name = dataset[:, 0]
MalwareBenign = dataset[:, 1]
Feature = dataset[:, 2:]

# n_clusters = 8803
print("[+] Kmeans Start n: 30")
kmeans = KMeans(n_clusters=30, random_state=0, n_jobs=-1).fit(Feature)
kmeans_result = (kmeans.labels_).tolist()
centroid = (kmeans.cluster_centers_).tolist()

# closest = index of the sample nearest to each cluster center
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, Feature)

fp1 = open("./output.csv", 'w')  #output file write
fp1.write("Cluster Num." + ", " + "File_Name" + ", " + "Benign_Malware" + "\n")
closest = closest.tolist()

for i in range(0, len(closest)):
    cluster_num = str(i)
    malware_benign = str(MalwareBenign[closest[i]])
    file_name = str(Hash_Name[closest[i]])
    fp1.write(cluster_num + ", " + file_name + ", " + malware_benign + "\n")

# Nearest Malware
if __name__ == "__main__":
    pass
예제 #38
0
#%% Cluster features
feat = impute_nan_inf(feat)

column_linkage = linkage(feat.T, method=method, metric=metric)

clusters = fcluster(column_linkage, n_clusters, criterion='maxclust')

un, n = np.unique(clusters, return_counts=True)

#print(n)

# Get cluster centers
cluster_centers = (feat.T).groupby(by=clusters).mean()
# get the index of the feature closest to the centroid of the cluster
central, _ = pairwise_distances_argmin_min(cluster_centers,
                                           feat.T,
                                           metric='cosine')
assert (np.unique(central).shape[0] == n_clusters)

# get the feature name of the feature closest to the centroid of the cluster
central = feat.columns.to_numpy()[central]

#%% Make dataframe
df = pd.DataFrame(index=feat.columns,
                  columns=['group_label', 'stat_label', 'motion_label'])
df['group_label'] = clusters

stats = np.array(['10th', '50th', '90th', 'IQR'])
df['stat_label'] = [
    np.unique([x for x in stats if x in ft]) for ft in df.index
]
예제 #39
0
    COM_action[i] = data[i + 1] - data[i]
    #print(COM_action[i], "COM action")

#print (np.shape(data))
X = np.array(COM_action[:5000])
print(np.shape(X))
X = X.reshape(5000, 24)  #first entry= X_length
X_embedded = TSNE(n_components=2).fit_transform(X)
print(X_embedded)

n_clusters = 100

kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X_embedded)
labels = kmeans.labels_
cluster_centers = kmeans.cluster_centers_
closest, _ = pairwise_distances_argmin_min(cluster_centers, X_embedded)
print(closest, 'closest to the centroids')

## Getting ID of the individual cluster entries
# id_per_cluster=np.zeros((n_clusters))
# for i in range(n_clusters):
# 	#print ()
# 	for j in (range(len(labels))):
# 		if labels[j]==i:
# 			id_per_cluster[i]=j
# 			break

# print (id_per_cluster)

actions_modified = []
#Now that we have the id's per cluster, let's see what they are.
f = codecs.open(input_vector_file, 'r', 'utf-8')

df, labels_array, array_len = build_word_vector_matrix(total_words_set)

clusters_to_make = int(200)
kmeans_model = KMeans(init='k-means++', n_clusters=clusters_to_make, n_init=10)
kmeans_model.fit(df)

cluster_labels = kmeans_model.labels_
cluster_inertia = kmeans_model.inertia_
cluster_centers = kmeans_model.cluster_centers_
cluster_to_words = find_word_clusters(labels_array, cluster_labels)
for key in cluster_to_words:
    if len(cluster_to_words[key]) != 1:
        clusterList, cluster_label_array, cluster_len = build_word_vector_matrix(
            cluster_to_words[key])
        centroid = []
        centroid.append(cluster_centers[key])
        closest, _ = pairwise_distances_argmin_min(centroid, clusterList)
    else:
        closest = [0]

    cluster_to_words[key] = [cluster_to_words[key][closest[0]]
                             ] + cluster_to_words[key]

with open(pickle_file_cluster, 'w+b') as out_file:
    pickle.dump(cluster_labels, out_file)
    pickle.dump(cluster_inertia, out_file)
    pickle.dump(cluster_centers, out_file)
    pickle.dump(cluster_to_words, out_file)
예제 #41
0
    def summarize(self,
                  corpus,
                  top_k=3,
                  important_words=3,
                  return_cluster=True):
        """
        Summarize list of strings / corpus

        Parameters
        ----------
        corpus: str, list

        top_k: int, (default=3)
            number of summarized strings
        important_words: int, (default=3)
            number of important words
        return_cluster: bool, (default=True)
            if True, also return the clustered top words

        Returns
        -------
        dict: dictionary with the summarized string and the important words
        """
        if not isinstance(top_k, int):
            raise ValueError('top_k must be an integer')
        if not isinstance(important_words, int):
            raise ValueError('important_words must be an integer')
        if not isinstance(return_cluster, bool):
            raise ValueError('return_cluster must be a boolean')
        if not isinstance(corpus, list) and not isinstance(corpus, str):
            raise ValueError('corpus must be a string or a list of strings')
        if isinstance(corpus, list):
            if not isinstance(corpus[0], str):
                raise ValueError('corpus must be list of strings')
        if isinstance(corpus, str):
            corpus = corpus.replace('\n', '.')
            corpus = split_by_dot(corpus)
        else:
            corpus = [c + '.' for c in corpus]
            corpus = ' '.join(corpus)
            corpus = re.findall(r'(?=\S)[^.\n]+(?<=\S)', corpus)

        corpus = [summary_textcleaning(i) for i in corpus]
        sequences = _skip_thought.batch_sequence(corpus,
                                                 self.dictionary,
                                                 maxlen=self._maxlen)
        encoded, attention = self._sess.run(
            [self._logits, self._attention],
            feed_dict={self._X: np.array(sequences)},
        )
        attention = attention.sum(axis=0)
        kmeans = KMeans(n_clusters=top_k, random_state=0)
        kmeans = kmeans.fit(encoded)
        avg = []
        for j in range(top_k):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                                   encoded)
        indices = np.argsort(attention)[::-1]
        top_words = [
            self._rev_dictionary[i] for i in indices[:important_words]
        ]
        ordering = sorted(range(top_k), key=lambda k: avg[k])
        summarized = '. '.join([corpus[closest[idx]] for idx in ordering])
        if return_cluster:
            return {
                'summary': summarized,
                'top-words': top_words,
                'cluster-top-words': cluster_words(top_words),
            }
        return {'summary': summarized, 'top-words': top_words}
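    # Hedged usage sketch (assumed; constructing the model is handled elsewhere):
    # model = ...  # an object exposing summarize(), loaded from the library
    # result = model.summarize(['first sentence.', 'second sentence.', 'third sentence.'],
    #                          top_k=2, important_words=3)
    # print(result['summary'], result['top-words'])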
예제 #42
0
def _kmeans_single_banilla(X, sparsity, n_clusters, centers, max_iter, verbose,
                           tol, debug_directory, debug_header, **kargs):

    n_samples = X.shape[0]
    labels_old = np.zeros((n_samples, ), dtype=int)
    debug_label_on = kargs.get('debug_label_on', True)
    debug_centroid_on = kargs.get('debug_centroid_on', True)

    for n_iter_ in range(1, max_iter + 1):

        _iter_time = time.time()

        labels, distances = pairwise_distances_argmin_min(X,
                                                          centers,
                                                          metric='cosine')
        centers = _update(X, labels, distances, n_clusters)
        inertia = distances.sum()

        if n_iter_ == 1:
            n_diff = n_samples
        else:
            diff = np.where((labels_old - labels) != 0)[0]
            n_diff = len(diff)

        labels_old = labels

        if isinstance(sparsity, str) and sparsity == 'sculley':
            radius = kargs.get('radius', 10)
            epsilon = kargs.get('epsilon', 5)
            centers = _sculley_projections(centers, radius, epsilon)
        elif isinstance(sparsity, str) and sparsity == 'minimum_df':
            minimum_df_factor = kargs.get('minimum_df_factor', 0.01)
            centers = _minimum_df_projections(X, centers, labels_old,
                                              minimum_df_factor)

        _iter_time = time.time() - _iter_time

        degree_of_sparsity = check_sparsity(centers)
        ds_strf = ', sparsity={:.3}'.format(
            degree_of_sparsity) if degree_of_sparsity is not None else ''
        state = 'n_iter={}, changed={}, inertia={}, iter_time={} sec{}'.format(
            n_iter_, n_diff, '%.3f' % inertia, '%.3f' % _iter_time, ds_strf)

        if debug_directory:

            # Log message
            log_path = '{}/{}_logs.txt'.format(debug_directory, debug_header)
            with open(log_path, 'a', encoding='utf-8') as f:
                f.write('{}\n'.format(state))

            # Temporal labels
            if debug_label_on:
                label_path = '{}/{}_label_iter{}.txt'.format(
                    debug_directory, debug_header, n_iter_)
                with open(label_path, 'a', encoding='utf-8') as f:
                    for label in labels:
                        f.write('{}\n'.format(label))

            # Temporal cluster_centroid
            if debug_centroid_on:
                center_path = '{}/{}_centroids_iter{}.csv'.format(
                    debug_directory, debug_header, n_iter_)
                np.savetxt(center_path, centers)

        if verbose:
            print(state)

        if n_diff <= tol:
            if verbose and (n_iter_ + 1 < max_iter):
                print('Early converged.')
            break

    return centers, labels, inertia, n_iter_
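# Hedged sketch (an assumption, not the project's actual `_update`, which is
# defined elsewhere): a plain k-means center update recomputes each center as
# the mean of the rows currently assigned to it, keeping the previous center
# when a cluster receives no points.
def _update_sketch(X, labels, n_clusters, old_centers):
    import numpy as np
    centers = old_centers.copy()
    for c in range(n_clusters):
        mask = labels == c
        if mask.any():
            # works for dense arrays and scipy sparse matrices alike
            centers[c] = np.asarray(X[mask].mean(axis=0)).ravel()
    return centers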
예제 #43
0
    if os.path.exists(ISOMAP_FILE):
        iso_df = isomap_df
    else:
        # need to recompute features
        # my_d = pd.read_csv(FEATURES_FILE, index_col=0)
        print("DataFrame read!")

        iso_df = iso_map()

    print("IsoMap done!")

    if 'cluster' not in iso_df.columns:

        # compute clustering
        centers, labels = clustering(iso_df[['comp1', 'comp2']], show_img=True)
        arr = iso_df[['comp1', 'comp2']].to_numpy()
        closest, _ = pairwise_distances_argmin_min(centers, arr)

        for c in closest:
            print(iso_df.iloc[c])

        # save clusters' labels
        iso_df["cluster"] = labels
        iso_df.to_csv(ISOMAP_FILE, index=True)

    print("First clustering done!\n")

    grp = iso_df.groupby(by=['cluster'])

    # do something
예제 #44
0
def phi(x, clusters):
	argmin, distances = pairwise_distances_argmin_min([x], clusters, metric='euclidean')
	return argmin[0], distances[0], argmin
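# Hedged usage sketch (assumed): nearest-cluster lookup for a single point x.
# nearest_idx, nearest_dist, _ = phi(x_vector, cluster_centers_array)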
예제 #45
0

# Sampling centroids and points nearby


# In[24]:


centroids=kmeans.cluster_centers_


# In[25]:


from sklearn.metrics import pairwise_distances_argmin_min
label, distances = pairwise_distances_argmin_min(X_train, kmeans.cluster_centers_)


# In[26]:


# distances from cluster centroids
clusters['distance']=distances


# In[27]:


a1=[]
#split data based on labels
for i in range(0,c):
예제 #46
0
f.searchOpinions()
ops = f.getOpinions()
corpus = ops['sentenceText'].tolist()

X = vectorizer.fit_transform(corpus)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)


from sklearn.cluster import KMeans
num_clusters = 5
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf)

from sklearn.metrics import pairwise_distances_argmin_min
clusExamp, _ = pairwise_distances_argmin_min(km.cluster_centers_, tfidf)

for i in range(len(clusExamp)):
    examp = corpus[clusExamp[i]]
    print('Cluster {}: '.format(i) + examp)
        

order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(num_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()    

예제 #47
0
File: app.py  Project: dangnm9699/20202_NLP
def score_post():
    request_data = request.json
    plaintext_dir = DATA_DIR + str(request_data["plaintext_dir"])
    manual_summary_dir = MANUAL_DIR + str(request_data["plaintext_dir"])
    print(plaintext_dir, manual_summary_dir)
    modeling = str(request_data["model"])
    method = str(request_data["method"])

    file = open(plaintext_dir, 'r', encoding='utf8')
    plaintext = file.read()
    file.close()
    file = open(manual_summary_dir, 'r', encoding='utf8')
    manual_summary = file.read()
    file.close()

    m_s = process(manual_summary)
    processed = process(plaintext)

    sentences = nltk.sent_tokenize(m_s)

    nsum1 = len(sentences)
    print(nsum1, end=' ')
    summary = ""

    if modeling == 'bert':
        summary = ''.join(
            model(body=processed,
                  ratio=float(nsum1),
                  min_length=0,
                  use_first=False))
        summary = summary.replace('_', ' ')
    if modeling == 'word2vec':
        sentences = nltk.sent_tokenize(plaintext)
        X = []
        for sentence in sentences:
            sentence = ViTokenizer.tokenize(sentence)
            words = sentence.split(" ")
            sentence_vec = np.zeros((300))
            # accumulate the vectors of every in-vocabulary word in the sentence
            for word in words:
                if word in vocab:
                    sentence_vec += vocab[word]
            X.append(sentence_vec)
        kmeans = KMeans(n_clusters=nsum1)
        kmeans.fit(X)

        avg = []
        for j in range(nsum1):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
        ordering = sorted(range(nsum1), key=lambda k: avg[k])
        summary = ' '.join([sentences[closest[idx]] for idx in ordering])
    summary = summary.replace('...', '')
    print(len(summary.strip().split('. ')))
    p, r, f1 = 0, 0, 0

    print(m_s)
    print(summary)

    if method == 'bert':
        p, r, f1 = bert_score_compute(summary, manual_summary, 'vi')
    if method == 'rouge':
        p, r, f1 = rouge_score_compute(summary, manual_summary, 'l')

    resp = {
        "model-summarized": summary,
        "manual-summarized": m_s,
        "paragraph": plaintext,
        "p": p,
        "r": r,
        "f1": f1
    }
    return jsonify(resp)
예제 #48
0
def active_init_1():
    #baseline active learning solution
    alpha = 20 #initial training set
    betha = 100 #number of iterations
    gamma = 20 #sampling volume

    tfidf_transformer = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer())
    ])

    #try to implement silhouette analysis for number of clusters
    #cluster = AgglomerativeClustering(n_clusters=20,affinity='cosine', linkage='complete')
    cluster = KMeans(n_clusters=20)

    unlabeled_train_data = twenty_train_data
    unlabeled_train_target = twenty_train_target

    print('start transforming')
    unlabeled_matrix = tfidf_transformer.fit_transform(unlabeled_train_data)

    print('start fitting')
    print(datetime.now())
    res = cluster.fit_predict(unlabeled_matrix)
    print(datetime.now())

    print('clustering result')
    print(OrderedDict(Counter(res)))
    print(res.shape)

    closest, _ = pairwise_distances_argmin_min(cluster.cluster_centers_, unlabeled_matrix, metric='cosine')

    print(closest)

    '''
    results = defaultdict(list)
    for idx, val in enumerate(res):
        results[val].append(idx)

    take_idx = []
    for cluster_num in range(0, 20):
        idxset = results[cluster_num]
    '''



    #create labeled and unlabeled training set
    #labeled_train_data = twenty_train_data[: alpha]
    #labeled_train_target = twenty_train_target[: alpha]
    #unlabeled_train_data = twenty_train_data[alpha:]
    #unlabeled_train_target = twenty_train_target[alpha:]
    labeled_train_data = []
    labeled_train_target = []
    labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target = diploma_range_sampling(labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target, closest)
    print(labeled_train_data.shape)
    baseline_active_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC())
    ])

    baseline_active_clf.fit(labeled_train_data, labeled_train_target)
    predicted = baseline_active_clf.predict(twenty_test_data)
    score = f1_score(twenty_test_target, predicted, average='macro')
    print('baseline active clustering solution')
    diploma_res_print(len(labeled_train_data), score)
    for t in range(1, betha):
        unlabeled_matrix = tfidf_transformer.fit_transform(unlabeled_train_data)
        print(datetime.now())
        res = cluster.fit_predict(unlabeled_matrix)
        print(datetime.now())
        closest, _ = pairwise_distances_argmin_min(cluster.cluster_centers_, unlabeled_matrix, metric='cosine')
        print(closest)
        labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target = diploma_range_sampling(labeled_train_data, labeled_train_target, unlabeled_train_data, unlabeled_train_target, closest)
        baseline_active_clf.fit(labeled_train_data, labeled_train_target)
        predicted = baseline_active_clf.predict(twenty_test_data)
        score = f1_score(twenty_test_target, predicted, average='macro')
        diploma_res_print(len(labeled_train_data), score)
예제 #49
0
def createAnnotationOutput_mimic_oracle(args, model, data_loader, gold_dict,
                                        output_dict):
    sentence_index = []
    sentence_index_sent = {}
    sentence_index_tokens = defaultdict(list)

    typeTag, typeTagIndex, tokenIndex_map, sentIndex_map, _ = readTrain(
        args.test_path)
    data = {}
    with codecs.open(args.to_annotate, "w",
                     encoding='utf-8') as fout, codecs.open(
                         args.debug, "w", encoding='utf-8') as fdebug:
        sorted_type = sorted(model.approxTypeErrors.items(),
                             key=lambda kv: kv[1],
                             reverse=True)[:args.k]
        fdebug.write("TOKEN\tTYPE\tGOLD\tPRED\tPREDPROB\tERRORS\n")
        for (type, error_percent) in sorted_type:
            token_pred_error = model.predTypeErrors[type]
            token_tag_error = model.approxTokenClassErrors[type]
            sorted_token_tag_error = sorted(token_tag_error.items(),
                                            key=lambda kv: kv[1],
                                            reverse=True)
            errors = []
            maxTag = sorted_token_tag_error[0][0]
            for (tagId, error) in sorted_token_tag_error:
                tag = data_loader.id2tags["POS"][tagId]
                errors.append(tag + "=" + str(error))

            predErrors = []
            sorted_tag_error = sorted(token_pred_error.items(),
                                      key=lambda kv: kv[1],
                                      reverse=True)
            for (tagId, error) in sorted_tag_error:
                tag = data_loader.id2tags["POS"][tagId]
                predErrors.append(tag + "=" + str(error))

            token_indices = list(model.type_tokenIndices[type])
            required_embeddings, gold_token_tags, pred_token_tags = [], [], []
            for token_index in token_indices:
                embedding = model.token_embeddings[token_index]
                gammaVal = model.token_gamma_key[token_index][maxTag]
                prob = np.exp(gammaVal)
                required_embeddings.append(embedding * prob)
                (token_, tag_, sent_index_,
                 relative_index_) = tokenIndex_map[token_index]
                one_sent_ = sentIndex_map[sent_index_]
                pred_path_ = output_dict[" ".join(one_sent_)]
                gold_path_ = gold_dict[" ".join(one_sent_)]
                pred_token_tags.append(pred_path_[relative_index_])
                gold_token_tags.append(gold_path_[relative_index_])

            cluster_center = centeroidnp(np.array(required_embeddings))
            closest, _ = pairwise_distances_argmin_min(
                np.array([cluster_center]), required_embeddings)
            centroid = token_indices[closest[0]]

            (token, tag, sent_index, relative_index) = tokenIndex_map[centroid]

            one_sent = sentIndex_map[sent_index]

            sentence_index.append(sent_index)
            pred_path = output_dict[" ".join(one_sent)]
            gold_path = gold_dict[" ".join(one_sent)]
            sentence_index_sent[sent_index] = (one_sent, gold_path, pred_path)
            sentence_index_tokens[sent_index].append(relative_index)
            data[token] = {
                "tokenindices": token_indices,
                "weighted": required_embeddings,
                "centroid_center": cluster_center,
                "pred": pred_token_tags,
                "gold": gold_token_tags
            }
            fdebug.write(
                str(centroid) + "\t" + data_loader.id_to_word[type] + "\t" +
                gold_path[relative_index] + "\t" + pred_path[relative_index] +
                "\t" + "@".join(predErrors) + "\t" + "@".join(errors) + "\n")

        covered = set()
        count = 0

        with open("./" + args.model_name + "approx.pkl", "wb") as f:
            pickle.dump(data, f)

        with codecs.open(args.to_annotate, "w", encoding='utf-8') as fout:

            for sent_index in sentence_index:
                if sent_index not in covered:
                    covered.add(sent_index)
                    (sent, gold_path,
                     pred_path) = sentence_index_sent[sent_index]
                    path = deepcopy(pred_path)
                    for token_index in sentence_index_tokens[sent_index]:
                        path[token_index] = "UNK"

                    for token, tag_label, gold_tag in zip(
                            sent, path, gold_path):
                        fout.write(token + "\t" + tag_label + "\t" + gold_tag +
                                   "\n")
                        if tag_label == "UNK":
                            count += 1

                    fout.write("\n")
예제 #50
0
ax2.legend(labels)
ax2.grid()
fig2.savefig("/Users/angieryu2202/Desktop/3d_PCA_kmeans.png")


# ## (4) Articles Closest to Centroids for Each Cluster 

# In[37]:


import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min
#km = KMeans(n_clusters=31).fit(topic_distribution_df)
# This function computes for each row in X, the index of the row of Y which is closest (according to the specified distance). The minimal distances are also returned
kmeans_closest, _ = pairwise_distances_argmin_min(clusters.cluster_centers_, topic_distribution_df)
print(kmeans_closest)
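# Hedged toy illustration (an assumption, not from the notebook): for each row
# of A, pairwise_distances_argmin_min returns the index of the closest row of B
# and the corresponding minimal distance.
_A = np.array([[0.0, 0.0], [5.0, 5.0]])
_B = np.array([[1.0, 0.0], [4.0, 5.0], [10.0, 10.0]])
_idx, _dist = pairwise_distances_argmin_min(_A, _B)  # _idx -> [0, 1], _dist -> [1.0, 1.0]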


# In[38]:


kmeans_closest_titles = []
for index in kmeans_closest:
    kmeans_closest_titles.append(caregiver_df.title[index])
    print(str(index)+": "+str(caregiver_df.title[index]))


# In[39]:

예제 #51
0
        tffss.append(tfs)
        labels_resultados.append(kmeans.labels_)

    best_s = np.argmax(silhouette_scores)
    best_ch = np.argmax(ch_scores)
    print("Mejor k según silhouette_score: ", best_s + num_initial_clusters)
    print("Mejor k según calinski_harabaz_score: ",
          best_ch + num_initial_clusters)
    true_k = max(best_s, best_ch)

    print("Guardando ", true_k + num_initial_clusters, "-kmeans...")
    clusters = resultados[true_k]
    centros_clusters = centros_resultados[true_k]
    kmeans.labels_ = labels_resultados[true_k]

    closest, _ = pairwise_distances_argmin_min(centros_clusters, tfidf_matrix)

    with open(
            path + "resumenes-temas-relacionados-" +
            str(true_k + num_initial_clusters) + "-means.txt", 'w') as f:
        with open(
                path + "centros-temas-relacionados-" +
                str(true_k + num_initial_clusters) + "-means.txt", 'w') as f2:
            for cluster in range(len(clusters)):
                f.write("Cluster " + str(cluster + 1) + ":  " +
                        sentences_centers[closest[cluster]] + "\n")
                f2.write("Cluster " + str(cluster + 1) + ":  " +
                         sentences_centers[closest[cluster]] + "\n")
                cl = []
                for i, sentence in enumerate(clusters[cluster]):
                    c = indices_centros[sentence]