Example #1
    def __init__(self, k=7) -> None:

        self.k = k
        self.distance = nltk.cluster.cosine_distance
        self.model = KMeansClusterer(self.k,
                                     self.distance,
                                     avoid_empty_clusters=True)
Example #2
def spectral_clustering(A, nb_clusters, laplacian_normalization = None, algo = None):
    """
    Compute the clusters assignement from spectral clustering algorithm
    steps :
    * Compute laplacian
    * Compute k smaller eigenvalues and associated eigenvectors
    * Train a kmean on this vectors
    * Apply this kmean to the Laplacian
    """
    if algo not in ('sph', None):
        raise ValueError('Algorithm {} unknown'.format(algo))

    L = get_laplacian(A, laplacian_normalization)
    L = scipy.sparse.csr_matrix(L, dtype=np.float64)
    v, w = eigsh(L, nb_clusters, which='SM')

    if algo is None:
        km = KMeans(n_clusters=nb_clusters)
        km.fit(np.transpose(w))
        clusters = km.predict(L)

    elif algo == 'sph':
        clusterer = KMeansClusterer(nb_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25)
        cluster = clusterer.cluster(np.transpose(w), True)
        vectors = [np.transpose(L[i, :].toarray()[0]) for i in range(0, L.shape[1])]
        clusters = [clusterer.classify(vector) for vector in vectors]
    return clusters
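A minimal usage sketch for the function above; get_laplacian is not shown in this snippet, so the unnormalized stand-in below (L = D - A) is an assumption, not the original helper:

import numpy as np
import scipy.sparse
from scipy.sparse.linalg import eigsh
from sklearn.cluster import KMeans

def get_laplacian(A, normalization=None):
    # hypothetical stand-in for the missing helper: unnormalized L = D - A
    A = np.asarray(A)
    return np.diag(A.sum(axis=1)) - A

# two obvious communities, {0, 1, 2} and {3, 4, 5}, joined by one edge
A = np.zeros((6, 6))
for i, j in [(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5), (2, 3)]:
    A[i, j] = A[j, i] = 1.0
print(spectral_clustering(A, 2))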
Example #3
def nltk_clustering(n, filename):
    global vectors
    global names
    global repeats
    # Clustering
    print("Begin clustering, n = {:d}...".format(n))

    clusterer = KMeansClusterer(n, cosine_distance, repeats=repeats)
    clustered = clusterer.cluster(vectors, assign_clusters=True, trace=False)
    clustered = np.array(clustered)

    index = sorted(clustered)
    # print(clustered.argsort())
    names = list(names[clustered.argsort()])

    # write result to file
    print("Saving result to file...")
    output = filename[:-4] + "_" + str(n) + "_clustered.txt"
    with open(output, "w") as f:
        current_idx = None
        for name, idx in zip(names, index):
            if current_idx != idx:
                current_idx = idx
                f.write("\nCluster {:d} (description: )\n".format(current_idx))
            f.write(name + "\n")
    print("Clustered result saved in {0}".format(output))
Example #4
def clusterer_nltk_kmeans(X, n_clusters):
    # "_args": [{"type": "numpy.ndarray","dtype": "float32"} ],
    #   "_return": [{ "type": "numpy.ndarray","dtype": "int32"}

    # in this case we want to try different numbers of clusters, so it is a parameter
    import nltk
    import numpy as np
    from nltk.cluster.kmeans import KMeansClusterer
    print('clusterer_nltk_kmeans')

    clusterAlgLabelAssignmentsNK = None
    # X = XY[0]
    cmtVectors = X  # XY[1]
    if isinstance(cmtVectors, np.ndarray) and len(cmtVectors) > 0:
        dt = cmtVectors.dtype
        if dt.type in (np.float32, np.float64):
            clusterAlgNK = KMeansClusterer(
                n_clusters,
                distance=nltk.cluster.util.cosine_distance,
                repeats=25,
                avoid_empty_clusters=True)
            clusterAlgLabelAssignmentsNK = clusterAlgNK.cluster(
                cmtVectors, assign_clusters=True)

    XY = (X, clusterAlgLabelAssignmentsNK)
    return XY
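A quick usage sketch for clusterer_nltk_kmeans on synthetic float32 vectors (shapes are illustrative):

import numpy as np

X = np.random.rand(50, 8).astype(np.float32)
_, labels = clusterer_nltk_kmeans(X, n_clusters=3)
print(labels[:10])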
Example #5
def nltk_manhattan_kmeans(encoded_img):
    from scipy.spatial.distance import cityblock
    from nltk.cluster.kmeans import KMeansClusterer

    kclusterer = KMeansClusterer(2, distance=cityblock, repeats=10)
    assigned_clusters = kclusterer.cluster(encoded_img, assign_clusters=True)

    print_labels(assigned_clusters)
Example #6
def nltk_euclidean_kmeans(encoded_img):
    from nltk.cluster.util import euclidean_distance
    from nltk.cluster.kmeans import KMeansClusterer

    kclusterer = KMeansClusterer(2, distance=euclidean_distance, repeats=10)
    assigned_clusters = kclusterer.cluster(encoded_img, assign_clusters=True)

    print_labels(assigned_clusters)
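Examples #5 and #6 differ only in the distance callable they pass in; KMeansClusterer accepts any function of two vectors. A self-contained sketch of the same idea, without the print_labels helper:

import numpy as np
from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster.util import euclidean_distance

points = np.vstack([np.random.rand(5, 2), np.random.rand(5, 2) + 5.0])
kclusterer = KMeansClusterer(2, distance=euclidean_distance, repeats=10)
print(kclusterer.cluster(points, assign_clusters=True))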
Example #7
def spherical_clustering_from_adjency(A, nb_clusters):
    """
    Spectral clustering with spherical kmeans
    """
    A = scipy.sparse.csr_matrix(A, dtype=np.float64)
    v, w = eigsh(A, nb_clusters, which='LM')
    clusterer = KMeansClusterer(nb_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25)
    cluster = clusterer.cluster(np.transpose(w), True)
    vectors = [np.transpose(A[i, :].toarray()[0]) for i in range(0, A.shape[1])]
    clusters = [clusterer.classify(vector) for vector in vectors]
    return clusters
Example #8
def new_cluster(filepath):
    NUM_CLUSTERS = 4
    data = get_data(filepath)
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 # note: np.max(a - b) is not symmetric, so this
                                 # custom callable is not a true metric
                                 distance=lambda a, b: np.max(a - b),
                                 repeats=1000)
    labels = kclusterer.cluster(data, assign_clusters=True)
    print("Showing the cluster results")
    for cluster_id in range(NUM_CLUSTERS):
        for i in range(len(data)):
            if labels[i] == cluster_id:
                print("Joint : ", i + 1, " Joint Values: ", data[i],
                      " Cluster Id: ", cluster_id)
Example #9
    def Kmeans(self, vocabulary, vectors, n_cluster):
        """K-means clustering based on cosine similarity of word2vec vectors."""
        kclusterer = KMeansClusterer(
            n_cluster,
            distance=nltk.cluster.util.cosine_distance,
            repeats=10,
            avoid_empty_clusters=True)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
        dic = defaultdict(list)
        for c, w in zip(assigned_clusters, vocabulary):
            dic[c].append(w)
        return assigned_clusters, dic
Example #10
def ClusterItems(data_file, items_bias_file, index_file, clusters_file,
                 centroids_file):

    data = np.genfromtxt(data_file)
    popular_items = np.genfromtxt(index_file).astype('int')
    data = data[popular_items]
    items_bias = np.genfromtxt(items_bias_file)
    important_items = np.where(np.abs(items_bias[popular_items]) < 0.2)[0]
    kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=cosine_distance)
    print(NUM_CLUSTERS, important_items.shape)
    print("end", data.shape)
    clusters = kclusterer.cluster(data[important_items], assign_clusters=True)
    np.savetxt(centroids_file, kclusterer.means())
    np.savetxt(clusters_file, clusters)
Example #11
def main():
    getFiles()
    tf_idf()
    num_clusters = int(sys.argv[2])
    kclusterer = KMeansClusterer(num_clusters,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(wordvec, assign_clusters=True)
    clustersDict = {}
    for i in range(num_clusters):
        clustersDict[i] = []
    for i in range(len(assigned_clusters)):
        clustersDict[assigned_clusters[i]].append(fileList[i])
    printClustersInFormat(clustersDict)
Example #12
def cluster(clusterType, vectors, y):
    if (clusterType == "KMeans"):
        kclusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)

    elif (clusterType == "GMM"):
        GMM = GaussianMixture(n_components=NUM_CLUSTERS)
        assigned_clusters = GMM.fit_predict(vectors)

    elif (clusterType == "SVM"):
        classifier = SVC(kernel='rbf', gamma='auto', random_state=0)
        #cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)

    elif (clusterType == "T2VH"):
        ret = hierarchical.ward_tree(vectors, n_clusters=NUM_CLUSTERS)
        children = ret[0]
        n_leaves = ret[2]
        assigned_clusters = hierarchical._hc_cut(NUM_CLUSTERS, children,
                                                 n_leaves)

    elif (clusterType == "RandomForest"):
        classifier = RandomForestClassifier()
        #cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)
        # classifier.fit(vectors, y)
        # assigned_clusters=classifier.predict(vectors)

    elif (clusterType == "DecisionTree"):
        classifier = DecisionTreeClassifier()
        #cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)
        # classifier.fit(vectors, y)
        # assigned_clusters=classifier.predict(vectors)

    elif (clusterType == "LogisticRegression"):
        classifier = sklearn.linear_model.LogisticRegression()
        #cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)
        # classifier.fit(vectors, y)
        # assigned_clusters=classifier.predict(vectors)

    else:
        print(clusterType, " is not a predefined cluster type.")
        return
    return assigned_clusters
Example #13
def cluster(folderName, vectorsize, clusterType):
    corpus = loadXES.get_doc_XES_tagged(folderName + '.xes')
    print('Data Loading finished, ', str(len(corpus)), ' traces found.')

    model = gensim.models.Doc2Vec.load('output/' + folderName + 'T2VVS' +
                                       str(vectorsize) + '.model')

    vectors = []
    NUM_CLUSTERS = 5
    print("inferring vectors")
    for doc_id in range(len(corpus)):
        inferred_vector = model.infer_vector(corpus[doc_id].words)
        vectors.append(inferred_vector)
    print("done")

    if (clusterType == "KMeans"):
        kclusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
    elif (clusterType == "HierWard"):
        ward = AgglomerativeClustering(n_clusters=NUM_CLUSTERS,
                                       linkage='ward').fit(vectors)
        assigned_clusters = ward.labels_
    elif clusterType == "OCSVM":
        ocsvm = OneClassSVM()
        assigned_clusters = ocsvm.fit_predict(vectors)

    else:
        print(
            clusterType,
            " is not a predefined cluster type. Please use 'KMeans' or 'HierWard', or create a definition for ",
            clusterType)
        return
    trace_list = loadXES.get_trace_names(folderName + ".xes")
    clusterResult = {}
    for doc_id in range(len(corpus)):
        clusterResult[trace_list[doc_id]] = assigned_clusters[doc_id]

    resultFile = open(
        'output/' + folderName + 'T2VVS' + str(vectorsize) + clusterType +
        '.csv', 'w')
    for doc_id in range(len(corpus)):
        resultFile.write(trace_list[doc_id] + ',' +
                         str(assigned_clusters[doc_id]) + "\n")

    resultFile.close()
    print("done with ", clusterType, " on event log ", folderName)
Example #14
class ClusteringPairwise:
    def __init__(self, users_vecs_train_file, centroid_file, clustering_file,
                 num_clusters, n_iteration, mode):
        self.mode = mode
        self.num_clusters = num_clusters
        self.users = np.genfromtxt(users_vecs_train_file)
        self.tree = LasyTree(np.arange(self.users.shape[0]))
        self.centroids = np.genfromtxt(centroid_file)
        clusters_ = np.genfromtxt(clustering_file).astype('int')
        self.clusters = {}
        for i in range(num_clusters):
            self.clusters[i] = []
        for i in range(len(clusters_)):
            self.clusters[clusters_[i]].append(i)
        self.n_iteration = n_iteration
        self.kclusterer = KMeansClusterer(num_clusters,
                                          distance=cosine_distance,
                                          initial_means=list(self.centroids))

    def RecieveQuestions(self, item_vecs, user, user_estim, n_points,
                         item_bias, ratings):
        clusters_ = [self.kclusterer.classify(item) for item in item_vecs]
        clusters = {}
        for i in range(self.num_clusters):
            clusters[i] = []
        for i in range(len(clusters_)):
            clusters[clusters_[i]].append(i)
        a = np.argsort(clusters_)
        return AllAlgorithm(self.users, self.n_iteration, self.centroids,
                            item_vecs, item_bias, user, clusters, self.tree,
                            self.mode, ratings)
Example #15
def clustering(dataframe, repeats, myStopwords):
    num_clusters = 5
    # define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(stop_words=myStopwords)
    # Only process the content, not the title
    tfidf_matrix = tfidf_vectorizer.fit_transform(dataframe["Content"])
    # Convert it to an array
    tfidf_matrix_array = tfidf_matrix.toarray()
    # Run K-means with cosine distance as the metric
    kclusterer = KMeansClusterer(num_clusters,
                                 distance=cosine_distance,
                                 repeats=repeats)
    # Output to assigned_clusters
    assigned_clusters = kclusterer.cluster(tfidf_matrix_array,
                                           assign_clusters=True)
    # cluster_size counts how many elements each cluster contains
    cluster_size = [0, 0, 0, 0, 0]
    # Create a 5x5 array and fill it with zeros
    matrix = [[0 for x in range(5)] for y in range(5)]
    # For every category
    for category in categories:
        # For every article
        for row in range(0, len(assigned_clusters)):
            # Compare the cluster number with the category number
            if assigned_clusters[row] == categories.index(category):
                ind = categories.index(dataframe.iloc[row, 4])
                matrix[categories.index(category)][ind] += 1
    # Count how many elements each cluster contains
    for row in range(0, len(assigned_clusters)):
        cluster_size[assigned_clusters[row]] += 1
    for x in range(5):
        for y in range(5):
            # Calculate frequency
            matrix[x][y] /= cluster_size[x]
            # Only keep the 2 first decimal digits
            matrix[x][y] = format(matrix[x][y], '.2f')
    # Output to a .csv file
    out_file = open("output/clustering_KMeans.csv", 'w')
    wr = csv.writer(out_file, delimiter="\t")
    # build a new header row instead of mutating the global categories list
    newCategories = ["\t"] + categories
    wr.writerow(newCategories)
    for x in range(5):
        newMatrix = ["Cluster " + str(x + 1)] + matrix[x]
        wr.writerow(newMatrix)
    out_file.close()
Example #16
 def __init__(self, users_vecs_train_file, centroid_file, clustering_file,
              num_clusters, n_iteration, mode):
     self.mode = mode
     self.num_clusters = num_clusters
     self.users = np.genfromtxt(users_vecs_train_file)
     self.tree = LasyTree(np.arange(self.users.shape[0]))
     self.centroids = np.genfromtxt(centroid_file)
     clusters_ = np.genfromtxt(clustering_file).astype('int')
     self.clusters = {}
     for i in range(num_clusters):
         self.clusters[i] = []
     for i in range(len(clusters_)):
         self.clusters[clusters_[i]].append(i)
     self.n_iteration = n_iteration
     self.kclusterer = KMeansClusterer(num_clusters,
                                       distance=cosine_distance,
                                       initial_means=list(self.centroids))
Example #17
def recluster(df, cl, clusters, n_clusters):
    lbls = cl.labels_
    mask = np.array([False for i in range(len(lbls))])
    for c in clusters:
        mask |= lbls == c
    subpipe, results = data_pipeline(df[mask])

    # use cosine similarity via the NLTK clustering implementation;
    # keep a KMeans cluster object as a carrier for consistency
    subcl = cluster(results, n_clusters)
    kclusterer = KMeansClusterer(n_clusters, distance=nltk.cluster.util.cosine_distance, repeats=50)
    assigned_clusters = kclusterer.cluster(results, assign_clusters=True)
    #assign new cluster labels and cluster centroids
    subcl.labels_ = np.array(assigned_clusters)
    subcl.cluster_centers_ = np.array(kclusterer.means())
    
    return subpipe, subcl, results, df[mask]
Example #18
    def cluster_docs(self):

        vectors = []
        used_lines = []

        for doc, id in self.es_docs():
            tokens = text_cleaner.clean_tokens(doc)
            if tokens != 'NC' and len(tokens) > 200:
                used_lines.append(tokens)
                vectors.append(self.model.infer_vector(tokens))

        kclusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)

        print("done")
Example #19
class kmeans_cosine(object):
    def __init__(self,k):
        self.k = k
        self.model = KMeansClusterer(k, distance=nltk.cluster.util.cosine_distance, repeats=25)

    def build(self, X, p):
        """Cluster the rows of X and classify point p into one of the clusters."""
        data = scipy.sparse.csr_matrix(X).toarray()
        kclusters = np.array(self.model.cluster(data, assign_clusters=True))
        prediction = self.model.classify(p)
        cluster_id = kclusters == prediction
        return cluster_id, prediction

    def save(self, filename="model2.pkl"):
        """Pickle the trained clusterer to disk."""
        with open(filename, 'wb') as f:
            pickle.dump(self.model, f)
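A small usage sketch for the kmeans_cosine wrapper on synthetic data (shapes and the module-level pickle/scipy/nltk imports are assumed):

import numpy as np

X = np.abs(np.random.rand(30, 4))   # 30 sample vectors
p = np.abs(np.random.rand(4))       # query point
km = kmeans_cosine(k=3)
mask, pred = km.build(X, p)
print("query falls in cluster", pred, "with", mask.sum(), "members")
km.save()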
Example #20
def get_cluster(tfidf_arr, k):
    """
    K-means聚类
    :param tfidf_arr:
    :param k:
    :return:
    """
    kmeans = KMeansClusterer(num_means=k,
                             distance=cosine_distance,
                             avoid_empty_clusters=True)  # 分成k类,使用余弦相似分析
    kmeans.cluster(tfidf_arr)

    # get the cluster assignment for each vector
    kinds = pd.Series([kmeans.classify(i) for i in tfidf_arr])
    with open('/you_filed_algos/prod_kudu_data/ClusterText.txt',
              'a+',
              encoding='utf-8') as fw:
        for i, v in kinds.items():
            fw.write(str(i) + '\t' + str(v) + '\n')
Example #21
	def __init__(self, model):
		"""
		@param model: (type=Word2Vec model)
		"""
		self.model = model #store the Word2Vec model object in case of future use
		self.word_to_vec = {word:model.wv[word] for word in model.wv.vocab} #mapping from word strings to vectors
		self.vectors = [model.wv[word] for word in model.wv.vocab] 
		clusterer = KMeansClusterer(num_means=5, distance=cosine_distance) #the object that will cluster our vectors, num_means will eventually be parameterized
		clusterer.cluster_vectorspace(self.vectors)
		self.central_words = []

		#find closest words to centroids (note: _means is a private NLTK attribute)
		for centroid in clusterer._means:
			closest = None
			for word in self.word_to_vec:
				vector = self.word_to_vec[word]
				if not closest or (cosine_distance(vector, centroid) < cosine_distance(closest[1], centroid)):
					closest = (word, self.word_to_vec[word])
			self.central_words.append(closest)
		self.centroids = clusterer._means
Example #22
class KMeansClusters(BaseEstimator, TransformerMixin):
    def __init__(self, k=7) -> None:

        self.k = k
        self.distance = nltk.cluster.cosine_distance
        self.model = KMeansClusterer(self.k,
                                     self.distance,
                                     avoid_empty_clusters=True)

    def fit(self, data, labels=None):
        return self

    def transform(self, data):
        return self.model.cluster(data, assign_clusters=True)
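Because KMeansClusters follows the scikit-learn transformer protocol, it composes with the usual fit/transform calls; a sketch assuming documents were already vectorized to dense arrays:

import numpy as np

docs = np.random.rand(40, 6)   # 40 pre-vectorized "documents"
labels = KMeansClusters(k=4).fit(docs).transform(docs)
print(labels[:10])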
Example #23
    def __init__(self, k):
        self.k = k
        self.model = KMeansClusterer(k, distance=nltk.cluster.util.cosine_distance, repeats=25)
Example #24
def bbox_iou(x1, y1, w1, h1, x2, y2, w2, h2):
    # intersection rectangle: max of the left/top edges, min of the right/bottom edges
    x_a = torch.max(x1 - w1 / 2.0, x2 - w2 / 2.0)
    y_a = torch.max(y1 - h1 / 2.0, y2 - h2 / 2.0)
    x_b = torch.min(x1 + w1 / 2.0, x2 + w2 / 2.0)
    y_b = torch.min(y1 + h1 / 2.0, y2 + h2 / 2.0)

    intersection = torch.clamp(x_b - x_a, min=0) * torch.clamp(y_b - y_a,
                                                               min=0)
    union = w1 * h1 + w2 * h2 - intersection

    return intersection / (union + 1e-6)
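
# Quick sanity check for the IoU above (illustrative values): two identical
# unit boxes should yield an IoU of ~1.
# t = torch.tensor
# print(bbox_iou(t(0.), t(0.), t(1.), t(1.), t(0.), t(0.), t(1.), t(1.)))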


kclusterer = KMeansClusterer(args.num_bbox,
                             distance=nltk.cluster.util.cosine_distance,
                             repeats=25)
assigned_clusters = kclusterer.cluster(data, assign_clusters=True)

kmeans_wh = KMeans(n_clusters=args.num_bbox)
kmeans_wh.fit(train_wh)
bbox_priors = kmeans_wh.cluster_centers_
np.save('priors.npy', bbox_priors)
bbox_priors = torch.from_numpy(bbox_priors).cuda()

# Set up the network

features = DenseNet(growth_rate=8,
                    block_config=(4, 8, 16, 32),
                    activation=nn.LeakyReLU(inplace=True),
                    input_channels=3)
Example #25
        phrase = ii[1]
        if score < 0.7:
            break
        try:
            arr = numpy.append(arr,
                               numpy.reshape(model.wv.word_vec(phrase),
                                             (1, 100)),
                               axis=0)
        except KeyError:
            pass
        else:
            embedded_phrases.append(phrase)

    print('number of sample points:', len(embedded_phrases))

    kmeans = KMeansClusterer(6, nltk.cluster.util.cosine_distance)
    clusters = kmeans.cluster(arr, assign_clusters=True)
    centers = kmeans.means()

    result = {0: [], 1: [], 2: [], 3: [], 4: [], 5: []}
    for i in range(len(clusters)):
        result[clusters[i]].append([
            nltk.cluster.util.cosine_distance(centers[clusters[i]], arr[i]),
            embedded_phrases[i]
        ])
    for k in result:
        sorted_result = sorted(result[k], reverse=True)
        final_result = '\n'.join(
            ['%.10f' % x[0] + '\t' + x[1] for x in sorted_result])
        with open('cluster' + str(k) + '.txt', 'w+') as f:
            f.write(final_result)
Example #26
    lines = open(datacfg).readlines()
    images = []
    for line in lines:
        if (line.split(' ')[0] == 'train'):
            valid_path = line.strip().split(' ')[-1]
            if (valid_path[0] != '/'):
                valid_path = workspace + valid_path
            lists = open(valid_path).readlines()
            images = [x.strip() for x in lists]

    bboxes = []
    for image in images:
        label = image.replace('.jpg', '.txt')
        lines = open(label).readlines()
        for line in lines:
            splitline = line.split(' ')
            # bboxes.append([float(x)*13. for x in splitline[-2:]])
            bboxes.append([float(splitline[-2])*1., float(splitline[-1])*1.])
    print(len(bboxes))
    # samples = random.sample(bboxes, 15000)
    # print(len(samples))
    bboxes = np.array(bboxes)
    # samples = np.array(samples)
    # print(samples.shape)

    kclusterer = KMeansClusterer(5, negIoU, repeats=1)
    # clusters = kclusterer.cluster(samples, True)
    clusters = kclusterer.cluster(bboxes, True)
    centroids = kclusterer.means()
    print(np.array(centroids) / np.array((1., 1.)))
Example #27
# create counter and idf vectors

count_vect = TfidfVectorizer(stop_words=stop_words)
count_vect.fit(df['Content'])  # 12266
X_train_counts = count_vect.transform(df['Content'])

# reduce size of vector with LSI

svd = TruncatedSVD(n_components=5)
X_train_counts = svd.fit_transform(X_train_counts)


# Clustering

kclusterer = KMeansClusterer(num_means=5, distance=cosine_distance, repeats=25, avoid_empty_clusters=True)
clusters = kclusterer.cluster(X_train_counts, assign_clusters=True)
# print "Clusters:\n " , clusters
# print "Means" , kclusterer.means()


# Prepare results Matrix

categories_map={
'Politics': 0,
'Business': 1,
'Film': 2,
'Technology': 3,
'Football': 4
}
Example #28
def main(argv):
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='input file', required=True)
    parser.add_argument('-s', '--step', help='step', required=True)
    parser.add_argument('-ik', '--init_k', help='K initial', required=True)
    parser.add_argument('-fk', '--final_k', help='K final', required=True)
    parser.add_argument('-od',
                        '--distortion_out_file',
                        help='elbow distortion graph file',
                        required=True)
    parser.add_argument('-os',
                        '--silhouette_out_file',
                        help='elbow silhoutte graph',
                        required=True)
    parser.add_argument('-pca', '--pca', help='with pca', action='store_true')
    parser.add_argument('-k_pca', '--k_pca', help='k pca')
    ARGS = parser.parse_args()

    descriptors = load_dataset(ARGS.input_file)
    if ARGS.pca:
        print("With pca")
        pca = PCA(n_components=int(ARGS.k_pca))
        descriptors = pca.fit_transform(descriptors)

    ks = []
    distortions = []
    silhouettes = []

    for k in range(int(ARGS.init_k), int(ARGS.final_k), int(ARGS.step)):
        # kmeanModel = KMeans(n_clusters=k, init='k-means++')
        # kmeanModel.fit(descriptors)
        # predictions = kmeanModel.predict(descriptors)
        # cluster_centers_ = kmeanModel.cluster_centers_

        kclusterer = KMeansClusterer(
            k, distance=nltk.cluster.util.cosine_distance)
        predictions = kclusterer.cluster(descriptors, assign_clusters=True)
        predictions = np.array(predictions)
        cluster_centers_ = np.array(kclusterer.means())

        distortion = sum(
            np.min(distance.cdist(descriptors, cluster_centers_, 'cosine'),
                   axis=1)) / descriptors.shape[0]

        silhouette_score = metrics.silhouette_score(descriptors,
                                                    predictions,
                                                    metric='cosine')

        distortions.append(distortion)
        silhouettes.append(silhouette_score)
        ks.append(k)

        print("k:", k, "distortion:", distortion, "Silhouette Coefficient",
              silhouette_score)

    # Plot the elbow with distortion
    fig = plt.figure()
    plt.plot(ks, distortions, 'bx-')
    plt.grid()
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method')
    fig.savefig(ARGS.distortion_out_file)

    # Plot the silhouette scores
    fig = plt.figure()
    plt.plot(ks, silhouettes, 'bx-')
    plt.grid()
    plt.xlabel('k')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score analysis')
    fig.savefig(ARGS.silhouette_out_file)
Example #29
    def cluster(self, docs_repr):
        kclusterer = KM(self.n_clusters, distance=cosine_distance, repeats=25,
                        avoid_empty_clusters=True)
        assigned_clusters = kclusterer.cluster(docs_repr, assign_clusters=True)
        return assigned_clusters
Example #30
# print(kmeans)

# #Plot the clusters obtained using k means
# fig = plt.figure()
# ax = fig.add_subplot(111)

# scatter = ax.scatter(big_data_copy['Accounting'],big_data_copy['3D Printing'],
#                       c=kmeans[0],s=50)

# plt.colorbar(scatter)

# this one is not working out...dataframe might not be correct format
NUM_CLUSTERS = 10
kclusterer = KMeansClusterer(NUM_CLUSTERS,
                             distance=nltk.cluster.util.cosine_distance,
                             repeats=25)
assigned_clusters = kclusterer.cluster(big_data_copy, assign_clusters=True)
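# The note above suggests the DataFrame shape/format is the problem: the NLTK
# clusterer expects an iterable of numeric vectors, so converting first may
# help (an untested guess, not a verified fix):
# assigned_clusters = kclusterer.cluster(big_data_copy.to_numpy(dtype=float),
#                                        assign_clusters=True)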
'''NEW PLAN, thanks Evan
    ONE-HOT ENCODING BUT WITH ADDED-UP VECTORS
    ex:
        Math 1 Art 2 Math 3 CS 50
    Joe     1   0       0    0
    Bob     0   0       1    0
    Smith   1   0       0    0
    Bob     0   1       0    0
    Smith   0   0       0    1

    groupByIndividual alphabetical is fine probably, just
    want them to be same name next to each other'''