def nltk_clustering(n, filename):
    global vectors
    global names
    global repeats
    # Clustering
    print("Begin clustering, n = {:d}...".format(n))

    clusterer = KMeansClusterer(n, cosine_distance, repeats=repeats)
    clustered = clusterer.cluster(vectors, assign_clusters=True, trace=False)
    clustered = np.array(clustered)

    # sort the cluster ids and reorder the names the same way, so that
    # names end up grouped by cluster
    index = sorted(clustered)
    names = list(names[clustered.argsort()])

    # write result to file
    print("Saving result to file...")
    output = filename[:-4] + "_" + str(n) + "_clustered.txt"
    with open(output, "w") as f:
        current_idx = None
        for itr, idx in zip(names, index):
            if current_idx != idx:
                current_idx = idx
                f.write("\nCluster {:d} (description: )\n".format(current_idx))
            f.write(itr + "\n")
    print("Clustered result saved in {0}".format(output))
Example #2
def clusterer_nltk_kmeans(X, n_clusters):
    # "_args": [{"type": "numpy.ndarray","dtype": "float32"} ],
    #   "_return": [{ "type": "numpy.ndarray","dtype": "int32"}

    # in this case we want to try different numbers of clusters, so it is a parameter
    import nltk
    import numpy as np
    from nltk.cluster.kmeans import KMeansClusterer
    print('clusterer_nltk_kmeans')

    clusterAlgLabelAssignmentsNK = None
    cmtVectors = X
    if type(cmtVectors) is np.ndarray and len(cmtVectors) > 0:
        dt = cmtVectors.dtype
        if dt.type is np.float32 or dt.type is np.float64:
            clusterAlgNK = KMeansClusterer(
                n_clusters,
                distance=nltk.cluster.util.cosine_distance,
                repeats=25,
                avoid_empty_clusters=True)
            clusterAlgLabelAssignmentsNK = clusterAlgNK.cluster(
                cmtVectors, assign_clusters=True)

    XY = (X, clusterAlgLabelAssignmentsNK)
    return XY
Example #3
def spectral_clustering(A, nb_clusters, laplacian_normalization = None, algo = None):
    """
    Compute the clusters assignement from spectral clustering algorithm
    steps :
    * Compute laplacian
    * Compute k smaller eigenvalues and associated eigenvectors
    * Train a kmean on this vectors
    * Apply this kmean to the Laplacian
    """
    if algo not in ['sph', None]:
        raise Exception('Algorithm {} unknown'.format(algo))

    L = get_laplacian(A, laplacian_normalization)
    L = scipy.sparse.csr_matrix(L, dtype=np.float64)
    v, w = eigsh(L, nb_clusters, which='SM')

    if algo is None:
        km = KMeans(n_clusters=nb_clusters)
        km.fit(np.transpose(w))
        clusters = km.predict(L)

    elif algo == 'sph':
        clusterer = KMeansClusterer(nb_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25)
        cluster = clusterer.cluster(np.transpose(w), True)
        vectors = [np.transpose(L[i, :].toarray()[0]) for i in range(0, L.shape[1])]
        clusters = [clusterer.classify(vector) for vector in vectors]
    return clusters
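# get_laplacian is not defined in this snippet; a minimal sketch consistent
# with the call above, assuming the usual unnormalized and symmetric
# normalized graph Laplacians:
import numpy as np
import scipy.sparse

def get_laplacian(A, normalization=None):
    """Return D - A, or I - D^{-1/2} A D^{-1/2} if normalization is set."""
    A = scipy.sparse.csr_matrix(A, dtype=np.float64)
    degrees = np.asarray(A.sum(axis=1)).flatten()
    if normalization is None:
        return scipy.sparse.diags(degrees) - A
    d_inv_sqrt = scipy.sparse.diags(1.0 / np.sqrt(np.maximum(degrees, 1e-12)))
    return scipy.sparse.identity(A.shape[0]) - d_inv_sqrt @ A @ d_inv_sqrt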
Example #4
    def __init__(self, k=7) -> None:

        self.k = k
        self.distance = nltk.cluster.cosine_distance
        self.model = KMeansClusterer(self.k,
                                     self.distance,
                                     avoid_empty_clusters=True)
Example #5
def nltk_euclidean_kmeans(encoded_img):
    from nltk.cluster.util import euclidean_distance
    from nltk.cluster.kmeans import KMeansClusterer

    kclusterer = KMeansClusterer(2, distance=euclidean_distance, repeats=10)
    assigned_clusters = kclusterer.cluster(encoded_img, assign_clusters=True)

    print_labels(assigned_clusters)
Example #6
def nltk_manhattan_kmeans(encoded_img):
    from scipy.spatial.distance import cityblock
    from nltk.cluster.kmeans import KMeansClusterer

    kclusterer = KMeansClusterer(2, distance=cityblock, repeats=10)
    assigned_clusters = kclusterer.cluster(encoded_img, assign_clusters=True)

    print_labels(assigned_clusters)
Example #7
def spherical_clustering_from_adjency(A, nb_clusters):
    """
    Spectral clustering with spherical kmeans
    """
    A = scipy.sparse.csr_matrix(A, dtype=np.float64)
    v, w = eigsh(A, nb_clusters, which='LM')
    clusterer = KMeansClusterer(nb_clusters, distance=nltk.cluster.util.cosine_distance, repeats=25)
    cluster = clusterer.cluster(np.transpose(w), True)
    vectors = [np.transpose(A[i, :].toarray()[0]) for i in range(0, A.shape[1])]
    clusters = [clusterer.classify(vector) for vector in vectors]
    return clusters
Example #8
 def Kmeans(self, volcabulary, vectors, n_cluster):
     """K-means clustering based on cosine similarity of word2vec.
     """
     kclusterer = KMeansClusterer(
         n_cluster,
         distance=nltk.cluster.util.cosine_distance,
         repeats=10,
         avoid_empty_clusters=True)
     assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
     dic = defaultdict(list)
     for c, w in zip(assigned_clusters, volcabulary):
         dic[c].append(w)
     return assigned_clusters, dic
Example #9
def new_cluster(filepath):
    NUM_CLUSTERS = 4
    data = get_data(filepath)
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 # caution: np.max(a - b) is not symmetric, so
                                 # this is not a true distance metric
                                 distance=lambda a, b: np.max(a - b),
                                 repeats=1000)
    labels = kclusterer.cluster(data, assign_clusters=True)
    print("Showing the cluster results")
    for cluster_id in range(NUM_CLUSTERS):
        for i in range(len(data)):
            if labels[i] == cluster_id:
                print("Joint : ", i + 1, " Joint Values: ", data[i],
                      " Cluster Id: ", cluster_id)
def main():
    getFiles()
    tf_idf()
    num_clusters = int(sys.argv[2])
    kclusterer = KMeansClusterer(num_clusters,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(wordvec, assign_clusters=True)
    clustersDict = {}
    for i in range(num_clusters):
        clustersDict[i] = []
    for i in range(len(assigned_clusters)):
        clustersDict[assigned_clusters[i]].append(fileList[i])
    printClustersInFormat(clustersDict)
def ClusterItems(data_file, items_bias_file, index_file, clusters_file,
                 centroids_file):

    data = np.genfromtxt(data_file)
    popular_items = np.genfromtxt(index_file).astype('int')
    data = data[popular_items]
    items_bias = np.genfromtxt(items_bias_file)
    important_items = np.where(np.abs(items_bias[popular_items]) < 0.2)[0]
    kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=cosine_distance)
    print(NUM_CLUSTERS, important_items.shape)
    print("end", data.shape)
    clusters = kclusterer.cluster(data[important_items], assign_clusters=True)
    np.savetxt(centroids_file, kclusterer.means())
    np.savetxt(clusters_file, clusters)
Example #12
def cluster(clusterType, vectors, y):
    if (clusterType == "KMeans"):
        kclusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)

    elif (clusterType == "GMM"):
        GMM = GaussianMixture(n_components=NUM_CLUSTERS)
        assigned_clusters = GMM.fit_predict(vectors)

    elif (clusterType == "SVM"):
        classifier = SVC(kernel='rbf', gamma='auto', random_state=0)
        #cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)

    elif (clusterType == "T2VH"):
        ret = hierarchical.ward_tree(vectors, n_clusters=NUM_CLUSTERS)
        children = ret[0]
        n_leaves = ret[2]
        assigned_clusters = hierarchical._hc_cut(NUM_CLUSTERS, children,
                                                 n_leaves)

    elif (clusterType == "RandomForest"):
        classifier = RandomForestClassifier()
        #cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)
        # classifier.fit(vectors, y)
        # assigned_clusters=classifier.predict(vectors)

    elif (clusterType == "DecisionTree"):
        classifier = DecisionTreeClassifier()
        #cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)
        # classifier.fit(vectors, y)
        # assigned_clusters=classifier.predict(vectors)

    elif (clusterType == "LogisticRegression"):
        classifier = sklearn.linear_model.LogisticRegression()
        #cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)
        # classifier.fit(vectors, y)
        # assigned_clusters=classifier.predict(vectors)

    else:
        print(clusterType, " is not a predefined cluster type.")
        return
    return assigned_clusters
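# cross_validation is not defined in this snippet; a minimal sketch using
# scikit-learn's cross_val_predict (an assumption about the original helper,
# which must return one prediction per sample):
from sklearn.model_selection import cross_val_predict

def cross_validation(classifier, vectors, y, folds=10):
    # out-of-fold predictions for every sample
    return cross_val_predict(classifier, vectors, y, cv=folds)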
Example #13
File: Trace2Vec.py  Project: n0mori/tcc
def cluster(folderName, vectorsize, clusterType):
    corpus = loadXES.get_doc_XES_tagged(folderName + '.xes')
    print('Data Loading finished, ', str(len(corpus)), ' traces found.')

    model = gensim.models.Doc2Vec.load('output/' + folderName + 'T2VVS' +
                                       str(vectorsize) + '.model')

    vectors = []
    NUM_CLUSTERS = 5
    print("inferring vectors")
    for doc_id in range(len(corpus)):
        inferred_vector = model.infer_vector(corpus[doc_id].words)
        vectors.append(inferred_vector)
    print("done")

    if (clusterType == "KMeans"):
        kclusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
    elif (clusterType == "HierWard"):
        ward = AgglomerativeClustering(n_clusters=NUM_CLUSTERS,
                                       linkage='ward').fit(vectors)
        assigned_clusters = ward.labels_
    elif clusterType == "OCSVM":
        ocsvm = OneClassSVM()
        assigned_clusters = ocsvm.fit_predict(vectors)

    else:
        print(
            clusterType,
            " is not a predefined cluster type. Please use 'KMeans' or 'HierWard', or create a definition for ",
            clusterType)
        return
    trace_list = loadXES.get_trace_names(folderName + ".xes")
    clusterResult = {}
    for doc_id in range(len(corpus)):
        clusterResult[trace_list[doc_id]] = assigned_clusters[doc_id]

    resultFile = open(
        'output/' + folderName + 'T2VVS' + str(vectorsize) + clusterType +
        '.csv', 'w')
    for doc_id in range(len(corpus)):
        resultFile.write(trace_list[doc_id] + ',' +
                         str(assigned_clusters[doc_id]) + "\n")

    resultFile.close()
    print("done with ", clusterType, " on event log ", folderName)
Example #14
def clustering(dataframe, repeats, myStopwords):
    num_clusters = 5
    # define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(stop_words=myStopwords)
    # Only process the content, not the title
    tfidf_matrix = tfidf_vectorizer.fit_transform(dataframe["Content"])
    # Convert it to an array
    tfidf_matrix_array = tfidf_matrix.toarray()
    # Run K-means with cosine distance as the metric
    kclusterer = KMeansClusterer(num_clusters,
                                 distance=cosine_distance,
                                 repeats=repeats)
    # Output to assigned_clusters
    assigned_clusters = kclusterer.cluster(tfidf_matrix_array,
                                           assign_clusters=True)
    # cluster_size counts how many elements each cluster contains
    cluster_size = [0, 0, 0, 0, 0]
    # Create a 5x5 array and fill it with zeros
    matrix = [[0 for x in range(5)] for y in range(5)]
    # For every category
    for category in categories:
        # For every article
        for row in range(0, len(assigned_clusters)):
            # Compare the cluster number with the category number
            if assigned_clusters[row] == categories.index(category):
                ind = categories.index(dataframe.iloc[row, 4])
                matrix[categories.index(category)][ind] += 1
    # Count how many elements each cluster contains
    for row in range(0, len(assigned_clusters)):
        cluster_size[assigned_clusters[row]] += 1
    for x in range(5):
        for y in range(5):
            # Calculate frequency
            matrix[x][y] /= cluster_size[x]
            # Only keep the 2 first decimal digits
            matrix[x][y] = format(matrix[x][y], '.2f')
    # Output to a .csv file
    out_file = open("output/clustering_KMeans.csv", 'w')
    wr = csv.writer(out_file, delimiter="\t")
    # copy the list so the global categories are not mutated
    newCategories = ["\t"] + categories
    wr.writerow(newCategories)
    for x in range(5):
        newMatrix = matrix[x]
        clusterName = "Cluster " + str(x + 1)
        newMatrix.insert(0, clusterName)
        wr.writerow(newMatrix)
    out_file.close()
 def __init__(self, users_vecs_train_file, centroid_file, clustering_file,
              num_clusters, n_iteration, mode):
     self.mode = mode
     self.num_clusters = num_clusters
     self.users = np.genfromtxt(users_vecs_train_file)
     self.tree = LasyTree(np.arange(self.users.shape[0]))
     self.centroids = np.genfromtxt(centroid_file)
     clusters_ = np.genfromtxt(clustering_file).astype('int')
     self.clusters = {}
     for i in range(num_clusters):
         self.clusters[i] = []
     for i in range(len(clusters_)):
         self.clusters[clusters_[i]].append(i)
     self.n_iteration = n_iteration
     self.kclusterer = KMeansClusterer(num_clusters,
                                       distance=cosine_distance,
                                       initial_means=list(self.centroids))
Example #16
def recluster(df, cl, clusters, n_clusters):
    lbls = cl.labels_
    mask = np.array([False for i in range(len(lbls))])
    for c in clusters:
        mask |= lbls==c
    subpipe, results = data_pipeline(df[mask])
    
    ##use cosine similarity! NLTK clustering implementation
    #KMeans cluster object as carrier for consistency
    subcl = cluster(results, n_clusters)
    kclusterer = KMeansClusterer(n_clusters, distance=nltk.cluster.util.cosine_distance, repeats=50)
    assigned_clusters = kclusterer.cluster(results, assign_clusters=True)
    #assign new cluster labels and cluster centroids
    subcl.labels_ = np.array(assigned_clusters)
    subcl.cluster_centers_ = np.array(kclusterer.means())
    
    return subpipe, subcl, results, df[mask]
Example #17
    def cluster_docs(self):

        vectors = []
        used_lines = []

        for doc, id in self.es_docs():
            tokens = text_cleaner.clean_tokens(doc)
            if tokens != 'NC' and len(tokens) > 200:
                used_lines.append(tokens)
                vectors.append(self.model.infer_vector(tokens))

        kclusterer = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)

        print("done")
Example #18
def get_cluster(tfidf_arr, k):
    """
    K-means聚类
    :param tfidf_arr:
    :param k:
    :return:
    """
    kmeans = KMeansClusterer(num_means=k,
                             distance=cosine_distance,
                             avoid_empty_clusters=True)  # 分成k类,使用余弦相似分析
    kmeans.cluster(tfidf_arr)

    # get the cluster assignment for each vector
    kinds = pd.Series([kmeans.classify(i) for i in tfidf_arr])
    fw = open('/you_filed_algos/prod_kudu_data/ClusterText.txt',
              'a+',
              encoding='utf-8')
    for i, v in kinds.items():
        fw.write(str(i) + '\t' + str(v) + '\n')
    fw.close()
Example #19
	def __init__(self, model):
		"""
		@param model: (type=Word2Vec model)
		"""
		self.model = model #store the Word2Vec model object in case of future use
		self.word_to_vec = {word:model.wv[word] for word in model.wv.vocab} #mapping from word strings to vectors
		self.vectors = [model.wv[word] for word in model.wv.vocab] 
		clusterer = KMeansClusterer(num_means=5, distance=cosine_distance) #the object that will cluster our vectors, num_means will eventually be parameterized
		clusterer.cluster_vectorspace(self.vectors)
		self.central_words = []

		#find closest words to centroids
		for centroid in clusterer._means:
			closest = None
			for word in self.word_to_vec:
				vector = self.word_to_vec[word]
				if not closest or (cosine_distance(vector, centroid) < cosine_distance(closest[1], centroid)):
					closest = (word, self.word_to_vec[word])
			self.central_words.append(closest)
		self.centroids = clusterer._means
Example #20
def bbox_iou(x1, y1, w1, h1, x2, y2, w2, h2):
    # intersection corners: max of the left/top edges, min of the right/bottom edges
    x_a = torch.max(x1 - w1 / 2.0, x2 - w2 / 2.0)
    y_a = torch.max(y1 - h1 / 2.0, y2 - h2 / 2.0)
    x_b = torch.min(x1 + w1 / 2.0, x2 + w2 / 2.0)
    y_b = torch.min(y1 + h1 / 2.0, y2 + h2 / 2.0)

    intersection = torch.clamp(x_b - x_a, min=0) * torch.clamp(y_b - y_a,
                                                               min=0)
    union = w1 * h1 + w2 * h2 - intersection

    return intersection / (union + 1e-6)
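

# A quick sanity check for bbox_iou on center-format boxes (values made up):
# identical boxes give IoU ~ 1, disjoint boxes give IoU 0.
import torch

x = torch.tensor([0.5]); y = torch.tensor([0.5])
w = torch.tensor([0.2]); h = torch.tensor([0.2])
print(bbox_iou(x, y, w, h, x, y, w, h))              # ~1.0
print(bbox_iou(x, y, w, h, x + 1.0, y + 1.0, w, h))  # 0.0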


kclusterer = KMeansClusterer(args.num_bbox,
                             distance=nltk.cluster.util.cosine_distance,
                             repeats=25)
assigned_clusters = kclusterer.cluster(data, assign_clusters=True)

kmeans_wh = KMeans(n_clusters=args.num_bbox)
kmeans_wh.fit(train_wh)
bbox_priors = kmeans_wh.cluster_centers_
np.save('priors.npy', bbox_priors)
bbox_priors = torch.from_numpy(bbox_priors).cuda()

# Set up the network

features = DenseNet(growth_rate=8,
                    block_config=(4, 8, 16, 32),
                    activation=nn.LeakyReLU(inplace=True),
                    input_channels=3)
Example #21
        phrase = ii[1]
        if score < 0.7:
            break
        try:
            arr = numpy.append(arr,
                               numpy.reshape(model.wv.word_vec(phrase),
                                             (1, 100)),
                               axis=0)
        except KeyError:
            pass
        else:
            embedded_phrases.append(phrase)

    print('number of sample points:', len(embedded_phrases))

    kmeans = KMeansClusterer(6, nltk.cluster.util.cosine_distance)
    clusters = kmeans.cluster(arr, assign_clusters=True)
    centers = kmeans.means()

    result = {0: [], 1: [], 2: [], 3: [], 4: [], 5: []}
    for i in range(len(clusters)):
        result[clusters[i]].append([
            nltk.cluster.util.cosine_distance(centers[clusters[i]], arr[i]),
            embedded_phrases[i]
        ])
    for k in result:
        sorted_result = sorted(result[k], reverse=True)
        final_result = '\n'.join(
            ['%.10f' % x[0] + '\t' + x[1] for x in sorted_result])
        with open('cluster' + str(k) + '.txt', 'w+') as f:
            f.write(final_result)
Example #22
    lines = open(datacfg).readlines()
    images = []
    for line in lines:
        if (line.split(' ')[0] == 'train'):
            valid_path = line.strip().split(' ')[-1]
            if (valid_path[0] != '/'):
                valid_path = workspace + valid_path
            lists = open(valid_path).readlines()
            images = [x.strip() for x in lists]

    bboxes = []
    for image in images:
        label = image.replace('.jpg', '.txt')
        lines = open(label).readlines()
        for line in lines:
            splitline = line.split(' ')
            # bboxes.append([float(x)*13. for x in splitline[-2:]])
            bboxes.append([float(splitline[-2])*1., float(splitline[-1])*1.])
    print(len(bboxes))
    # samples = random.sample(bboxes, 15000)
    # print(len(samples))
    bboxes = np.array(bboxes)
    # samples = np.array(samples)
    # print(samples.shape)

    kclusterer = KMeansClusterer(5, negIoU, repeats=1)
    # clusters = kclusterer.cluster(samples, True)
    clusters = kclusterer.cluster(bboxes, True)
    centroids = kclusterer.means()
    print(np.array(centroids) / np.array((1., 1.)))
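# negIoU is not defined in this snippet; a plausible sketch in the YOLO
# anchor-clustering style, assuming each sample is a (width, height) pair
# and using 1 - IoU of corner-aligned boxes as the distance:
def negIoU(u, v):
    inter = min(u[0], v[0]) * min(u[1], v[1])   # overlap of origin-aligned boxes
    union = u[0] * u[1] + v[0] * v[1] - inter
    return 1.0 - inter / union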
def k_means_cluster():
    print("K means")

    # file = open("Subtree/sports.team.pro_athlete.txt","r")
    # op_pos = open("K_means/sports.team.pro_athlete_pos.txt","w")
    # op_neg = open("K_means/sports.team.pro_athlete_neg.txt","w")

    file = open("Test/test_subtree.txt", "r")
    op_pos = open("Test/test_sub_pos.txt", "w")
    op_neg = open("Test/test_sub_neg.txt", "w")

    # Group the input lines into blocks separated by blank lines.
    new_list = file.readlines()
    new = []
    filtered_list = []
    temp_list = []

    for item in new_list:
        new.append(item)
        if item == "\n":
            temp_list.append(new)
            new = []

    # Discard blocks that contain only the blank separator line.
    for item in temp_list:
        if len(item) > 1:
            filtered_list.append(item)

    data_matrix = random_sample.data_mat()
    new = filtered_list
    NUM_CLUSTERS = 5
    print("Start assigning clusters")
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=100,
                                 avoid_empty_clusters=True)
    assigned_clusters = kclusterer.cluster(data_matrix, assign_clusters=True)
    #print("Assigned clusters ",assigned_clusters)

    # Count the members of each cluster.
    counts = [assigned_clusters.count(c) for c in range(NUM_CLUSTERS)]

    # The largest and second-largest clusters are written to the positive
    # file, all remaining clusters to the negative file.
    print("Cluster grouping")
    largest = counts.index(max(counts))
    second = max((c for c in range(NUM_CLUSTERS) if c != largest),
                 key=lambda c: counts[c])

    for item in range(len(assigned_clusters)):
        if assigned_clusters[item] in (largest, second):
            print(new[item], file=op_pos)
        else:
            print(new[item], file=op_neg)

    file.close()
    op_pos.close()
    op_neg.close()
# In[64]:

# Part 1 - Compute Kmeans using Cosine distance with 5 clusters

# Use KMeansClusterer from the Natural Language Toolkit package, as it allows cosine distance to be used as the distance measure

# Import packages from NLTK

from nltk.cluster.kmeans import KMeansClusterer
from nltk.cluster.util import cosine_distance

# In[65]:

# Perform clustering using Cosine distance
km_cos = KMeansClusterer(5,
                         distance=cosine_distance,
                         avoid_empty_clusters=True)
km_cos_cl = km_cos.cluster(X, assign_clusters=True)

# In[103]:

# Part 2 - Calculate Adjusted Rand Score and MI Score
print("Adjusted Rand Score (Cosine Distance): " +
      str(met.adjusted_rand_score(true_Labels, km_cos_cl)))
print("\nAdjusted Mutual Information Score (Cosine Distance): " +
      str(met.adjusted_mutual_info_score(true_Labels, km_cos_cl)))

# Clustering using Cosine distance has resulted in far better results on both the Adjusted Rand and Adjusted MI scores.  As with Euclidean distance, attempting clustering after performing PCA resulted in reduced scores on both indices (see Appendix).

# In[222]:
Example #25
#使用夹角余弦距离进行k均值聚类

articals = []
for cutword in Red_df.cutword:
    articals.append(" ".join(cutword))
## Build the corpus and compute the document-term TF-IDF matrix
vectorizer = CountVectorizer()
transformer = TfidfVectorizer()
tfidf = transformer.fit_transform(articals)

## tfidf is stored as a sparse matrix; convert it to a dense array (document-term matrix)
dtm = tfidf.toarray()

## k-means clustering using cosine distance
kmeans = KMeansClusterer(num_means=2,       # number of clusters
                         distance=nltk.cluster.util.cosine_distance,  # cosine distance
                         )
kmeans.cluster(dtm)

## labels produced by the clustering
labpre = [kmeans.classify(i) for i in dtm]
kmeanlab = Red_df[["ChapName","Chapter"]]
kmeanlab["cosd_pre"] = labpre
kmeanlab


## count how many chapters fall into each cluster
count = kmeanlab.groupby("cosd_pre").count()

## visualize the cluster sizes
count.plot(kind="barh",figsize=(6,5))
Example #26
def manhattan_distance(u, v):
    diff = u - v
    return np.sum(np.abs(diff))


def minkowski3_distance(u, v):
    #order set to 3.0 in this example
    return distance.minkowski(u, v, 3.0)


#Storing the best distances along with their purity results
best = ''
highest = 0

kclusterer = KMeansClusterer(k,
                             distance=nltk.cluster.util.euclidean_distance,
                             repeats=25)
assigned_clusters = kclusterer.cluster(data, assign_clusters=True)
purity = purity_score(classMatrix, assigned_clusters)
if highest < purity:
    highest = purity
    best = 'euclidean distance'
print(
    "Purity score of kmeans clustering based on euclidean distance on the given dataset is: "
    + str(purity))

kclusterer = KMeansClusterer(k,
                             distance=nltk.cluster.util.cosine_distance,
                             repeats=25)
assigned_clusters = kclusterer.cluster(data, assign_clusters=True)
purity = purity_score(classMatrix, assigned_clusters)
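
# The custom distances defined above can be dropped into the same loop; a
# sketch for the Manhattan and Minkowski variants (k, data, classMatrix and
# purity_score as above):
for name, dist in [('manhattan distance', manhattan_distance),
                   ('minkowski3 distance', minkowski3_distance)]:
    kclusterer = KMeansClusterer(k, distance=dist, repeats=25)
    assigned_clusters = kclusterer.cluster(data, assign_clusters=True)
    purity = purity_score(classMatrix, assigned_clusters)
    if highest < purity:
        highest = purity
        best = name
    print("Purity score of kmeans clustering based on " + name +
          " on the given dataset is: " + str(purity))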
Example #27
# print(kmeans)

# #Plot the clusters obtained using k means
# fig = plt.figure()
# ax = fig.add_subplot(111)

# scatter = ax.scatter(big_data_copy['Accounting'],big_data_copy['3D Printing'],
#                       c=kmeans[0],s=50)

# plt.colorbar(scatter)

# KMeansClusterer expects an iterable of numpy vectors, so convert the
# dataframe to an array first (iterating a DataFrame yields column names)
NUM_CLUSTERS = 10
kclusterer = KMeansClusterer(NUM_CLUSTERS,
                             distance=nltk.cluster.util.cosine_distance,
                             repeats=25)
assigned_clusters = kclusterer.cluster(big_data_copy.to_numpy(),
                                       assign_clusters=True)
'''NEW PLAN, thanks Evan
    ONE-HOT ENCODING BUT WITH ADDED-UP VECTORS
    ex:
        Math 1 Art 2 Math 3 CS 50
    Joe     1   0       0    0
    Bob     0   0       1    0
    Smith   1   0       0    0
    Bob     0   1       0    0
    Smith   0   0       0    1

    groupByIndividual alphabetical is fine probably, just
    want them to be same name next to each other
'''
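# A minimal sketch of the plan in the docstring above, assuming a dataframe
# with hypothetical 'name' and 'course' columns:
import pandas as pd

df = pd.DataFrame({'name': ['Joe', 'Bob', 'Smith', 'Bob', 'Smith'],
                   'course': ['Math 1', 'Math 3', 'Math 1', 'Art 2', 'CS 50']})

# one-hot encode the courses, then sum the rows per individual
one_hot = pd.get_dummies(df['course'])
summed = one_hot.groupby(df['name']).sum().sort_index()
print(summed)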
def main(argv):
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='input file', required=True)
    parser.add_argument('-s', '--step', help='step', required=True)
    parser.add_argument('-ik', '--init_k', help='K initial', required=True)
    parser.add_argument('-fk', '--final_k', help='K final', required=True)
    parser.add_argument('-od',
                        '--distortion_out_file',
                        help='elbow distortion graph file',
                        required=True)
    parser.add_argument('-os',
                        '--silhouette_out_file',
                        help='elbow silhouette graph',
                        required=True)
    parser.add_argument('-pca', '--pca', help='with pca', action='store_true')
    parser.add_argument('-k_pca', '--k_pca', help='k pca')
    ARGS = parser.parse_args()

    descriptors = load_dataset(ARGS.input_file)
    if ARGS.pca:
        print("With pca")
        pca = PCA(n_components=int(ARGS.k_pca))
        descriptors = pca.fit_transform(descriptors)

    ks = []
    distortions = []
    silhouettes = []

    for k in range(int(ARGS.init_k), int(ARGS.final_k), int(ARGS.step)):
        # kmeanModel = KMeans(n_clusters=k, init='k-means++')
        # kmeanModel.fit(descriptors)
        # predictions = kmeanModel.predict(descriptors)
        # cluster_centers_ = kmeanModel.cluster_centers_

        kclusterer = KMeansClusterer(
            k, distance=nltk.cluster.util.cosine_distance)
        predictions = kclusterer.cluster(descriptors, assign_clusters=True)
        predictions = np.array(predictions)
        cluster_centers_ = np.array(kclusterer.means())

        distortion = sum(
            np.min(distance.cdist(descriptors, cluster_centers_, 'cosine'),
                   axis=1)) / descriptors.shape[0]

        silhouette_score = metrics.silhouette_score(descriptors,
                                                    predictions,
                                                    metric='cosine')

        distortions.append(distortion)
        silhouettes.append(silhouette_score)
        ks.append(k)

        print("k:", k, "distortion:", distortion, "Silhouette Coefficient",
              silhouette_score)

    # Plot the elbow with distortion
    fig = plt.figure()
    plt.plot(ks, distortions, 'bx-')
    plt.grid()
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method')
    fig.savefig(ARGS.distortion_out_file)

    # Plot the silhouette scores
    fig = plt.figure()
    plt.plot(ks, silhouettes, 'bx-')
    plt.grid()
    plt.xlabel('k')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score analysis')
    fig.savefig(ARGS.silhouette_out_file)
Example #29
# create TF-IDF vectors

count_vect = TfidfVectorizer(stop_words=stop_words)
count_vect.fit(df['Content'])  # 12266
X_train_counts = count_vect.transform(df['Content'])

# reduce size of vector with LSI

svd = TruncatedSVD(n_components=5)
X_train_counts = svd.fit_transform(X_train_counts)


# Clustering

kclusterer = KMeansClusterer(num_means=5, distance=cosine_distance, repeats=25, avoid_empty_clusters=True)
clusters = kclusterer.cluster(X_train_counts, assign_clusters=True)
# print "Clusters:\n " , clusters
# print "Means" , kclusterer.means()


# Prepare results Matrix

categories_map={
'Politics': 0,
'Business': 1,
'Film': 2,
'Technology': 3,
'Football': 4
}
Example #30
 def __init__(self,k):
     self.k = k
     self.model = KMeansClusterer(k, distance=nltk.cluster.util.cosine_distance, repeats=25)