def calc_anchors():
    """
    计算anchors
    :return:
    """
    ratios = []
    sizes = []
    bbx = []
    root = '/media/hvt/95f846d8-d39c-4a04-8b28-030feb1957c6/dataset/充电宝/遮挡问题/core'
    for label_file_path in sorted(os.listdir(osp.join(root, 'Annotation'))):
        labels = pd.read_csv(osp.join(root, 'Annotation', label_file_path), delimiter=' ', header=None).values
        for label in labels:
            _, name, xmin, ymin, xmax, ymax = label
            if name not in label_dict or ymax - ymin < 2:
                # print(label_file_path, label)
                continue

            ratios.append((xmax - xmin) / (ymax - ymin))
            sizes.append([xmax - xmin, ymax - ymin])
            bbx.append([xmin, ymin, xmax, ymax])
    # Cluster anchor sizes with k-means
    cluster = KMeans(n_clusters=3).fit(sizes)
    # Rescale centers between the two working resolutions (2000 -> 1333)
    print(cluster.cluster_centers_ * (1333 / 2000))
    cluster = KMeans(n_clusters=2).fit(np.array(ratios).reshape(-1, 1))
    print(cluster.cluster_centers_)

    plt.hist(ratios, bins=10)
    plt.show()
    plt.hist2d(np.array(sizes)[:, 0], np.array(sizes)[:, 1], bins=20)
    plt.show()
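A minimal, self-contained sketch of the same anchor-estimation idea: the dataset path and label_dict above are not available here, so synthetic boxes stand in for the annotations.

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
widths = rng.uniform(20, 200, size=500)
heights = widths * rng.uniform(0.5, 2.0, size=500)
sizes = np.stack([widths, heights], axis=1)  # (w, h) pairs, like `sizes` above

size_centers = KMeans(n_clusters=3, n_init=10).fit(sizes).cluster_centers_
ratio_centers = KMeans(n_clusters=2, n_init=10).fit(
    (widths / heights).reshape(-1, 1)).cluster_centers_
print(size_centers)
print(ratio_centers.ravel())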
Example #2
def train_k_means_by_step(n_clusters, init_cluster_centers, x_array, eps):
    # eps = 1e-4
    # eps = 0.1
    # eps = 100.0
    # prev_sample = np.array(clf.cluster_centers_, np.float)
    prev_centers = init_cluster_centers
    clf = KMeans(init=prev_centers,
                 n_clusters=n_clusters,
                 n_init=1,
                 tol=eps,
                 max_iter=1)
    # if isinstance(prev_centers, str):
    #     prev_centers = clf.cluster_centers_
    clf.fit(x_array)
    new_centers = clf.cluster_centers_

    centers_list = [prev_centers, new_centers]
    args = [1]
    values = [clf.inertia_]
    while get_distance(prev_centers, new_centers) > eps:
        prev_centers = new_centers
        clf = KMeans(init=prev_centers,
                     n_clusters=n_clusters,
                     n_init=1,
                     tol=eps,
                     max_iter=1).fit(x_array)
        new_centers = clf.cluster_centers_
        args.append(len(args) + 1)
        values.append(clf.inertia_)
        centers_list.append(new_centers)
    # print "k = %s, len centers = %s" % (n_clusters, len(f_values))
    return args, values, centers_list
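train_k_means_by_step drives KMeans one iteration at a time (max_iter=1, warm-started via init=prev_centers) until the centers stop moving. It relies on a get_distance helper that is not shown; a plausible stand-in plus a usage sketch (these names are illustrative, not from the original):

import numpy as np

def get_distance(a, b):
    # Hypothetical stand-in for the missing helper: total Euclidean
    # movement of the centers between two consecutive iterations.
    return np.linalg.norm(np.asarray(a) - np.asarray(b))

x = np.random.rand(300, 2)
init = x[:3].copy()  # three arbitrary starting centers
args, values, centers_list = train_k_means_by_step(3, init, x, eps=1e-4)
print(len(args), values[-1])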
Example #3
def plot_network(n_clusters, subset_job, subset_edu, no_jobs, no_edu):
    plt.figure(figsize=(6, 8))
    job_kmeans = KMeans(n_clusters=n_clusters)
    job_predict = job_kmeans.fit_predict(subset_job)
    empl_edu_kmean = KMeans(n_clusters=n_clusters)
    empl_predict = empl_edu_kmean.fit_predict(subset_edu)

    cluster_sum_jobs, cluster_sum_employ_edu = [], []

    for i in range(n_clusters):
        cluster_sum_employ_edu.append(
            sum_cluster(empl_predict, i, no_edu) / sum(no_edu))
        cluster_sum_jobs.append(
            sum_cluster(job_predict, i, no_jobs) / sum(no_jobs))

    jobs_centres = job_kmeans.cluster_centers_
    emp_edu_centres = empl_edu_kmean.cluster_centers_
    result, all_coords = min_span_tree(jobs_centres, emp_edu_centres,
                                       cluster_sum_jobs,
                                       cluster_sum_employ_edu)
    city_labels()
    plot_california_counties()
    plot_california()
    for i in range(len(result)):
        for j in range(len(result[i])):
            if result[i][j] == 0:  # NO LINK
                continue

            plt.scatter(jobs_centres[i][0] if i < n_clusters else
                        emp_edu_centres[i - n_clusters][0],
                        jobs_centres[i][1] if i < n_clusters else
                        emp_edu_centres[i - n_clusters][1],
                        edgecolors='b',
                        facecolors='none')
            plt.scatter(jobs_centres[j][0] if j < n_clusters else
                        emp_edu_centres[j - n_clusters][0],
                        jobs_centres[j][1] if j < n_clusters else
                        emp_edu_centres[j - n_clusters][1],
                        edgecolors='b',
                        facecolors='none')
            plt.plot(
                (jobs_centres[i][0] if i < n_clusters else
                 emp_edu_centres[i - n_clusters][0], jobs_centres[j][0]
                 if j < n_clusters else emp_edu_centres[j - n_clusters][0]),
                (jobs_centres[i][1] if i < n_clusters else
                 emp_edu_centres[i - n_clusters][1], jobs_centres[j][1] if
                 j < n_clusters else emp_edu_centres[j - n_clusters][1]), 'b-')

    plt.show()
Example #4
def plot_employment_edu_cluster(n_clusters, no_edu, subset_edu, kmeans=None):
    if kmeans is None:
        kmeans = KMeans(n_clusters=n_clusters)
    empl_predict = kmeans.fit_predict(subset_edu)
    plot_california()
    plot_california_counties()
    for i in range(n_clusters):
        mean_employment_score = np.mean(
            [no_edu[j] for j in range(len(no_edu)) if empl_predict[j] == i])
        plt.scatter([
            subset_edu[j][0]
            for j in range(len(subset_edu)) if empl_predict[j] == i
        ], [
            subset_edu[j][1]
            for j in range(len(subset_edu)) if empl_predict[j] == i
        ],
                    label=f"Mean Employment Score:{mean_employment_score:.5f}",
                    s=4.5)
    plt.legend()
    plt.gca().set_xlabel("Longitude")
    plt.gca().set_ylabel("Latitude")
    plt.xlim((-120, -116))
    plt.ylim((33, 35))
    plt.axis('equal')
    plt.show()
Example #5
def initial_kmeans(k, rand_state, data, reallabels):
    min_clusters, max_clusters = k_range(k)  # range of cluster counts derived from the number of true class labels
    bestAri_arr = []  # best ARI value for each cluster count
    # bestCr_arr = []  # best CR value for each cluster count
    kmeans_labels = []  # best partition found for the current cluster count
    kmeans_labels_arr = []  # best partitions across all cluster counts
    for clusters in range(min_clusters, max_clusters):
        bestAri = 0  # best ARI seen for this cluster count
        # bestCr = -1
        for i in range(ini_generation):
            # NOTE: a fixed random_state makes every restart identical;
            # vary the seed per iteration if distinct restarts are intended.
            y_kmeans = KMeans(n_clusters=clusters,
                              random_state=rand_state).fit_predict(data)
            kmeans_ari = adjusted_rand_score(reallabels, y_kmeans)
            # kmeans_cr = corrected_rand(reallabels, y_kmeans)
            if kmeans_ari > bestAri:
                bestAri = kmeans_ari
                kmeans_labels = y_kmeans
            # if kmeans_cr > bestCr:
            #     bestCr = kmeans_cr
        # bestCr_arr.append(bestCr)
        bestAri_arr.append(bestAri)
        ind_kmeans = creator.Individual(kmeans_labels)
        kmeans_labels_arr.append(ind_kmeans)
    # print('best CR of kmeans: %s' % bestCr_arr)
    return kmeans_labels_arr, bestAri_arr
Example #6
    def k_means_clustering(self, matrix):
        for k in range(3, 10):
            km = KMeans(n_clusters=k)
            self.cluster_number.append(
                [k, silhouette_score(matrix, km.fit_predict(matrix))])

        return self
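The silhouette scores collected above make picking k a one-liner, e.g. (assuming `obj` is the instance after calling obj.k_means_clustering(matrix)):

best_k = max(obj.cluster_number, key=lambda pair: pair[1])[0]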
Example #7
def rsnn(sampledData, remainedData, sampledIndex, remainedIndex, singleName):
    predicted_labelAll = []
    for i in range(len(sampledData)):
        # clusters = random.randint(min_clusters,max_clusters)
        clusters = random.randint(2, 11)
        # clusters = random.randint(2, 11)  # randint is inclusive: range [2, 11]
        if singleName == 'kmeans':
            predicted_label = KMeans(n_clusters=clusters).fit_predict(
                sampledData[i])
        elif singleName in ('ward', 'complete', 'average'):
            predicted_label = AgglomerativeClustering(
                linkage=singleName,
                n_clusters=clusters).fit_predict(sampledData[i])

        predicted_labelAll.append(predicted_label.tolist())  # predicted labels for each sampled dataset

    assinALLNnLabels = []  # all labels assigned via nearest neighbours

    # remainedData and sampledData have the same number of rows, so j may range over either
    for j in range(len(remainedData)):
        assinNnLabels = []  # labels assigned via nearest neighbour
        for m in range(len(remainedData[j])):
            minDist = inf
            minindex = -1
            for k in range(len(sampledData[j])):
                distJI = distEclud(remainedData[j][m], sampledData[j][k])
                if distJI < minDist:
                    minDist = distJI
                    minindex = k
            assinNnLabels.append(
                predicted_labelAll[j][minindex])  # label of the nearest sampled point
        assinALLNnLabels.append(assinNnLabels)

    # combine the two label lists and the two index lists
    combineIndex = []
    combinedLables = []
    for column in range(len(predicted_labelAll)):
        combineIndexOne = sampledIndex[column] + remainedIndex[column]
        combinedLablesOne = predicted_labelAll[column] + assinALLNnLabels[
            column]
        combineIndex.append(combineIndexOne)
        combinedLables.append(combinedLablesOne)
    # sort the shuffled indices back into ascending order
    seqIndexAll = []
    for combineIndex1 in combineIndex:
        seqIndex = []
        for seq in range(len(sampledData[0]) + len(remainedData[0])):
            for elementIndex in range(len(combineIndex1)):
                if combineIndex1[elementIndex] == seq:
                    seqIndex.append(elementIndex)
        seqIndexAll.append(seqIndex)

    # recover the combined sampledData + remainedData labels in original order
    finalLabel = []
    for finalIndex in range(len(combinedLables)):
        finallabelone = []
        for index in seqIndexAll[finalIndex]:
            finallabelone.append(combinedLables[finalIndex][index])
        finalLabel.append(finallabelone)  # final clustering result
    return finalLabel
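The nearest-neighbour assignment loop in rsnn (and in fsrsnn below) is O(n*m) in pure Python; a sketch of the same assignment vectorized with scikit-learn, assuming each sampledData[j] / remainedData[j] is a 2-D array (assign_by_nearest is illustrative, not from the original):

import numpy as np
from sklearn.neighbors import NearestNeighbors

def assign_by_nearest(sampled, remained, sampled_labels):
    # For each remaining point, copy the label of its nearest sampled point.
    nn = NearestNeighbors(n_neighbors=1).fit(np.asarray(sampled))
    _, idx = nn.kneighbors(np.asarray(remained))
    return [sampled_labels[i] for i in idx.ravel()]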
Example #8
def answer(test_path):

    import warnings
    warnings.filterwarnings("ignore")

    import time
    t0 = time.time()

    from learning import process_test_data, training_data, training_answers
    from sklearn.cluster import KMeans
    from sklearn.linear_model import LogisticRegression

    test_data = process_test_data(test_path)

    km = KMeans()
    km.fit(training_data, training_answers)  # y is accepted but ignored; KMeans is unsupervised

    myNum = km.predict(test_data).item()

    numX = [1, 2, 4, 2, 7, 0, 2, 7, 4, 3, 2, 1, 4, 5, 5, 1, 3, 0, 4, 2]
    numbers = [[num] for num in numX]
    letX = [
        'a', 'a', 'o', 'a', 'o', 'o', 'a', 'a', 'o', 'a', 'a', 'o', 'a', 'o',
        'o', 'o', 'a', 'a', 'o', 'a'
    ]
    letters = [[letter] for letter in letX]

    lr = LogisticRegression()
    lr.fit(numbers, letters)

    ans = lr.predict([[myNum]]).item()  # predict expects a 2-D array

    t1 = time.time()
    return [ans, t1 - t0]
예제 #9
0
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    data = np.loadtxt(args.data_points)

    if args.root is not None:
        data = np.sqrt(data)

    (k, initial_points) = get_initial_centers(args.clusters, args.start_points)

    log.info('calculate center points')
    kmeans = KMeans(n_clusters=k, init=initial_points, n_init=1,
                    max_iter=args.max_iter, copy_x=False)
    predict = kmeans.fit_predict(data)

    log.info('storing results')

    if args.model:
        save_object_to_file(kmeans, args.model)

    with utf8_file_open(args.outfile, 'w') as outfile:

        for i in range(predict.shape[0]):
            outfile.write(u'%d\n' % predict[i])

    if args.centroids:
        np.savetxt(args.centroids, kmeans.cluster_centers_)

    log.info('finished')
Example #10
def test_KMeansConstrained_parity_digits():

    iris = datasets.load_iris()
    X = iris.data

    k = 8
    random_state = 1
    size_min, size_max = None, None  # no size constraints, so the result should match plain KMeans


    clf_constrained = KMeansConstrained(
        n_clusters=k,
        size_min=size_min,
        size_max=size_max,
        random_state=random_state
    )
    y_constrained = clf_constrained.fit_predict(X)

    clf_kmeans = KMeans(
        n_clusters=k,
        random_state=random_state
    )
    y_kmeans = clf_kmeans.fit_predict(X)

    assert_array_equal(y_constrained, y_kmeans)
    assert_almost_equal(clf_constrained.cluster_centers_, clf_kmeans.cluster_centers_)
    assert_almost_equal(clf_constrained.inertia_, clf_kmeans.inertia_)
Example #11
def k_way_spectral_clustering():
    x = np.load('q2data.npy')
    A = np.load('AMatrix.npy')
    WeightMatrix = np.zeros((16, 16))
    for i in range(16):
        for j in range(16):
            if A[i][j] == 1:
                WeightMatrix[i][j] = np.exp(-1 * ((np.linalg.norm(x[i] - x[j]) ** 2)))
            else:
                WeightMatrix[i][j] = 0

    DegreeMatrix = np.sum(WeightMatrix, axis=1)
    L = np.diag(DegreeMatrix) - WeightMatrix  # unnormalized Laplacian D - W
    DSquareRoot = np.diag(1.0 / (DegreeMatrix ** 0.5))
    Lnorm = np.dot(np.dot(DSquareRoot, L), DSquareRoot)  # D^-1/2 L D^-1/2

    eigvals, eigvecs = np.linalg.eig(Lnorm)
    eigvecs = np.array(eigvecs, dtype=np.float64)
    sortedinds = eigvals.argsort()  # NOTE: computed but never used below
    # NOTE: these four eigenvectors are also unused; KMeans below clusters
    # the rows of the full eigenvector matrix.
    eigvec1, eigvec2, eigvec3, eigvec4 = eigvecs[:, 10], eigvecs[:, 11], eigvecs[:, 13], eigvecs[:, 14]

    kmeans = KMeans(n_clusters=3, init='random')
    kmeans.fit(eigvecs)
    components = kmeans.labels_
    return components
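For reference, the textbook k-way recipe embeds each node with the k eigenvectors of smallest eigenvalue and clusters the rows; a minimal sketch, not the author's code, assuming a symmetric weight matrix with no zero-degree nodes:

import numpy as np
from sklearn.cluster import KMeans

def spectral_k_way(W, k):
    # Normalized Laplacian: L_sym = D^{-1/2} (D - W) D^{-1/2}
    d = W.sum(axis=1)
    d_inv_sqrt = np.diag(1.0 / np.sqrt(d))
    L_sym = d_inv_sqrt @ (np.diag(d) - W) @ d_inv_sqrt
    # eigh returns eigenvalues in ascending order for symmetric matrices
    eigvals, eigvecs = np.linalg.eigh(L_sym)
    embedding = eigvecs[:, :k]  # rows are the spectral embedding of each node
    return KMeans(n_clusters=k, n_init=10).fit_predict(embedding)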
Example #12
def train_kNN_after_kMeans(n_clusters, train_x_array, eps, predict_x_array):
    k_means = KMeans(init="random",
                     n_clusters=n_clusters,
                     n_init=1,
                     tol=eps).fit(train_x_array)
    # clf.cluster_centers_
    # clf.fit(X, y)
    iter_i = [
        k_means.cluster_centers_[j].reshape((28, 28))
        for j in range(n_clusters)
    ]
    picture = np.column_stack(iter_i)
    plt.imshow(picture, cmap="gray")

    input_data = input("enter %s digits via space" % n_clusters)
    new_y = [int(i) for i in input_data.split(" ")]

    k_nn = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
    k_nn.fit(k_means.cluster_centers_, new_y)

    with open("result-k-%s.csv" % n_clusters, "w") as result_file:
        result_file.write("ImageId,Label\n")
        prediction = k_nn.predict(predict_x_array)
        for i in range(len(prediction)):
            result_file.write(str(i + 1) + "," + str(prediction[i]) + "\n")
Example #13
def getClusters(input_data):
    km = KMeans(n_clusters=10, random_state=0).fit(input_data)
    centers = km.cluster_centers_
    # add a bias column; np.insert returns a new array, so reassign it
    centers = np.insert(centers, 0, 1, axis=1)
    print("Centers : ", centers.shape)
    return centers
Example #14
def getClustering(method_name="k-mean", param_map={}):
    if method_name == "k-mean":
        from sklearn.cluster import KMeans
        return KMeans(**param_map)
    elif method_name == "dbscan":
        from sklearn.cluster import DBSCAN
        return DBSCAN(**param_map)
    return None
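Usage is a method name plus a keyword map forwarded to the estimator, e.g.:

import numpy as np
X = np.random.rand(100, 4)
model = getClustering("k-mean", {"n_clusters": 3})
labels = model.fit_predict(X)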
Example #15
def performKmeans(data, n_clusters):

    print("Performing K-Means on data")
    est = KMeans(n_clusters=n_clusters)
    est.fit(data)
    orb_cb_handler.store_estimator(est)
    
    return est
Example #16
File: Cluster.py  Project: RenzeLou/MORE
def K_means_BERT(datasets, pred_vector, labels, opt):
    # datasets: a list; each element is a [3, max_len] array sample
    # pred_vector: a model's function that predicts embeddings
    # num_classes: number of classes (inferred from the labels)
    num_classes = len(np.unique(labels))
    feature_embeddings = model_pred_BERT(datasets, pred_vector, labels, opt)
    kmeans = KMeans(n_clusters=num_classes, n_init=10).fit(feature_embeddings)
    label_list = kmeans.labels_.tolist()
    return label_list, create_msg(label_list), kmeans.cluster_centers_, feature_embeddings
Example #17
def fsrsnn(sampledData, remainedData, sampledIndex, remainedIndex,
           sampledDataFs, k):
    min_clusters, max_clusters = k_range(k)  # range of cluster counts derived from the number of true class labels
    predicted_labelAll = []
    for i in range(len(sampledData)):
        clusters = random.randint(min_clusters, max_clusters)
        # clusters = random.randint(2, 11)  # randint is inclusive: range [2, 11]
        predicted_label = KMeans(n_clusters=clusters).fit_predict(
            sampledDataFs[i])

        predicted_labelAll.append(predicted_label.tolist())  # predicted labels for each sampled dataset

    assinALLNnLabels = []  # all labels assigned via nearest neighbours

    # remainedData and sampledData have the same number of rows, so j may range over either
    for j in range(len(remainedData)):
        assinNnLabels = []  # labels assigned via nearest neighbour
        for m in range(len(remainedData[j])):
            minDist = inf
            minindex = -1
            for k in range(len(sampledData[j])):  # NOTE: shadows the parameter k, already consumed by k_range
                distJI = distEclud(remainedData[j][m],
                                   sampledData[j][k])  # distance between the remaining point and the sampled point
                if distJI < minDist:
                    minDist = distJI
                    minindex = k
            assinNnLabels.append(
                predicted_labelAll[j][minindex])  # label of the nearest sampled point
        assinALLNnLabels.append(assinNnLabels)

    # combine the two label lists and the two index lists
    combineIndex = []
    combinedLables = []
    for column in range(len(predicted_labelAll)):
        combineIndexOne = sampledIndex[column] + remainedIndex[column]
        combinedLablesOne = predicted_labelAll[column] + assinALLNnLabels[
            column]
        combineIndex.append(combineIndexOne)
        combinedLables.append(combinedLablesOne)
    # sort the shuffled indices back into ascending order
    seqIndexAll = []
    for combineIndex1 in combineIndex:
        seqIndex = []
        for seq in range(len(sampledData[0]) + len(remainedData[0])):
            for elementIndex in range(len(combineIndex1)):
                if combineIndex1[elementIndex] == seq:
                    seqIndex.append(elementIndex)
        seqIndexAll.append(seqIndex)

    # recover the combined sampledData + remainedData labels in original order
    finalLabel = []
    for finalIndex in range(len(combinedLables)):
        finallabelone = []
        for index in seqIndexAll[finalIndex]:
            finallabelone.append(combinedLables[finalIndex][index])
        finalLabel.append(finallabelone)  # final clustering result
    return finalLabel
Example #18
def performKmeans(data, n_clusters):

    print "Performing K-Means on data"
    est = KMeans(n_clusters)
    est.fit(data)
    labels = est.labels_
    labels_np = np.array(labels)

    return labels, est
Example #19
def initial_kmeans(k, rand_state, data):
    min_clusters, max_clusters = k_range(k)  # range of cluster counts derived from the number of true class labels
    kmeans_labels_arr = []  # the partition obtained for each cluster count
    for clusters in range(min_clusters, max_clusters):
        kmeans_labels = KMeans(n_clusters=clusters,
                               random_state=rand_state).fit_predict(data)
        ind_kmeans = creator.Individual(kmeans_labels)

        kmeans_labels_arr.append(ind_kmeans)
    return kmeans_labels_arr
Example #20
def evaluate_kmeans_unsupervised(data, nclusters, k_init=20):
    """
    Clusters data with kmeans algorithm and then returns the cluster centroids
    :param data: Points that need to be clustered as a numpy array
    :param nclusters: Total number of clusters
    :param method_name: Name of the method from which the clustering space originates (only used for printing)
    :return: Formatted string containing metrics and method name, cluster centers
    """
    kmeans = KMeans(n_clusters=nclusters, n_init=k_init)
    kmeans.fit(data)
    return kmeans.cluster_centers_
Example #21
def perform_cluster(data, params):
    km = KMeans()
    km.set_params(**params)
    vectorizer = TfidfVectorizer()
    print(data[1][0])
    tfidf = vectorizer.fit_transform(data[1])
    labels = km.fit_predict(tfidf)
    result = {i: [] for i in set(labels)}
    for i, l in enumerate(labels):
        result[l].append(data[0][i])
    return result
Example #22
def initialMultiRun(data, times, singleName):
    predicted_labelAll = []
    for i in range(times):
        clusters = random.randint(2, 11)
        if singleName == "kmeans":
            predicted_label = KMeans(n_clusters=clusters).fit_predict(data)
        elif singleName in ('ward', 'average', 'complete'):
            predicted_label = AgglomerativeClustering(
                linkage=singleName, n_clusters=clusters).fit_predict(data)
        predicted_labelAll.append(predicted_label.tolist())
    return predicted_labelAll
Example #23
 def __init__(self, tweet_file_path, no_of_clusters):
     """
     The constructor reads csv file and builds the data matrix.
     """
     self.np_extractor = ConllExtractor()
     self.pos_tagger = NLTKTagger()
     self.tweet_file_path = tweet_file_path
     self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
     self.vectorizer = DictVectorizer(sparse=True)
     self.k_means_estimator = KMeans(init="random",
                                     n_clusters=no_of_clusters)
Example #24
 def start_algorithm(self):
     """
     start clustering the stored tweets
     :return: list of clusters containing tweets
     """
     vectors = self.vectorize_data()
     kmeans = KMeans(init='k-means++',
                     n_clusters=self.cluster_amount,
                     n_init=10)
     kmeans.fit(vectors)
     return self.cluster_tweet(kmeans.labels_)
Example #25
 def ClusterBalance(self, indexesToPick, stopCount, kmeansFlag=True):
     print "ClusterBalancing..."
     indexesPicked = []
     obs1 = self.observations[indexesToPick]
     obs = normalize(obs1, axis=0)
     if len(indexesToPick) != 0:
         if kmeansFlag:
             if (len(indexesToPick) < self.numClusters):
                 cluster = KMeans(init='k-means++',
                                  n_clusters=len(obs),
                                  n_init=10)
             else:
                 cluster = KMeans(init='k-means++',
                                  n_clusters=self.numClusters,
                                  n_init=10)
         else:
             if (len(indexesToPick) < self.numClusters):
                 cluster = SpectralClustering(n_clusters=len(obs),
                                              n_init=10)
             else:
                 cluster = SpectralClustering(n_clusters=self.numClusters,
                                              n_init=10)
         cluster.fit(obs)
         labels = cluster.labels_
         whenToStop = max(2, stopCount)
         count = 0
         while count != whenToStop:
             cluster_list = list(range(self.numClusters))
             index = 0
             for j in labels:
                 if j in cluster_list:
                     indexesPicked.append(indexesToPick[index])
                     cluster_list.remove(j)
                     count += 1
                     if count == whenToStop:
                         break
                     labels[index] = -1
                     if len(cluster_list) == 0:
                         break
                 index += 1
     return indexesPicked
Example #26
def _centroids(n_clusters: int,
               points: List[List[float]]) -> List[List[float]]:
    """ Return n_clusters centroids of points
    """

    k_means = KMeans(n_clusters=n_clusters)
    k_means.fit(points)

    closest, _ = pairwise_distances_argmin_min(k_means.cluster_centers_,
                                               points)

    return list(map(list, np.array(points)[closest.tolist()]))
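Unlike cluster_centers_, _centroids returns actual input points (the one nearest each k-means center), which matters when a representative must be a real sample. A quick usage sketch, assuming the imports this snippet's source file provides (KMeans, pairwise_distances_argmin_min, numpy):

points = [[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 4.9]]
print(_centroids(2, points))  # two of the original points, one per cluster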
Example #27
def evaluateKMeans(data, labels, nclusters, method_name):
    '''
    Clusters data with the k-means algorithm, then returns a formatted string of metrics (tagged with the method name) together with the cluster centers
    :param data: Points that need to be clustered as a numpy array
    :param labels: True labels for the given points
    :param nclusters: Total number of clusters
    :param method_name: Name of the method from which the clustering space originates (only used for printing)
    :return: Formatted string containing metrics and method name, cluster centers
    '''
    kmeans = KMeans(n_clusters=nclusters, n_init=20)
    kmeans.fit(data)
    return getClusterMetricString(method_name, labels,
                                  kmeans.labels_), kmeans.cluster_centers_
Example #28
File: Cluster.py  Project: RenzeLou/MORE
def K_means(datasets, pred_vector, num_classes, opt):
    '''
    Args:
        datasets: a list; each element is a [3, max_len] array sample
        pred_vector: a model's function to predict embeddings
        num_classes: number of classes

    Returns:
        K-means results -- a tuple(label_list, message, cluster_centers, features)
    '''
    feature_embeddings = model_pred(datasets, pred_vector, opt)
    kmeans = KMeans(n_clusters=num_classes, n_init=10).fit(feature_embeddings)
    label_list = kmeans.labels_.tolist()
    return label_list, create_msg(label_list), kmeans.cluster_centers_, feature_embeddings
Example #29
def perLabel(label_name, labels, sample_size, n_clusters):
    print(79 * '_')
    print(label_name)
    print('% 9s' % 'feature'
          '    time  inertia    homo   compl  v-meas     ARI AMI  silhouette')
    # print("number of distinct classes for true labels for", label_name, len(Counter(labels)))
    estimator = KMeans(n_clusters=n_clusters)
    bench_k_means(labels, sample_size, estimator, "RGB", rgb_data)
    bench_k_means(labels, sample_size, estimator, "LAB", lab_data)
    bench_k_means(labels, sample_size, estimator, "HOG", hog_data)
    bench_k_means(labels, sample_size, estimator, "GIST", gist_data)
    bench_k_means(labels, sample_size, estimator, "SURF", surf_data)
    bench_k_means(labels, sample_size, estimator, "SIFT", sift_data)
    bench_k_means(labels, sample_size, estimator, "ORB", orb_data)
Example #30
def extract_word_clusters(commentList, commentCount):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    a, corpus, global_synsets = extract_global_bag_of_words(commentList, True)
    similarity_dict = {}
    i = 0
    t = len(global_synsets)**2
    
    for syn_out in global_synsets:
        similarity_dict[syn_out] = {} 
        for syn_in in global_synsets:
            if syn_in.pos() == syn_out.pos():
                similarity_dict[syn_out][syn_in] = syn_out.lin_similarity(syn_in, brown_ic)
            else:
                similarity_dict[syn_out][syn_in] = max(wn.path_similarity(syn_out,syn_in), wn.path_similarity(syn_in,syn_out))
        
            if i % 10000 == 0:
                print(i, 'synsets processed out of', len(global_synsets)**2, '(', float(i) / t, '%)')
            i += 1

    tuples = [(syn, list(sims.values())) for syn, sims in similarity_dict.items()]
    vectors = [np.array(tup[1]) for tup in tuples]

    
    # Rule of thumb
    n = int(sqrt(len(global_synsets) / 2))  # n_clusters must be an integer
    print("Number of clusters", n)
    km_model = KMeans(n_clusters=n)
    km_model.fit(vectors)
    
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(tuples[idx][0])
        
    pprint.pprint(dict(clustering), width=1)
    
    feature_vector = np.zeros([len(corpus),n])
    
    for i,comment in enumerate(corpus):
        for w in comment:
            for key, clust in clustering.items():
                if w in clust:
                    feature_vector[i][key] += 1
        if i % 1000 == 0:
            print(i, 'comments processed')
        
    print(feature_vector)