import numpy as np
from nltk.cluster import KMeansClusterer
from nltk.cluster.util import cosine_distance

# NUM_CLUSTERS is assumed to be a module-level constant defined elsewhere.


def ClusterItems(data_file, items_bias_file, index_file, clusters_file,
                 centroids_file):

    data = np.genfromtxt(data_file)
    popular_items = np.genfromtxt(index_file).astype('int')
    data = data[popular_items]
    items_bias = np.genfromtxt(items_bias_file)
    important_items = np.where(np.abs(items_bias[popular_items]) < 0.2)[0]
    kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=cosine_distance)
    print("clusters:", NUM_CLUSTERS, "items kept:", important_items.shape)
    print("data shape:", data.shape)
    clusters = kclusterer.cluster(data[important_items], assign_clusters=True)
    np.savetxt(centroids_file, kclusterer.means())
    np.savetxt(clusters_file, clusters)
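
# A minimal invocation of the function above, with hypothetical file names
# (the first three files are read with np.genfromtxt, the last two written):
# ClusterItems('item_factors.txt', 'item_bias.txt', 'popular_index.txt',
#              'clusters.txt', 'centroids.txt')
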
def recluster(df, cl, clusters, n_clusters):
    lbls = cl.labels_
    mask = np.zeros(len(lbls), dtype=bool)
    for c in clusters:
        mask |= lbls==c
    subpipe, results = data_pipeline(df[mask])
    
    # Use cosine similarity via the NLTK clustering implementation; the
    # KMeans object returned by cluster() is kept as a carrier so callers
    # see a consistent interface.
    subcl = cluster(results, n_clusters)
    kclusterer = KMeansClusterer(n_clusters,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=50)
    assigned_clusters = kclusterer.cluster(results, assign_clusters=True)
    # Assign the new cluster labels and centroids onto the carrier object.
    subcl.labels_ = np.array(assigned_clusters)
    subcl.cluster_centers_ = np.array(kclusterer.means())
    
    return subpipe, subcl, results, df[mask]
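
data_pipeline and cluster are not shown in this example. A minimal sketch of
what they might look like, assuming a scikit-learn scaling pipeline and a thin
KMeans wrapper (both names and all steps here are assumptions):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

def data_pipeline(df):
    # Hypothetical: scale the features and return the fitted pipeline
    # together with the transformed matrix.
    pipe = Pipeline([('scale', StandardScaler())])
    results = pipe.fit_transform(df)
    return pipe, results

def cluster(results, n_clusters):
    # Hypothetical: fit a plain sklearn KMeans; recluster() then overwrites
    # its labels_ and cluster_centers_ with the NLTK cosine-distance solution.
    return KMeans(n_clusters=n_clusters).fit(results)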
Example #3
def cluster_anchors(datacfg, workspace):  # hypothetical name and signature; the def line is missing
    lines = open(datacfg).readlines()
    images = []
    for line in lines:
        if line.split(' ')[0] == 'train':
            valid_path = line.strip().split(' ')[-1]
            if valid_path[0] != '/':
                valid_path = workspace + valid_path
            lists = open(valid_path).readlines()
            images = [x.strip() for x in lists]

    bboxes = []
    for image in images:
        label = image.replace('.jpg', '.txt')
        lines = open(label).readlines()
        for line in lines:
            splitline = line.split(' ')
            # bboxes.append([float(x)*13. for x in splitline[-2:]])
            # The last two fields are the box width and height (scale 1.0 here;
            # the commented line scales to 13x13 grid units instead).
            bboxes.append([float(splitline[-2]), float(splitline[-1])])
    print(len(bboxes))
    # samples = random.sample(bboxes, 15000)
    # print(len(samples))
    bboxes = np.array(bboxes)
    # samples = np.array(samples)
    # print(samples.shape)

    kclusterer = KMeansClusterer(5, negIoU, repeats=1)
    # clusters = kclusterer.cluster(samples, True)
    clusters = kclusterer.cluster(bboxes, True)
    centroids = kclusterer.means()
    # Divide by the grid scale (1x1 here, i.e. no rescaling).
    print(np.array(centroids) / np.array((1., 1.)))
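
negIoU is not defined in the snippet above. For anchor-box clustering it is
conventionally one minus the IoU of two (width, height) pairs placed at a
common corner; a minimal sketch under that assumption:

import numpy as np

def negIoU(u, v):
    # u and v are (width, height) pairs sharing a corner, so the
    # intersection area is min(w) * min(h); distance = 1 - IoU.
    inter = min(u[0], v[0]) * min(u[1], v[1])
    union = u[0] * u[1] + v[0] * v[1] - inter
    return 1.0 - inter / union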
def main(argv):
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='input file', required=True)
    parser.add_argument('-s', '--step', help='step', required=True)
    parser.add_argument('-ik', '--init_k', help='K initial', required=True)
    parser.add_argument('-fk', '--final_k', help='K final', required=True)
    parser.add_argument('-od',
                        '--distortion_out_file',
                        help='elbow distortion graph file',
                        required=True)
    parser.add_argument('-os',
                        '--silhouette_out_file',
                        help='elbow silhouette graph file',
                        required=True)
    parser.add_argument('-pca', '--pca', help='with pca', action='store_true')
    parser.add_argument('-k_pca', '--k_pca', help='k pca')
    ARGS = parser.parse_args()

    descriptors = load_dataset(ARGS.input_file)
    if ARGS.pca:
        print("With PCA")
        pca = PCA(n_components=int(ARGS.k_pca))
        descriptors = pca.fit_transform(descriptors)

    ks = []
    distortions = []
    silhouettes = []

    # Note: range() excludes final_k itself.
    for k in range(int(ARGS.init_k), int(ARGS.final_k), int(ARGS.step)):
        # kmeanModel = KMeans(n_clusters=k, init='k-means++')
        # kmeanModel.fit(descriptors)
        # predictions = kmeanModel.predict(descriptors)
        # cluster_centers_ = kmeanModel.cluster_centers_

        kclusterer = KMeansClusterer(
            k, distance=nltk.cluster.util.cosine_distance)
        predictions = kclusterer.cluster(descriptors, assign_clusters=True)
        predictions = np.array(predictions)
        cluster_centers_ = np.array(kclusterer.means())

        distortion = sum(
            np.min(distance.cdist(descriptors, cluster_centers_, 'cosine'),
                   axis=1)) / descriptors.shape[0]

        silhouette_score = metrics.silhouette_score(descriptors,
                                                    predictions,
                                                    metric='cosine')

        distortions.append(distortion)
        silhouettes.append(silhouette_score)
        ks.append(k)

        print("k:", k, "distortion:", distortion, "Silhouette Coefficient",
              silhouette_score)

    # Plot the elbow with distortion
    fig = plt.figure()
    plt.plot(ks, distortions, 'bx-')
    plt.grid()
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method')
    fig.savefig(ARGS.distortion_out_file)

    # Plot the silhouette scores
    fig = plt.figure()
    plt.plot(ks, silhouettes, 'bx-')
    plt.grid()
    plt.xlabel('k')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score analysis')
    fig.savefig(ARGS.silhouette_out_file)
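
    # (Assumption, beyond the original script) A simple way to pick k from
    # these curves is to keep the k with the highest silhouette score:
    best_k = ks[int(np.argmax(silhouettes))]
    print("Best k by silhouette score:", best_k)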
Example #5
        if score < 0.7:
            break
        try:
            arr = numpy.append(arr,
                               numpy.reshape(model.wv.word_vec(phrase),
                                             (1, 100)),
                               axis=0)
        except KeyError:
            pass
        else:
            embedded_phrases.append(phrase)

    print('number of sample points:', len(embedded_phrases))

    kmeans = KMeansClusterer(6, nltk.cluster.util.cosine_distance)
    clusters = kmeans.cluster(arr, assign_clusters=True)
    centers = kmeans.means()

    # Group each phrase with its distance to its cluster centroid.
    result = {c: [] for c in range(6)}
    for i in range(len(clusters)):
        result[clusters[i]].append([
            nltk.cluster.util.cosine_distance(centers[clusters[i]], arr[i]),
            embedded_phrases[i]
        ])
    for k in result:
        # Sort phrases by distance to their centroid, farthest first.
        sorted_result = sorted(result[k], reverse=True)
        final_result = '\n'.join(
            ['%.10f' % x[0] + '\t' + x[1] for x in sorted_result])
        with open('cluster' + str(k) + '.txt', 'w') as f:
            f.write(final_result)
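
The top of this example is truncated: the enclosing loop header is missing. It
plausibly consumed (phrase, score) pairs from a gensim word2vec model; a
sketch of that setup, where model, seed, and the topn value are all
assumptions:

import numpy

arr = numpy.empty((0, 100))   # accumulator for 100-dimensional phrase vectors
embedded_phrases = []
for phrase, score in model.wv.most_similar(positive=[seed], topn=1000):
    if score < 0.7:           # same cutoff as in the body above
        break
    arr = numpy.append(arr,
                       numpy.reshape(model.wv.word_vec(phrase), (1, 100)),
                       axis=0)
    embedded_phrases.append(phrase)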
# In[222]:

# Part 3 - Run 50 random initializations

cos_rand_scores = np.zeros(50)
cos_mi_scores = np.zeros(50)
centroids_cos = {}
for i in range(50):
    km_cos = KMeansClusterer(5,
                             distance=cosine_distance,
                             avoid_empty_clusters=True)
    km_cos_cl = km_cos.cluster(X, assign_clusters=True)
    cos_rand_scores[i] = met.adjusted_rand_score(true_Labels, km_cos_cl)
    cos_mi_scores[i] = met.adjusted_mutual_info_score(true_Labels, km_cos_cl)
    centroids_cos[i] = km_cos.means()
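
# (Assumption, beyond the original notebook) The stored per-run centroids make
# it easy to recover the best-scoring initialization:
best_run = int(np.argmax(cos_rand_scores))
best_centroids = centroids_cos[best_run]
print("Best run by Adjusted Rand Score:", best_run)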

# In[199]:

# Report average Adjusted Rand and Mutual Information scores

print(
    "Adjusted Rand Score Averaged over 50 initializations (Cosine Distance): "
    + str(cos_rand_scores.mean()))
print(
    "Adjusted Rand Score St Dev over 50 initializations (Cosine Distance): " +
    str(cos_rand_scores.std()))
print(
    "\nAdjusted Mutual Information Score Averaged over 50 initializations (Cosine Distance): "
    + str(cos_mi_scores.mean()))
print(
    "Adjusted Mutual Information Score St Dev over 50 initializations (Cosine Distance): "
    + str(cos_mi_scores.std()))
Example #7
def main(argv):
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='input file', required=True)
    parser.add_argument('-ids', '--ids_file', help='ids file', required=True)
    parser.add_argument('-n_components',
                        '--n_components',
                        help='number of components in pca',
                        required=True)
    parser.add_argument('-k', '--k', help='k of kmeans', required=True)
    ARGS = parser.parse_args()

    descriptors = load_dataset(ARGS.input_file)
    ids_list, news_groups = get_hash_ids(ARGS.ids_file)

    print("PCA")
    pca = PCA(n_components=int(ARGS.n_components))
    descriptors = pca.fit_transform(descriptors)

    # kmeanModel = KMeans(n_clusters=int(ARGS.k), init='k-means++')
    # kmeanModel.fit(descriptors)
    # predictions = kmeanModel.predict(descriptors)
    # cluster_centers_ = kmeanModel.cluster_centers_
    # print(predictions)

    print("Kmeans")
    kclusterer = KMeansClusterer(int(ARGS.k),
                                 distance=nltk.cluster.util.cosine_distance)
    predictions = np.array(
        kclusterer.cluster(descriptors, assign_clusters=True))
    cluster_centers_ = np.array(kclusterer.means())

    print("Distortions")
    # distortion_eu = sum(np.min(distance.cdist(descriptors, cluster_centers_, 'euclidean'), axis=1)) / descriptors.shape[0]
    distortion_cos = sum(
        np.min(distance.cdist(descriptors, cluster_centers_, 'cosine'),
               axis=1)) / descriptors.shape[0]

    print("Silhouettes")
    # silhouette_score_eu = metrics.silhouette_score(descriptors, predictions, metric='euclidean')
    silhouette_score_cos = metrics.silhouette_score(descriptors,
                                                    predictions,
                                                    metric='cosine')

    # print("EUCLIDEAN K:", ARGS.k, "distortion:", distortion_eu, "silhouette score:", silhouette_score_eu)
    print("COS K:", ARGS.k, "distortion:", distortion_cos, "silhouette score:",
          silhouette_score_cos)

    # Medoids: the actual descriptor closest to each cluster centroid
    # (pairwise_distances_argmin_min defaults to euclidean distance).
    closest, _ = pairwise_distances_argmin_min(cluster_centers_, descriptors)

    medoids_ids = ids_list[closest]

    medoids = descriptors[closest]

    dist = distance.cdist(medoids, medoids, metric='cosine')
    # Five nearest medoids to each medoid (column 0 is the medoid itself).
    knns = dist.argsort(axis=1)[:, :6][:, 1:]

    for id_, knn in zip(medoids_ids, knns):
        print("\nMedoid id:", id_, "label:", news_groups[id_])
        print("Cercanos:")
        for nn in knn:
            print("\t id:", medoids_ids[nn], "labels:",
                  news_groups[medoids_ids[nn]])

    metric = []

    # One purity-style score per cluster.
    for i in range(int(ARGS.k)):
        ids_l = ids_list[np.where(predictions == i)]

        #     if len(ids_l) == 0:
        #         counter_0+=1
        #         continue
        clusters_labels = []
        for id_l in ids_l:
            label_list = news_groups[id_l]
            for ll in label_list:
                clusters_labels.append(ll)

        clnp = np.array(clusters_labels)
        # Count occurrences of each newsgroup label within this cluster.
        uni, con = np.unique(clnp, return_counts=True)
        #letter_counts = Counter(clusters_labels)
        #df = pandas.DataFrame.from_dict(letter_counts, orient='index')

        # Sort labels by frequency, most frequent first.
        ind = np.argsort(con)[::-1]
        uni = uni[ind]
        con = con[ind]

        maxim = con.sum()
        cont = con[0]

        # Also credit labels that share a dotted component with the dominant
        # label (e.g. 'comp.graphics' matches 'comp.sys.mac' on 'comp').
        label = uni[0]
        uni = uni[1:]
        con = con[1:]
        marker = np.zeros(uni.shape)

        for s in label.split('.'):
            for j in range(uni.shape[0]):
                if marker[j] == 0 and s in uni[j]:
                    cont += con[j]
                    marker[j] = 1

    #     print("cluster:", i, "metrica:", cont/maxim  )
        metric.append(cont / maxim)

    metric = np.array(metric, dtype=float)

    print("mean:", metric.mean())
    print("std:", metric.std())
    print("median:", np.median(metric))
    print("Min:", np.min(metric))
    print("Max:", np.max(metric))

    return 0