Example #1
def cluster_kmeans(vectors, num_clusters, distance_metric="cosine"):
    """ 

    Takes in vectors and clusters them using KMeans clustering.

    Inputs:
    vectors -- matrix containing rows of vectors
    num_clusters -- number of clusters to create
    distance_metric -- distance measure between vectors (default "cosine")

    """

    print "Starting KMeans clustering"

    start_time = time.time()

    # initialize the clusterer for the requested distance metric
    if distance_metric == "euclidean":
        clusterer = cluster.KMeansClusterer(num_clusters, euclidean_distance)
    elif distance_metric == "cosine":
        clusterer = cluster.KMeansClusterer(num_clusters, cosine_distance)
    else:
        raise ValueError("unsupported distance_metric: %s" % distance_metric)

    assignment = clusterer.cluster(vectors, True)

    end_time = time.time()
    print "Clustering required", (end_time - start_time), "seconds"

    return assignment
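A minimal usage sketch for the function above (not from the original project; the toy vectors are made up, and the snippet needs these module-level imports to run under Python 2, matching its print statements):

import time

import numpy
from nltk import cluster
from nltk.cluster import cosine_distance, euclidean_distance

# four well-separated 2-d points; the returned cluster ids are not deterministic
toy_vectors = [numpy.array(v) for v in [[1, 0], [0, 1], [5, 5], [6, 5]]]
assignment = cluster_kmeans(toy_vectors, num_clusters=2, distance_metric="euclidean")
print(assignment)  # e.g. [0, 0, 1, 1]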
Example #2
File: 2.py Project: Samsomyajit/K-MAENS
def demo():
    # example from figure 14.9, page 517, Manning and Schutze

    from nltk import cluster

    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = cluster.KMeansClusterer(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print 'Clustered:', vectors
    print 'As:', clusters
    print 'Means:', clusterer.means()
    print

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
    
    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds

    clusterer = cluster.KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print 'Clustered:', vectors
    print 'As:', clusters
    print 'Means:', clusterer.means()
    print

    # classify a new vector
    vector = numpy.array([3, 3])
    print 'classify(%s):' % vector,
    print clusterer.classify(vector)
    print
Example #3
def kmeans_note_fvs(keys, nfvs, n=10, metric=cluster.euclidean_distance):
    # turn fvs into vectors
    vectors = []
    for fv in nfvs:  # renamed from n to avoid shadowing the cluster-count parameter
        vectors.append(numpy.array([fv.get(k, 0) for k in keys]))
    clusterer = cluster.KMeansClusterer(n, metric)
    clusterer.cluster(vectors, True)
    return clusterer
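A hedged usage sketch for kmeans_note_fvs (not from the original project; the feature keys and note dictionaries are invented, and the imports are assumptions):

import numpy
from nltk import cluster

keys = ['pitch', 'duration', 'velocity']
nfvs = [
    {'pitch': 60, 'duration': 1, 'velocity': 80},
    {'pitch': 62, 'duration': 1, 'velocity': 85},
    {'pitch': 62, 'duration': 2},                # missing keys default to 0
    {'pitch': 72, 'duration': 4, 'velocity': 30},
    {'pitch': 74, 'duration': 4, 'velocity': 35},
]
clusterer = kmeans_note_fvs(keys, nfvs, n=2)
print(clusterer.means())  # one centroid per cluster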
Example #4
 def cluster(self,
             assignAndReturnDetails=False,
             numberOfTopFeatures=5,
             algorithmSource='nltk',
             **kwargs):
     bestFeatures, error = {}, None
     if algorithmSource == 'nltk':
         clusterer = cluster.KMeansClusterer(self.numberOfClusters,
                                             euclidean_distance, **kwargs)
         clusters = clusterer.cluster(self.vectors, True)
         means = clusterer.means()
         for id, mean in zip(clusterer.cluster_names(), means):
             bestFeatures[id] = [
                 (dimension, score) for dimension, score in
                 sorted(zip([
                     self.dimensions.get(Clustering.DIMENSION_TO_PHRASE, i)
                     for i in range(len(mean))
                 ], mean),
                        key=itemgetter(1),
                        reverse=True)[:numberOfTopFeatures] if score > 0
             ]
     elif algorithmSource == 'biopython':
         from Bio.Cluster import kcluster, clustercentroids
         clusters, error, _ = kcluster(self.vectors,
                                       nclusters=self.numberOfClusters,
                                       npass=kwargs['repeats'])
         means, _ = clustercentroids(self.vectors, self.masks, clusters)
         means = [unitVector(c) for c in means]
         for id, mean in zip(range(len(means)), means):
             bestFeatures[id] = [
                 (dimension, score) for dimension, score in
                 sorted(zip([
                     self.dimensions.get(Clustering.DIMENSION_TO_PHRASE, i)
                     for i in range(len(mean))
                 ], mean),
                        key=itemgetter(1),
                        reverse=True)[:numberOfTopFeatures] if score > 0
             ]
     if assignAndReturnDetails:
         documentAssignments = sorted(
             [(docId, clusterId)
              for docId, clusterId in zip(self.docIds, clusters)],
             key=itemgetter(1))
         clusters = dict(
             (clusterId, [t[0] for t in documents])
             for clusterId, documents in groupby(documentAssignments,
                                                 key=itemgetter(1)))
         return {
             'clusters': clusters,
             'bestFeatures': bestFeatures,
             'error': error
         }
     return clusters
Example #5
def km_cluster_docs(docs,
                    nclusters=3,
                    normalise=True,
                    distance=cluster.cosine_distance,
                    svd_d=None):
    # first convert to numeric vectors
    dv = [(id, array([count for fname, count in dfreq]))
          for id, dfreq in docs.iteritems()
          if sum(count for fname, count in dfreq) > 0]

    n_features = len(dv[0][1])
    rand_means = [
        array([random.random() for i in xrange(n_features)])
        for j in xrange(nclusters)
    ]

    if svd_d is not None:
        kmc = cluster.KMeansClusterer(
            nclusters,
            distance=distance,
            normalise=normalise,
            svd_dimensions=svd_d,
            initial_means=rand_means)  ## svd is horribly
    else:
        kmc = cluster.KMeansClusterer(nclusters,
                                      distance=distance,
                                      normalise=normalise,
                                      initial_means=rand_means)

    print "Documents: ", len([dv_[1] for dv_ in dv])
    for dv_ in dv:
        print dv_[1]
    kmc.cluster([dv_[1] for dv_ in dv])
    #print kmc.cluster(dv.values())
    classes_by_jid = dict([(id, kmc.classify(fv)) for id, fv in dv])
    return dv, classes_by_jid, kmc
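A hedged usage sketch for km_cluster_docs (Python 2, matching the iteritems/xrange/print usage above; the document ids and (feature, count) pairs are made up):

import random
from numpy import array
from nltk import cluster

# docs maps a document id to its list of (feature_name, count) pairs
docs = {
    'doc1': [('cat', 3), ('dog', 0), ('fish', 1)],
    'doc2': [('cat', 0), ('dog', 4), ('fish', 2)],
    'doc3': [('cat', 2), ('dog', 1), ('fish', 0)],
    'doc4': [('cat', 0), ('dog', 5), ('fish', 3)],
}
# note: with purely random initial means an empty cluster can occasionally
# abort the run; rerunning (or clustering more documents) usually avoids it
dv, classes_by_jid, kmc = km_cluster_docs(docs, nclusters=2)
print(classes_by_jid)  # e.g. {'doc1': 0, 'doc2': 1, 'doc3': 0, 'doc4': 1}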
Example #6
def cluster_things(keys_to_use, gold_standard="normal", make_pickle=False):
    # Open the CSV file
    vectors = []
    gold_filter = []
    with open(FEATURE_CSV, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            row_values = []
            gold_filter += [int(row['book_id'])]
            for key in row:
                if key != 'book_id' and key in keys_to_use:
                    row_values += [float(row[key])]
            vectors += [row_values]
    gold_clusters = []
    if gold_standard == "normal":
        gold_clusters = get_gold_standard(gold_filter)
    else:
        gold_clusters = get_kincaid_cluster(gold_filter)
    vectors = [array(f) for f in vectors]
    clusterer = cluster.KMeansClusterer(len(gold_clusters), euclidean_distance)
    clusters = clusterer.cluster(vectors, True)
    if make_pickle == True:
        pickle.dump(clusterer, open(PICKLE_FILE, 'w'))

    # Attempt to classify the things again, so we know which vector they belong to
    results = []
    for i in range(0, len(gold_clusters)):
        results += [[]]
    with open(FEATURE_CSV, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            row_values = []
            for key in row:
                if key != 'book_id' and key in keys_to_use:
                    row_values += [float(row[key])]
            # classify() expects a single vector, not a list of 0-d arrays
            results[clusterer.classify(array(row_values))] += [row]

    book_ids = []
    for i, c in enumerate(results):
        t = []
        for row in c:
            t += [int(row['book_id'])]
        book_ids += [t]
    # Open the source files and find the correct things
    return score_clusters(gold_clusters, book_ids)
Example #7
File: alias.py Project: schetudiante/tpred
def go_cluster(topic_rows):
    print "Clustering", len(topic_rows), "Topics"
    topics = [t[1] for t in topic_rows]

    print "Getting topic words"
    words = get_words(topics)

    print "Vectorizing topics"
    vectorized = [vectorspaced(topic, words) for topic in topics]

    k = len(topics) / 3

    c = cluster.KMeansClusterer(k, cutil.euclidean_distance, avoid_empty_clusters=True)

    print "Clustering into", k
    res = c.cluster(vectorized, assign_clusters=True, trace=False)
    #print res

    print "Clustering done, gathering"
    clusters = {}
    output_clusters = {}

    for (t, tid), cluster_id in zip(topic_rows, res):
        if cluster_id not in output_clusters:
            output_clusters[cluster_id] = []
            clusters[cluster_id] = []

        output_clusters[cluster_id].append(t)
        clusters[cluster_id].append(tid)

    #pprint.pprint(clusters)

    print "Saving clusters"
    save_clusters(output_clusters)

    return []
Example #8
fig = pylab.figure(figsize=(100, 100))
linkageMatrix = hier.linkage(distSquareMatrix, method='ward')
dendro = hier.dendrogram(linkageMatrix, orientation='left', labels=scriptlist)

#fig.show()
fig.savefig('dendrogram.png')
print '\nlinkage matrix:'
print linkageMatrix
print '\ndendrogram:'
print dendro

answer = []

vectors = [array(f) for f in data]
clusterer = cluster.KMeansClusterer(8,
                                    euclidean_distance,
                                    repeats=10,
                                    avoid_empty_clusters=True)
print '\nK-means results using NLTK:'
answer = clusterer.cluster(vectors, True)

i = 0
j = 0
for j in range(8):
    print '\n cluster:'
    print j
    for i in range(len(answer)):
        if (answer[i] == j):
            print scriptlist[i]

# classify a new vector
'''
Example #9
def main():
    print("good")
    #df = pd.read_csv("D:\\\document_vector.csv", delimiter=',')
    #print(len(df))
    '''
    my_randoms = []

    for i in range(6797):
        my_randoms.append(random.randrange(1, 101, 1))

    print(my_randoms)
    '''
    # extract the content
    title_list = []
    content_list = []
    show_list = []
    content = "D:\\NTUST\\人工智慧\\final\\csv\\headfile.csv"
    content_file = open(content, 'r', encoding='utf-8')
    content_filecsvCursor = csv.reader(content_file)
    next(content_filecsvCursor, None)  # skip the headers
    for row in content_filecsvCursor:
        #print(row[0]+"~~"+row[1])
        #title_list.append(row[3])
        #content_list.append(row[5])
        context = re.findall("(.{1,25})", row[5])
        rebuildcontext = ""
        for w in context:
            rebuildcontext += (w + "<br>")

        show_list.append(row[0] + "<br>" + row[3] + "<br>" + rebuildcontext)
        #print(rebuildcontext)
        #exit()

    # read document_vector
    fileName = "D:\\NTUST\\人工智慧\\final\\csv\\document_vector.csv"
    file = open(fileName, 'r', encoding='utf-8')
    filecsvCursor = csv.reader(file)
    high_dim_data = []
    for row in filecsvCursor:
        rowlist = list(row)
        #print(rowlist)
        temp = []
        count = 0
        for item in rowlist:
            if (count >= 2):
                temp.append(float(item))
                #print(float(item))
            count += 1
        high_dim_data.append(temp)

    # reduce to 2 dimensions with PCA
    pca = PCA(n_components=2)
    newData = pca.fit_transform(high_dim_data)  # reduce to 2 dimensions
    print(newData)
    lx = [x for x, y in newData]
    print(lx)
    ly = [y for x, y in newData]
    print(ly)

    # Create a trace
    trace = Scatter(x=lx,
                    y=ly,
                    mode='markers',
                    marker=dict(size=10,
                                color='rgba(255, 182, 193, .9)',
                                line=dict(width=2, )),
                    text="good")

    data = [trace]
    #plotly.offline.plot(data)

    vectors = [array(f) for f in newData]

    # k-means using the cosine distance metric, k means, repeating the
    # clustering 10 times with random seeds
    k = 5
    clusterer = cluster.KMeansClusterer(k, cosine_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)

    print('Clustered:', end="")
    print(vectors)
    print('As:', end="")
    print(clusters)
    print('Means:', end="")
    print(clusterer.means())

    totaldata = array(vectors)
    print(totaldata)
    print(type(totaldata))
    print(type(totaldata[0]))
    label = array(clusters)
    print(label)
    center = [f.tolist() for f in clusterer.means()]

    trace_set = []
    colornow = []
    ds = totaldata[np.where(label == 0)]

    for i in range(k):
        #colornow.append(random.random(0,255))
        r = random.randrange(0, 255)
        g = random.randrange(0, 255)
        b = random.randrange(0, 255)
        colornow.append('rgba(' + str(r) + ', ' + str(g) + ', ' + str(b) +
                        ', .9)')
    for i in range(k):
        ds = totaldata[np.where(label == i)]

        trace_now = Scatter(x=ds[:, 0],
                            y=ds[:, 1],
                            mode='markers',
                            marker=dict(size=10,
                                        color=colornow[i],
                                        line=dict(width=2, )),
                            text=show_list)
        trace_set.append(trace_now)

    centerx = [x for x, y in center]
    print(lx)
    centery = [y for x, y in center]
    center_trace = Scatter(
        x=centerx,
        y=centery,
        mode='markers',
        marker=dict(size=10, color="rgba(0,0,0)", line=dict(width=30, )),
    )
    trace_set.append(center_trace)
    plotly.offline.plot(trace_set)
Example #10
    #print type(document)
    document = nltk.Text(tweet_corpus.words(document))
    word_counts = []
    for word in unique_terms:
        word_counts.append(document.count(word))
    #print word_counts
    return word_counts


vectors = [numpy.array(BOW(f)) for f in tweet_corpus.fileids()]
print "Vectors created."
print "First 10 words are", unique_terms[:10]
print "First 10 counts for first document are", vectors[0][0:10]

CLUSTERS = 2
kmeans_clusterer = cluster.KMeansClusterer(CLUSTERS, cosine_distance)

print "Starting Clustering"
clusters = kmeans_clusterer.cluster(vectors,
                                    assign_clusters=False,
                                    trace=False)
#print 'Clustered:', vectors
#print 'As:', clusters
print "Number of clusters: ", kmeans_clusterer.num_clusters()
print "Means:", kmeans_clusterer.means()
print "Cluster names: ", str(kmeans_clusterer.cluster_names())

# Go through the docs in the same order as we did when we created feature vectors
# Create a dict of tweet => cluster ID
cluster_dict = {}
for i, fileid in enumerate(tweet_corpus.fileids()):
Example #11
# Not sure why this is needed; the value is not actually used,
# it is just needed to push the values down to the right spot
# once added to articledf
categories = pd.read_csv('categoriesIndex.txt')

fileIN = 'Output2.txt'

featwords = np.genfromtxt(fileIN,
                          dtype='U300',
                          converters={0: lambda x: x.decode()})

articledf = pd.DataFrame(data=datamat[:, 1:],
                         index=datamat[:, 0],
                         columns=featwords[1:])

clusterer = cluster.KMeansClusterer(6, euclidean_distance, repeats=1)
results = clusterer.cluster(datamat2, True)
means1 = clusterer.means()

clusterer2 = cluster.KMeansClusterer(6, cosine_distance, repeats=1)
results2 = clusterer2.cluster(datamat2, True)
means2 = clusterer2.means()

clusterer3 = cluster.KMeansClusterer(6,
                                     spatial.distance.jaccard,
                                     repeats=1,
                                     avoid_empty_clusters=True,
                                     conv_test=1)
results3 = clusterer3.cluster(datamat2, True)
means3 = clusterer3.means()
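A small follow-up sketch (not in the original) that compares how the three distance metrics spread the rows of datamat2 across the six clusters; it only assumes the results, results2 and results3 assignments computed above:

from collections import Counter

for name, res in [('euclidean', results), ('cosine', results2), ('jaccard', results3)]:
    print('%s cluster sizes: %s' % (name, dict(Counter(res))))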
Example #12
#clusterer = cluster.KMeansClusterer(2, euclidean_distance, initial_means=means)
#clusters = clusterer.cluster(vectors, True, trace=True)

#print 'Clustered:', vectors
#print 'As:', clusters
#print 'Means:', clusterer.means()
#print

#vectors = []
#vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

# k-means using the euclidean distance metric and 2 means;
# avoid_empty_clusters keeps every cluster from ending up empty

clusterer = cluster.KMeansClusterer(2,
                                    euclidean_distance,
                                    avoid_empty_clusters=True)
clusters = clusterer.cluster(vectors, True)

#print 'Clustered:', vectors
print 'As:'  # clusters
i = 2
for clst in clusters:
    print i, clst
    i = i + 1
print 'Means:', clusterer.means()
#print vectors

# classify a new vector
#vector = array([3, 3])
#print 'classify(%s):' % vector,
Example #13
File: cluster.py Project: irr/python-labs
from numpy import array
from nltk import cluster
from nltk.cluster import euclidean_distance
vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
clusterer = cluster.KMeansClusterer(2, euclidean_distance, repeats=10)
print clusterer.cluster(vectors, True)
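A short follow-up sketch (not part of the original file) that reads the fitted means and classifies a new point, mirroring the nltk demo in Example #2:

new_point = array([3, 1])
print(clusterer.means())              # the two learned centroids
print(clusterer.classify(new_point))  # index of the closest centroid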
Example #14
    def cluster(self, feats_file, num_training, docs_file):
        """ feats_file: file containing the feature vectors
            num_training: number of random data points from the file to use for training.
            Because of limited computation power we have to train our clusters on a subset of the data.
            docs_file: file containing all original data with their ids.
        """
        lines = [line.strip() for line in open(feats_file)]
        feats = []
        for line in lines:
            parts = line.split()
            v = [float(x) for x in parts[1].split(",")]
            feats.append(v)
        feats = np.array(feats)

        # use nltk clustering because it has cosine distance
        self.clusterer = cluster.KMeansClusterer(self.n_clusters,
                                                 cosine_distance,
                                                 repeats=3,
                                                 avoid_empty_clusters=True)
        # randomly select num_training data points for training
        self.clusters = self.clusterer.cluster(
            feats[np.random.choice(feats.shape[0], num_training, replace=False
                                   ), :], False, True)
        P = []
        for i in range(feats.shape[0]):
            p = self.clusterer.classify(feats[i, :])
            P.append(p)
        # load the docs
        lines = [line.strip() for line in open(docs_file)]
        docs = []
        for line in lines:
            docs.append(line)

        clean_docs = []
        # pass the docs through pipeline to remove stopwords etc
        for doc in docs:
            v = []
            words = self.pattern.sub(" ", doc.lower()).split()
            for word in words:
                if word in self.dictionary.token2id:
                    v.append(word)
            #if (len(v) > 0):
            clean_docs.append(v)

        # find what are words corresponding to each cluster
        words_cluster = {}
        cluster_counter = {}
        for itr in range(len(clean_docs)):
            cn = P[itr]
            words = clean_docs[itr]
            if cn not in cluster_counter:
                cluster_counter[cn] = 0
            cluster_counter[cn] += 1

            if cn not in words_cluster:
                words_cluster[cn] = {}
            for w in words:
                if w not in words_cluster[cn]:
                    words_cluster[cn][w] = 1
                else:
                    words_cluster[cn][w] += 1

        # find a label for each cluster
        cluster2word = {}
        for cn in words_cluster:
            sorted_words = []
            for w in sorted(words_cluster[cn],
                            key=words_cluster[cn].get,
                            reverse=True):
                sorted_words.append(w)
            cluster2word[cn] = sorted_words

        self.main_topics_id = []
        for i in sorted(cluster_counter, key=cluster_counter.get,
                        reverse=True):
            self.main_topics_id.append(i)

        self.cluster_names = {}
        for ci in cluster2word:
            v = cluster2word[ci][:4]
            #print(ci, " --- ", v)
            self.cluster_names[ci] = "-".join(v)

        print("TOP TOPICS")
        for i in range(min(10, len(self.main_topics_id))):  # guard against fewer than 10 clusters
            tid = self.main_topics_id[i]
            print(tid, " --- ", self.cluster_names[tid])
Example #15
            del v[5]  #remove ss
            del v[-2]  #remove aidf
            del v[-3]  #remove aidf
            del v[1]
            heads.append(tokens[0])
            heads2values[tokens[0]] = v
            values.append(v)

    #print len(heads)
    #print len(values)
    print(key)
    #print heads
    #print values
    vectors = [array(f) for f in values]
    clusterer = cluster.KMeansClusterer(3, euclidean_distance)
    clusters = clusterer.cluster(vectors, True)
    print(clusters)
    #print len(clusters)
    #print vectors

    cluster2head = defaultdict(list)
    for i in range(0, len(clusters)):
        head = heads[i]
        cl = clusters[i]
        cluster2head[cl].append(head)

    for cl in list(cluster2head.keys()):
        print("Cluster: {0}".format(cl))
        print("{0:31} {1}".format("head", key))
        for head in cluster2head[cl]:
Example #16
#generating the vectors for each text
tweetsBoW = [None] * len(tweets)
for tweet in range(0, len(tweets)):
    tweetsBoW[tweet] = numpy.zeros(len(vocab))
    for w in tweets[tweet]:
        for i, word in enumerate(vocab):
            if word == w:
                tweetsBoW[tweet][i] += 1

#clustering

nClustersStr = input("---> How many clusters do you want to use? ")
nClusters = int(nClustersStr)
kmeans = cluster.KMeansClusterer(nClusters,
                                 cosine_distance,
                                 avoid_empty_clusters=True,
                                 conv_test=1e-4)
clusters = kmeans.cluster(tweetsBoW, True, trace=True)

for doc, cls in zip(unchangedTweets, clusters):
    print(cls, doc)

#plotting clusters number of elements for analysis
labels = list(Counter(clusters))
print(labels)
values = list(Counter(clusters).values())
print(values)
y_pos = numpy.arange(len(labels))
plt.bar(y_pos, values, color=(0.5, 0.1, 0.5, 0.6))
plt.title('Number of clusters in total = ' + str(nClusters))
plt.xlabel('Clusters')