Example #1
def demo():
    # example from figure 14.9, page 517, Manning and Schutze

    import numpy
    from nltk.cluster import KMeansClusterer, euclidean_distance

    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds

    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    # classify a new vector
    vector = numpy.array([3, 3])
    print('classify(%s):' % vector, end=' ')
    print(clusterer.classify(vector))
    print()
Example #3
def grouper(filename):
    stemmer_func = nltk.stem.snowball.EnglishStemmer().stem

    @decorators.memoize
    def normalize_word(word):
        return stemmer_func(word.lower())

    def get_words(titles):
        words = set()
        for title in titles:
            for word in title.split():
                words.add(normalize_word(word))
        return list(words)

    @decorators.memoize
    def vectorspaced(title):
        title_components = [normalize_word(word) for word in title.split()]
        return numpy.array([
            word in title_components for word in words], numpy.short)
        
    with open(filename) as title_file:

        job_titles = [line.decode('utf-8').strip() for line in title_file.readlines()]
        #name = Data(keyword = job_titles)
        #db.session.add(name)
        #db.session.commit()
        words = get_words(job_titles)
        # pick the number of clusters from the size of the vocabulary
        if len(words) >= 1500:
            k = 75
        elif len(words) >= 500:
            k = 55
        elif len(words) > 200:
            k = 30
        else:
            k = 15

        cluster = KMeansClusterer(k, euclidean_distance, avoid_empty_clusters=True)
        cluster.cluster([vectorspaced(title) for title in job_titles if title])

        classified_examples = [
                cluster.classify(vectorspaced(title)) for title in job_titles
            ]
        global gen_file
        gen_file =str(uuid.uuid4())+".csv"
        f = open("/home/ubuntu/downloads/"+gen_file,'wb')
        try:
            w = csv.writer(f)
            w.writerow(('Search Terms','GroupID'))
            for cluster_id, title in sorted(zip(classified_examples, job_titles)):
                w.writerow((title.encode('utf-8'),cluster_id))
            #print "done"
        finally:
            f.close()
        f1 = open("/home/ubuntu/time/"+gen_file+".txt", 'wb')
        try:
            t = (time.time() - start_time)
            f1.write(str(t))
        finally:
            f1.close()
Example #4
def get_clusters(txt):
    clusters = {}
    num_clusters = len(txt) // 4  # integer division so KMeansClusterer gets an int
    if num_clusters < 2:
        num_clusters = 2
    if num_clusters > 5:
        num_clusters = 5
    #txt = [''.join([l for l in txt])]
    #print txt
    responses = [line.strip() for line in txt]
    words = get_words(responses)

    cluster = KMeansClusterer(num_clusters,
                              euclidean_distance,
                              repeats=100,
                              avoid_empty_clusters=True)
    cluster.cluster(
        [vectorspaced(response, words) for response in responses if response])
    classified_examples = [
        cluster.classify(vectorspaced(response, words))
        for response in responses
    ]

    for cluster_id, title in sorted(zip(classified_examples, responses)):
        if cluster_id not in clusters:
            clusters[cluster_id] = [title]
        else:
            clusters[cluster_id].append(title)

    return clusters
Example #5
def cluster_kmean(train_file, test_file):
    """Load train and test data into data frames"""
    f_train = open(train_file, encoding="utf-8")
    train_data = json.load(f_train)
    df_train = pd.DataFrame(train_data, columns=['text'])
    f_train.close()

    f_test = open(test_file, encoding='utf-8')
    test_data = json.load(f_test)
    df_test = pd.DataFrame(test_data, columns=['text', 'labels'])
    f_test.close()

    labels = df_test.labels
    labels = list(set(sum(labels, [])))[:3]
    """"Initialize TF-IDF vectorizer"""
    tfidf_vect = TfidfVectorizer(stop_words="english", min_df=5)

    dtm = tfidf_vect.fit_transform(df_train['text'])

    num_clusters = 3
    """Initialize clutering"""
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=20)

    clusters = clusterer.cluster(dtm.toarray(), assign_clusters=True)

    centroids = np.array(clusterer.means())

    sorted_centroids = centroids.argsort()[:, ::-1]

    voc_lookup = tfidf_vect.get_feature_names()

    test_dtm = tfidf_vect.transform(df_test.text)

    predicted = [clusterer.classify(v) for v in test_dtm.toarray()]

    df_test['label_test'] = df_test['labels'].apply(lambda x: x[0])

    confusion_df = pd.DataFrame(list(
        zip(df_test["label_test"].values, predicted)),
                                columns=["actual_class", "cluster"])

    df_result = pd.crosstab(index=confusion_df.cluster,
                            columns=confusion_df.actual_class)

    print(df_result)

    df_clusterLabelsPredicted = list(
        df_result.apply(lambda x: x.idxmax(), axis=1))
    cluster_dict = dict(
        (i, j) for i, j in enumerate(df_clusterLabelsPredicted))

    predicted_target = [cluster_dict[i] for i in predicted]

    print(
        metrics.classification_report(df_test["label_test"], predicted_target))
    for i in cluster_dict:
        print("Cluster %d : Topic %s" % (i, cluster_dict[i]))
Example #6
def cluster_kmean(train_file, test_file):

    data = pd.read_json(train_file, orient='columns')
    data.columns = ["text"]
    tfidf_vect = TfidfVectorizer(min_df=5, stop_words='english')
    dtm = tfidf_vect.fit_transform(data["text"])

    num_clusters = 3
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=5)
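    # NLTK's KMeansClusterer works on dense numpy vectors, which is why the
    # sparse TF-IDF matrix is converted with .toarray() before clustering.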
    clusters = clusterer.cluster(dtm.toarray(), assign_clusters=True)

    test = pd.read_json(test_file, orient='columns')
    test.columns = ["text", "label"]

    # keep only the first label when a test document has multiple ground-truth labels
    x = test["label"]
    truth = []
    for item in x:
        truth.append(item[0])
    test["label"] = truth

    test_dtm = tfidf_vect.transform(test["text"])
    predicted = [clusterer.classify(v) for v in test_dtm.toarray()]
    confusion_df = pd.DataFrame(list(zip(test["label"].values, predicted)),
                                columns=["label", "cluster"])
    crosstab = pd.crosstab(index=confusion_df.cluster,
                           columns=confusion_df.label)
    print("using cosine: ")
    print(crosstab)
    dfmax = crosstab.idxmax(axis=1)
    print(dfmax)
    cluster_dict = {0: dfmax[0], 1: dfmax[1], 2: dfmax[2]}
    predicted_target = [cluster_dict[i] for i in predicted]

    print(metrics.classification_report(test["label"], predicted_target))

    # Kmeans with 20 different centroid seeds
    num_clusters = 3
    km = KMeans(n_clusters=num_clusters, n_init=20).fit(dtm)
    clusters = km.labels_.tolist()
    predicted2 = km.predict(test_dtm)
    confusion_df2 = pd.DataFrame(list(zip(test["label"].values, predicted2)),
                                 columns=["label", "cluster"])

    crosstab2 = pd.crosstab(index=confusion_df2.cluster,
                            columns=confusion_df2.label)
    print("using Euclidean distance")
    print(crosstab2)
    dfmax = crosstab2.idxmax(axis=1)
    print(dfmax)
    cluster_dict = {0: dfmax[0], 1: dfmax[1], 2: dfmax[2]}

    predicted_target2 = [cluster_dict[i] for i in predicted2]
    print(metrics.classification_report(test["label"], predicted_target2))

    return None
Example #7
def get_word_clusters(tweets):
    ListTweets = get_all_text(tweets)
    ListTweets = list(ListTweets)
    #   Project tweet text onto a vector space 
    vs_tweets = list(TweetVectors(tweets))
    cluster = KMeansClusterer(10, euclidean_distance, avoid_empty_clusters = True)
    cluster.cluster(vs_tweets)
    classified_examples = [ cluster.classify(tweet) for tweet in vs_tweets ]
    for cluster_id, tweet in sorted(zip(classified_examples, ListTweets)):
        print(cluster_id, tweet)
Example #8
def main():
    tracknames = get_tracknames()  
    #title_file = open("example_jobs.txt", 'r')

    #job_titles = [line.strip() for line in title_file.readlines()]
    words = get_words(tracknames)

    cluster = KMeansClusterer(20, euclidean_distance, avoid_empty_clusters=True)
    cluster.cluster([vectorspaced(trakname, words) for trakname in tracknames if trakname])
    classified_examples = [cluster.classify(vectorspaced(trackname, words)) for trackname in tracknames]


    for cluster_id, title in sorted(zip(classified_examples, tracknames)):
        print(cluster_id, title)
Example #9
File: kmeans.py  Project: yokeyong/atap
class KMeansTopics(object):

    def __init__(self, corpus, k=10):
        """
        corpus is a corpus object, e.g. an HTMLCorpusReader()
        or an HTMLPickledCorpusReader() object

        k is the number of clusters
        """
        self.k = k
        self.model = None
        self.vocab = list(
            set(normalize(corpus.words(categories=['news'])))
            )

    def vectorize(self, document):
        """
        Vectorizes a document consisting of a list of part of speech
        tagged tokens using the segmentation and tokenization methods.

        One-hot encode the set of documents
        """
        features = set(normalize(document))
        return np.array([
            token in features for token in self.vocab], np.short)

    def cluster(self, corpus):
        """
        Fits the K-Means model to the given data.
        """
        cosine = nltk.cluster.util.cosine_distance
        self.model = KMeansClusterer(
            self.k, distance=cosine, avoid_empty_clusters=True)
        self.model.cluster([
            self.vectorize(
                corpus.words(fileid)
            ) for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Pass through to the internal model classify
        """
        return self.model.classify(self.vectorize(document))
Example #11
def cluster_kmean(train_file, test_file):
    with open(train_file, 'r', encoding='utf-8') as f:
        jayson_train = json.load(f)

    with open(test_file, 'r', encoding='utf-8') as f:
        jayson_test = json.load(f)

    train = pd.DataFrame(jayson_train)
    # Initialize the TfidfVectorizer
    # Set min document frequency to 5
    tfidf_vect = TfidfVectorizer(stop_words="english", min_df=5)
    dtm = tfidf_vect.fit_transform(train[0])
    # set number of clusters
    num_clusters = 3
    clusterer_Cos = KMeansClusterer(num_clusters,
                                    distance=cosine_distance,
                                    repeats=20)
    clusterer_Euc = KMeansClusterer(num_clusters,
                                    distance=euclidean_distance,
                                    repeats=20)
    clusters_cos = clusterer_Cos.cluster(dtm.toarray(), assign_clusters=True)
    clusters_Euc = clusterer_Euc.cluster(dtm.toarray(), assign_clusters=True)

    test = pd.DataFrame(jayson_test)
    # Use the first label in the ground-truth label list of each test document
    t = []
    for i in test[1]:
        t.append(i[0])
    test['label'] = t
    # Make prediction on test sample
    test_dtm = tfidf_vect.transform(test[0])
    predicted_cos = [clusterer_Cos.classify(v) for v in test_dtm.toarray()]
    predicted_Euc = [clusterer_Euc.classify(v) for v in test_dtm.toarray()]
    # Create a dataframe with cluster id and ground truth label
    confusion_df_cos = pd.DataFrame(list(
        zip(test['label'].values, predicted_cos)),
                                    columns=["label", "cluster"])
    confusion_df_Euc = pd.DataFrame(list(
        zip(test['label'].values, predicted_Euc)),
                                    columns=["label", "cluster"])
    # Draw the crosstab table
    crosstab_cos = pd.crosstab(index=confusion_df_cos['cluster'],
                               columns=confusion_df_cos['label'])
    crosstab_Euc = pd.crosstab(index=confusion_df_Euc['cluster'],
                               columns=confusion_df_Euc['label'])
    # Draw the majority vote into dictionary
    majority_vote_cos = crosstab_cos.idxmax(axis=1, skipna=True).to_dict()
    majority_vote_Euc = crosstab_Euc.idxmax(axis=1, skipna=True).to_dict()
    # Map true label to cluster id
    predicted_target_cos = [majority_vote_cos[i] for i in predicted_cos]
    predicted_target_Euc = [majority_vote_Euc[i] for i in predicted_Euc]
    # Precision/recall/f-score for each label
    result_cos = metrics.classification_report(test["label"],
                                               predicted_target_cos)
    result_Euc = metrics.classification_report(test["label"],
                                               predicted_target_Euc)
    # Print out the result
    print('cosine')
    print(crosstab_cos)
    for i in majority_vote_cos:
        print('Cluster %d: Topic %s' % (i, majority_vote_cos[i]))
    print(result_cos)
    print('\nL2')
    print(crosstab_Euc)
    for i in majority_vote_Euc:
        print('Cluster %d: Topic %s' % (i, majority_vote_Euc[i]))
    print(result_Euc)
    return None
Example #12
from nltk.cluster import KMeansClusterer, euclidean_distance
import nltk.stem
import nltk.corpus
import numpy

stemmer_func = nltk.stem.snowball.SnowballStemmer("english").stem
stop_words = set(nltk.corpus.stopwords.words('english'))

def normalize_word(word):
    return stemmer_func(word.lower())

def get_words(posts):
    words = set()
    for post in posts:
        for word in post.split():
            words.add(normalize_word(word))
    return list(words)

def vectorspaced(post):
    post_components = [normalize_word(word) for word in post.split()]
    return numpy.array([
        word in post_components and word not in stop_words
        for word in words], numpy.short)

# blog_data is assumed to be a list of post strings defined elsewhere in the project
words = get_words(blog_data)

cluster = KMeansClusterer(7, euclidean_distance)
cluster.cluster([vectorspaced(post) for post in blog_data if post])
classified_examples = [cluster.classify(vectorspaced(post)) for post in blog_data]

for cluster_id, post in sorted(zip(classified_examples, blog_data)):
    print(cluster_id, post)
Example #13
def cluster_kmean(train_file, test_file):
    with open(train_file) as json_train_file:
        train_json_data = json.load(json_train_file)
        train_json_dataframe = pd.DataFrame(train_json_data)
        train_json_dataframe.columns = ['Text']
        #print(train_json_dataframe)

    with open(test_file) as json_test_file:
        test_json_data = json.load(json_test_file)
        test_json_dataframe = pd.DataFrame(test_json_data)
        test_json_dataframe.columns = ['Text', 'Labels']
        test_json_dataframe['First'] = [
            x[0] for x in test_json_dataframe.Labels
        ]
        unique_variety = test_json_dataframe["First"].unique()
#         print(unique_variety)
#         print(test_json_dataframe)

    # set the min document frequency to 5
    # generate tfidf matrix
    tfidf_vect = TfidfVectorizer(stop_words="english", min_df=5)

    dtm = tfidf_vect.fit_transform(train_json_dataframe['Text'])
    #print (dtm.shape)

    # set number of clusters
    num_clusters = 3

    # initialize clustering model
    # using cosine distance
    # clustering will repeat 20 times
    # each with different initial centroids
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=20)

    # samples are assigned to cluster labels
    # starting from 0
    clusters = clusterer.cluster(dtm.toarray(), assign_clusters=True)

    #print the cluster labels of the first 5 samples
    #print(clusters[0:5])

    # note transform function is used
    # not fit_transform
    test_dtm = tfidf_vect.transform(test_json_dataframe["Text"])

    predicted = [clusterer.classify(v) for v in test_dtm.toarray()]

    #print(predicted[0:10])

    # determine cluster labels and calculate precision and recall

    # Create a dataframe with cluster id and
    # ground truth label
    confusion_df = pd.DataFrame(list(
        zip(test_json_dataframe['First'].values, predicted)),
                                columns=["label", "cluster"])
    confusion_df.head()

    # generate crosstab between clusters and true labels
    crosstab = pd.crosstab(index=confusion_df.cluster, columns=confusion_df.label)
    print(crosstab)

    # Map cluster id to true label by "majority vote"
    cluster_dict = crosstab.idxmax(axis=1).to_dict()
    print(cluster_dict)

    # Map true label to cluster id
    predicted_target = [cluster_dict[i] for i in predicted]

    print(
        metrics.classification_report(test_json_dataframe['First'],
                                      predicted_target))
Example #14
tfidf = transformer.fit_transform(vectorizer.fit_transform(articals))
print(tfidf.toarray())
print(tfidf)
dtm = tfidf.toarray()

# Cluster the chapters using the TF-IDF matrix
## k-means clustering with cosine distance
# the closer the cosine is to 1, the smaller the angle and the more similar the chapters
kmeans = KMeansClusterer(
    num_means=3,  ## number of clusters
    distance=nltk.cluster.util.cosine_distance,  ## cosine distance
)
kmeans.cluster(dtm)
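# cluster() is called without assign_clusters=True here, so it only learns the
# three means; each chapter is then labelled with classify() below.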

##  the cluster label assigned to each chapter
labpre = [kmeans.classify(i) for i in dtm]
kmeanlab = Red_df[["ChapName", "Chapter"]]
kmeanlab["cosd_pre"] = labpre
print(kmeanlab)

## count how many chapters fall into each cluster
count = kmeanlab.groupby("cosd_pre").count()

## visualization
count = count.reset_index()
count.plot(kind="barh",
           figsize=(6, 5),
           x="cosd_pre",
           y="ChapName",
           legend=False)
for xx, yy, s in zip(count.cosd_pre, count.ChapName, count.ChapName):
Example #15
        for word in words], numpy.short)
 
if __name__ == '__main__':
 
    filename=r'C:\Users\Shravya.Shanmukh\AppData\Roaming\Microsoft\Windows\Start Menu\Programs\Python 3.6\Cars.txt'
    
    with open(filename) as title_file:
 
        print ("Reading Files")
        job_titles = [line.strip() for line in title_file.readlines()]
 
        print ("Parsing Words")
        words = get_words(job_titles)
 
        print ("Creating Cluster Instance")
        cluster = KMeansClusterer(4, euclidean_distance, 5)

              
        print ("Clustering")
        cluster.cluster([vectorspaced(title) for title in job_titles if title])
 
        print ("Classifying")
        classified_examples = [
                cluster.classify(vectorspaced(title)) for title in job_titles
           ]
        print ("Saving results")
        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            filename = r'C:\Users\Shravya.Shanmukh\AppData\Roaming\Microsoft\Windows\Start Menu\Programs\Python 3.6\results' + str(cluster_id) + ".csv"
            with codecs.open(filename, "a", "utf-8") as result_file:
                result_file.write(title + "\n")
Example #16
        exit()
 
    with open(filename) as title_file:
 
        print "Reading Files"
        job_titles = [unicode(line.strip(), "utf-8") for line in title_file.readlines()]
 
        print "Parsing Words"
        words = get_words(job_titles)
 
        print "Creating Cluster Instance"
        cluster = KMeansClusterer(10, euclidean_distance, 5)

        # Alternative Clusterer - Less accurate for my use
        #cluster = GAAClusterer(20)
        
        print "Clustering"
        cluster.cluster([vectorspaced(title) for title in job_titles if title])
 
        # NOTE: This is inefficient, cluster.classify should really just be
        # called when you are classifying previously unseen examples!
        print "Classifying"
        classified_examples = [
                cluster.classify(vectorspaced(title)) for title in job_titles
           ]
        print "Saving results"
        for cluster_id, title in sorted(zip(classified_examples, job_titles)):
            filename = "results/" + str(cluster_id) + ".txt"
            with codecs.open(filename, "a", "utf-8") as result_file:
                result_file.write(title + "\n")
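The NOTE above is worth acting on: when assign_clusters=True, cluster() already returns one cluster id per input vector, so classify() does not need to be re-run on the training titles. A minimal sketch of that variant, assuming the same cluster, vectorspaced and job_titles objects as in this example (empty titles are skipped, so the ids line up with the non-empty titles only):

non_empty_titles = [title for title in job_titles if title]
vectors = [vectorspaced(title) for title in non_empty_titles]
# one cluster id per vector, in the same order as non_empty_titles
classified_examples = cluster.cluster(vectors, assign_clusters=True)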
Example #17
    clustered: pd.DataFrame = vwp[vwp["cluster"] == i]
    print("Sample:")
    for index, row in clustered.sample(5, random_state=222).iterrows(
    ):  # print 5 sample paragraphs from cluster
        content: str = recover_raw_paragraph(vwpr, row["letter"],
                                             row["offset"])
        print(
            f"[{index}] letter {row['letter']}, paragraph {row['offset']}: {content}, embedded words: {row['chosen_words']}"
        )

    print(f"Paragraphs most similar to mean vector of cluster {i}:")
    similars: pd.DataFrame = most_similar_paragraphs(vwp, means[i])
    for index, row in similars.head(3).iterrows():
        content: str = recover_raw_paragraph(vwpr, row["letter"],
                                             row["offset"])
        in_cluster: bool = "YES" if int(vwp.at[index,
                                               "cluster"]) == i else "NO"
        print(
            f"[{index}] letter {row['letter']}, paragraph {row['offset']} (similarity: {row['similarity']}) (in cluster {i}? {in_cluster}):\n{content}"
        )

print("\n\n")

for i in range(10):  # tests
    test = np.random.choice(vwp.index)
    classification = kmeans.classify(np.array(vwp.at[test, "embedding"]))
    print(f"TEST index: {test}, classification: {classification}")
    print(f"CONTENT: {vwpr.at[test, 'text']}")

# save dataframe after clustering
vwp.to_json(VWP_CLUSTERED, orient="index")
Example #18
	0's are inserted otherwise. 
	@param response The survey response to generate a vector for 
    '''
    response_components = [normalize_word(word) for word in response.split()]    
    return numpy.array([
        word in response_components and not word in stopwords
        for word in words], numpy.short)
 
if __name__ == '__main__':
 
    num_clusters = DEFAULT_NUM_CLUSTERS
    if len(sys.argv) == 2:
        num_clusters = int(sys.argv[1])
 
    with open("reviews.txt") as survey_file:
 
        responses = [line.strip() for line in survey_file.readlines()]
 
        words = get_words(responses)
 
        cluster = KMeansClusterer(num_clusters, euclidean_distance,
                                  repeats=100, avoid_empty_clusters=True)
        cluster.cluster([vectorspaced(response) for response in responses if response])
 
        classified_examples = [
                cluster.classify(vectorspaced(response)) for response in responses
            ]
 
        for cluster_id, title in sorted(zip(classified_examples, responses)):
            print(cluster_id, title)
Example #19

import numpy as np
from nltk.cluster import KMeansClusterer, euclidean_distance

vectors = [np.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

# test k-means using 2 means, euclidean distance, and 10 trial clustering repetitions with random seeds
clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
clusters = clusterer.cluster(vectors, True)
centroids = clusterer.means()
print('Clustered:', vectors)
print('As:', clusters)
print('Means:', centroids)

# classify a new vector
vector = np.array([2,2])
print('classify(%s):' % vector, end=' ')
print(clusterer.classify(vector))


# **Plot a Chart of the Clusters in Example-2**  
# Make a Scatter Plot of the two clusters using matplotlib.pyplot.   
# We plot all the points in cluster-0 blue, and all the points in cluster-1 red. Then we plot the two centroids in orange.  
# I used list comprehensions to create new lists for all the x0, y0, x1 and y1 values.

# In[104]:


import matplotlib.pyplot as plt

x0 = np.array([x[0] for idx, x in enumerate(vectors) if clusters[idx]==0])
y0 = np.array([x[1] for idx, x in enumerate(vectors) if clusters[idx]==0])
plt.scatter(x0,y0, color='blue')
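
# The cell above stops after plotting cluster-0. A minimal sketch completing the
# chart described in the comments (red cluster-1 points, orange centroids),
# assuming the vectors, clusters and centroids variables defined above:
x1 = np.array([x[0] for idx, x in enumerate(vectors) if clusters[idx]==1])
y1 = np.array([x[1] for idx, x in enumerate(vectors) if clusters[idx]==1])
plt.scatter(x1, y1, color='red')
plt.scatter([c[0] for c in centroids], [c[1] for c in centroids], color='orange')
plt.show()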
Example #20
# Step 5: k-means clustering
vectors = [array(f) for f in doc_lda]
clusterer = KMeansClusterer(num_topics,
                            euclidean_distance,
                            repeats=100,
                            avoid_empty_clusters=True)
clusterer.cluster(vectors, True)

apps_per_topic = []
for x in range(num_topics):
    apps_per_topic.append([])

# classify a new vector
apk_names = list(name_desc_pairs.keys())
for i, doc in enumerate(doc_lda):
    topic_id = clusterer.classify(array(doc))
    apps_per_topic[topic_id].append(apk_names[i])

# Step 6: make text for each topic
text_for_topics = []
for x in range(num_topics):
    text_for_topics.append('')

apkname_stem_pairs = dict(zip(name_desc_pairs.keys(), processed))
for topic_id, names in enumerate(apps_per_topic):
    for name in names:
        # FIXME: there are two options for the word cloud: 1) pure descriptions 2) using the stemmed text
        # text_for_topics[topic_id] = text_for_topics[topic_id] + " " + name_desc_pairs[name]
        text = " ".join(apkname_stem_pairs[name])
        text_for_topics[topic_id] = text_for_topics[topic_id] + text
Example #21
import numpy
from nltk.cluster import KMeansClusterer, euclidean_distance

vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
means = [[4, 3], [5, 5]]

clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
clusters = clusterer.cluster(vectors, True, trace=True)

print('Clustered:', vectors)
print('As:', clusters)
print('Means:', clusterer.means())
print()

vectors = [
    numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]
]

# test k-means using the euclidean distance metric, 2 means and repeat
# clustering 10 times with random seeds

clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
clusters = clusterer.cluster(vectors, True)
print('Clustered:', vectors)
print('As:', clusters)
print('Means:', clusterer.means())
print()

# classify a new vector
vector = numpy.array([3, 3])
print('classify(%s):' % vector, end=' ')
print(clusterer.classify(vector))
print()