def grouper(filename):
    stemmer_func = nltk.stem.snowball.EnglishStemmer().stem

    @decorators.memoize
    def normalize_word(word):
        return stemmer_func(word.lower())

    def get_words(titles):
        words = set()
        for title in titles:
            for word in title.split():
                words.add(normalize_word(word))
        return list(words)

    @decorators.memoize
    def vectorspaced(title):
        title_components = [normalize_word(word) for word in title.split()]
        return numpy.array([
            word in title_components
            for word in words], numpy.short)

    with open(filename) as title_file:
        job_titles = [line.decode('utf-8').strip() for line in title_file.readlines()]
        #name = Data(keyword = job_titles)
        #db.session.add(name)
        #db.session.commit()
        words = get_words(job_titles)

        # pick k from the vocabulary size (thresholds now cover the full range;
        # the original skipped sizes between 1000 and 1500)
        if len(words) >= 1500:
            k = 75
        elif len(words) >= 500:
            k = 55
        elif len(words) > 200:
            k = 30
        else:
            k = 15

        cluster = KMeansClusterer(k, euclidean_distance, avoid_empty_clusters=True)
        cluster.cluster([vectorspaced(title) for title in job_titles if title])
        classified_examples = [
            cluster.classify(vectorspaced(title)) for title in job_titles
        ]

        global gen_file
        gen_file = str(uuid.uuid4()) + ".csv"
        f = open("/home/ubuntu/downloads/" + gen_file, 'wb')
        try:
            w = csv.writer(f)
            w.writerow(('Search Terms', 'GroupID'))
            for cluster_id, title in sorted(zip(classified_examples, job_titles)):
                w.writerow((title.encode('utf-8'), cluster_id))
            #print "done"
        finally:
            f.close()

        f1 = open("/home/ubuntu/time/" + gen_file + ".txt", 'wb')
        try:
            t = (time.time() - start_time)
            f1.write(str(t))
        finally:
            f1.close()  # was f.close(), which closed the wrong handle
def get_clusters(txt):
    clusters = {}
    num_clusters = len(txt) / 4
    if num_clusters < 2:
        num_clusters = 2
    if num_clusters > 5:
        num_clusters = 5
    #txt = [''.join([l for l in txt])]
    #print txt
    responses = [line.strip() for line in txt]
    words = get_words(responses)
    cluster = KMeansClusterer(num_clusters, euclidean_distance,
                              repeats=100, avoid_empty_clusters=True)
    cluster.cluster(
        [vectorspaced(response, words) for response in responses if response])
    classified_examples = [
        cluster.classify(vectorspaced(response, words)) for response in responses
    ]
    for cluster_id, title in sorted(zip(classified_examples, responses)):
        if cluster_id not in clusters:
            clusters[cluster_id] = [title]
        else:
            clusters[cluster_id].append(title)
    return clusters
def demo():
    # example from figure 14.9, page 517, Manning and Schutze
    from nltk.cluster import KMeansClusterer, euclidean_distance

    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds
    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    # classify a new vector
    vector = numpy.array([3, 3])
    print('classify(%s):' % vector, end=' ')
    print(clusterer.classify(vector))
    print()
def my_demo_main(file_list_name, tokenizer_num=0):
    from mmseg import seg_txt
    from nltk.cluster import KMeansClusterer, euclidean_distance
    from nltk.cluster import GAAClusterer

    tokenizer_list = [seg_txt, ]
    file_list = open(file_list_name)
    tokenizer = tokenizer_list[tokenizer_num]
    texts = [[term for term in tokenizer(open('pos/' + str(file_name.strip())).read())]
             for file_name in file_list]
    data = TF_IDF(texts)
    vectors = []
    file_count = 1
    feature_set = set()

    # keep the top 15% of terms per document (by tf-idf) as the feature set
    for text in data.texts:
        vector = list()
        for term in set(text):
            vector.append((data.tf_idf(term, text), term))
        vector.sort(key=lambda x: x[0], reverse=True)
        for term in vector[:int(len(vector) * 0.15) + 1]:
            feature_set.add(term[1])
    print feature_set
    print len(feature_set)

    # build a length-normalized tf-idf vector per document over the feature set
    for text in data.texts:
        vector = list()
        for term in feature_set:
            if term in text:
                vector.append(data.tf_idf(term, text))
            else:
                vector.append(0)
        square_sum = map(lambda x: x * x, vector)
        square_sum = math.sqrt(sum(square_sum))
        vector = map(lambda x: x / square_sum, vector)
        vectors += [numpy.array(vector)]
        print file_count
        file_count += 1

    means = find_max_density(vectors, euclidean_distance)
    print 'means', len(means)

    f = open('result.txt', 'w')
    clusterer = KMeansClusterer(len(means), euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, True)
    print 'km1', clusters
    f.write('km1: ' + str(clusters) + '\n')

    clusterer = KMeansClusterer(len(vectors) / 10, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True, True)
    print 'km2', clusters
    f.write('km2: ' + str(clusters) + '\n')

    clusterer = GAAClusterer(len(vectors) / 10)
    clusters = clusterer.cluster(vectors, True)
    print 'gaac', clusters
    f.write('gaac: ' + str(clusters) + '\n')
    f.close()
def get_word_clusters(tweets):
    ListTweets = get_all_text(tweets)
    ListTweets = list(ListTweets)

    # Project tweet text onto a vector space
    vs_tweets = list(TweetVectors(tweets))

    cluster = KMeansClusterer(10, euclidean_distance, avoid_empty_clusters=True)
    cluster.cluster(vs_tweets)
    classified_examples = [cluster.classify(tweet) for tweet in vs_tweets]

    for cluster_id, tweet in sorted(zip(classified_examples, ListTweets)):
        print cluster_id, tweet
class KMeansTopics(BaseEstimator, TransformerMixin):
    def __init__(self, k=10):
        """
        :param k: number of clusters (int)
        """
        cosine = nltk.cluster.util.cosine_distance
        self.model = KMeansClusterer(
            k, distance=cosine, avoid_empty_clusters=True)

    def fit(self, sents):
        return self

    def transform(self, sents):
        # return the cluster assignment for each sentence vector
        # (the original called cluster() but returned nothing)
        return self.model.cluster(sents, assign_clusters=True)
def main():
    tracknames = get_tracknames()
    #title_file = open("example_jobs.txt", 'r')
    #job_titles = [line.strip() for line in title_file.readlines()]
    words = get_words(tracknames)
    cluster = KMeansClusterer(20, euclidean_distance, avoid_empty_clusters=True)
    cluster.cluster([vectorspaced(trackname, words)
                     for trackname in tracknames if trackname])
    classified_examples = [cluster.classify(vectorspaced(trackname, words))
                           for trackname in tracknames]
    for cluster_id, title in sorted(zip(classified_examples, tracknames)):
        print cluster_id, title
def clusterize(self, noClusters, noNounsToKeep, **kwargs):
    """ """
    root = getUtility(ISiteRoot)
    catalog = getToolByName(root, 'portal_catalog')
    nounTermsIndex = catalog._catalog.getIndex('noun_terms')
    uidTermsIndex = catalog._catalog.getIndex('UID')
    nounTermsIndexIds = []
    allNouns = set()
    docnouns = []
    vectors = []
    for key in nounTermsIndex._unindex.keys():
        importantNouns = nounTermsIndex._unindex[key][:noNounsToKeep]
        if importantNouns:
            nounTermsIndexIds.append(key)
            docnouns.append(importantNouns)
            allNouns = allNouns.union(importantNouns)
    for nouns in docnouns:
        vector = [(noun in nouns and 1 or 0) for noun in allNouns]
        vectors.append(numpy.array(vector))
    clusterer = KMeansClusterer(noClusters, pearson, **kwargs)
    clusters = clusterer.cluster(vectors, True)
    result = {}
    for i in range(noClusters):
        result[i] = []
    for i in range(len(nounTermsIndexIds)):
        docid = nounTermsIndexIds[i]
        uid = uidTermsIndex._unindex[docid]
        result[clusters[i]] = result[clusters[i]] + [uid]
    return result
def cluster(self, vectors):
    if self.clustering_params['method'] == "KMeans_NLTK":
        kmeans = KMeansClusterer(num_means=20,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25,
                                 avoid_empty_clusters=True)
        self.dataset['cluster'] = \
            kmeans.cluster(vectors, assign_clusters=True)
    elif self.clustering_params['method'] == "KMeans":
        kmeans = KMeans(n_clusters=eval(self.clustering_params['n_clusters']))
        kmeans.fit(vectors)
        clusters = kmeans.cluster_centers_
        self.dataset['cluster'] = kmeans.predict(self.vectors)

    print(self.dataset[['cluster', 'id', 'text']])
    self.clustered_filename = f'{self.disaster_name}_{self.clustering_params["method"]}' + \
        f'_{self.clustering_params["n_clusters"]}'.replace(" ", "_")
    current_time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    self.dataset.to_csv(
        f"{self.disaster_dir}/kmeans/{self.clustering_params['method']}_{self.clustering_params['n_clusters']}_{current_time}.csv",
        index=False)
    filename = f"{self.disaster_dir}/kmeans/{self.clustering_params['method']}_{self.clustering_params['n_clusters']}_{current_time}"
    # the with-blocks close the files; the explicit close() calls were redundant
    with open(filename + '.pkl', 'wb') as file:
        pickle.dump(kmeans, file)
    with open(filename + '.vec', 'wb') as file:
        pickle.dump(self.vectors, file)
    return self.dataset, filename + '.pkl'
def test_knn_result():
    model = gensim.models.Word2Vec.load(
        '/Users/holly/Desktop/毕设/Data/(旧)PlatformsComments/p2p.word2vec.model'
    )
    X = model[model.wv.vocab]

    from nltk.cluster import KMeansClusterer
    import nltk
    NUM_CLUSTERS = 5
    kclusterer = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

    words = list(model.wv.vocab)
    cluster_dict = {0: [], 1: [], 2: [], 3: [], 4: []}
    for i, word in enumerate(words):
        cluster_dict[assigned_clusters[i]].append(word)

    # write one file per cluster; the with-block closes each file
    for j in range(5):
        with open(
                os.path.join(
                    '/Users/holly/Desktop/毕设/Data/(旧)PlatformsComments/result',
                    str(j) + '.txt'), 'w') as f:
            for word in cluster_dict[j]:
                f.write("%s\n" % word)
def clustering(self, modelpath, hashtag_cluster_path, num_clusters):
    # load the word2vec model
    model = Word2Vec.load(modelpath)
    X = model.wv.vectors

    # clustering
    num_clusters = int(num_clusters)
    kclusterer = KMeansClusterer(num_clusters,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

    # distribute hashtags into their respective clusters
    # (setdefault replaces the original bare try/except)
    words = list(model.wv.vocab)
    cluster_distribution = {}
    for i, word in enumerate(words):
        cluster_distribution.setdefault(str(assigned_clusters[i]), []).append(word)

    # save the cluster distribution
    with open(hashtag_cluster_path, "w") as write_file:
        json.dump(cluster_distribution, write_file)
    print('saved hashtag cluster.')
def cluster(self, embedding, NUM_CLUSTERS):
    kclusterer = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
    assigned_clusters = kclusterer.cluster(embedding, assign_clusters=True)
    return assigned_clusters
class PartionalNltk():
    def __init__(self):
        self.clf = KMeansClusterer(2, cosine_distance, repeats=30,
                                   avoid_empty_clusters=True)

    def cluster(self, data):
        clusters = self.clf.cluster(data.toarray(), True)
        return np.array(clusters)

    def f_score(self, cluster, f_score_dict):
        for cl in f_score_dict:
            docs = np.array(f_score_dict[cl]['docs'])
            nri = np.intersect1d(cluster, docs).shape[0]
            nr = docs.shape[0]
            ni = cluster.shape[0]
            #print nri, nr, ni
            try:
                recall = float(nri) / float(nr)
                precision = float(nri) / float(ni)
                f_score = (2 * precision * recall) / (precision + recall)
                # keep the best f-score seen so far for this class
                f_score_dict[cl]['fscore'] = f_score if (
                    f_score > f_score_dict[cl]['fscore']
                    or not f_score_dict[cl]['fscore']) else f_score_dict[cl]['fscore']
            except ZeroDivisionError, e:
                #print e
                pass
        return f_score_dict
def clustering(data, cluster, n_classes):
    print('\n------------------GMM\n')
    assigned_clusters = mixture.GaussianMixture(
        n_components=n_classes, covariance_type='tied').fit_predict(data)
    print('Mutual_info_score =',
          mutual_info_score(cluster - 1, assigned_clusters))
    print('Adjusted_mutual_info_score =',
          adjusted_mutual_info_score(cluster - 1, assigned_clusters,
                                     average_method='min'))
    print('Adjusted_rand_score =',
          adjusted_rand_score(cluster - 1, assigned_clusters))

    print('\nK_MEANS')
    kclusterer = KMeansClusterer(num_means=n_classes,
                                 distance=nltk.cluster.util.cosine_distance)
    assigned_clusters = kclusterer.cluster(data, assign_clusters=True)
    print('Mutual_info_score =',
          mutual_info_score(cluster - 1, assigned_clusters))
    print('Adjusted_mutual_info_score =',
          adjusted_mutual_info_score(cluster - 1, assigned_clusters,
                                     average_method='min'))
    print('Adjusted_rand_score =',
          adjusted_rand_score(cluster - 1, assigned_clusters))
class KMeansClusters(BaseEstimator, TransformerMixin):
    """
    Cluster text data using k-means. Makes use of nltk k-means clustering.
    Allows for alternative distance measures.
    """

    def __init__(self, k=7):
        self.k = k
        self.distance = nltk.cluster.util.cosine_distance
        self.model = KMeansClusterer(self.k, self.distance,
                                     avoid_empty_clusters=True)

    def fit(self, documents, labels=None):
        return self

    def transform(self, documents):
        """
        Fits the K-means model to the given documents.

        Parameters
        ----------
        documents : vectorized documents, one numeric vector per document.

        Returns
        -------
        array of cluster assignments, one per document.
        """
        return np.array(self.model.cluster(documents, assign_clusters=True))
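# A minimal usage sketch for the KMeansClusters transformer above, assuming the
# documents have already been vectorized upstream. The tiny one-hot rows below
# are hypothetical, for illustration only.
import numpy as np

docs = np.array([
    [1, 0, 1, 0],
    [1, 0, 1, 1],
    [0, 1, 0, 0],
], dtype=np.short)

km = KMeansClusters(k=2)
assignments = km.fit(docs).transform(docs)
print(assignments)  # three cluster labels in {0, 1}; the grouping varies run to run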
def word2vec_cluster(in_file, out_file):
    sentences = []
    # read the corpus (handles renamed so they no longer shadow the parameters)
    with codecs.open(in_file, 'r', encoding='utf-8', errors='ignore') as fin:
        corpus = fin.readlines()
    for line in corpus:
        line = line.strip('\n')
        if not line:
            continue
        line = line.lower()
        line = line.split(" ")
        sentences.append(line)

    print("training model...")
    model = Word2Vec(sentences, min_count=2)

    print("get vector data...")
    X = model[model.wv.vocab]

    NUM_CLUSTERS = 50
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 distance=nltk.cluster.util.cosine_distance,
                                 avoid_empty_clusters=True, repeats=30)
    print("assigning cluster..")
    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

    words = list(model.wv.vocab)
    with open(out_file, 'a') as fout:
        for i, word in enumerate(words):
            fout.write(word + ":" + str(assigned_clusters[i]) + '\n')
def clusterize(self, noClusters, noNouranksToKeep, **kwargs):
    """ """
    storage = getUtility(INounPhraseStorage)
    docids = storage.rankedNouns.keys()
    docnouns = []
    allNouns = set()
    vectors = []
    for key in docids:
        importantNouns = storage.getNounTerms(key, noNouranksToKeep)
        docnouns.append(importantNouns)
        allNouns = allNouns.union(importantNouns)
    for nouns in docnouns:
        vector = [(noun in nouns and 1 or 0) for noun in allNouns]
        vectors.append(numpy.array(vector))
    clusterer = KMeansClusterer(noClusters, pearson, **kwargs)
    clusters = clusterer.cluster(vectors, True)
    result = {}
    for i in range(noClusters):
        result[i] = []
    for index, docid in enumerate(docids):
        result[clusters[index]] = result[clusters[index]] + [docid]
    return result
def main():
    NUM_CLUSTERS = 3
    model = Word2Vec.load("test_word2vec_1.model")
    model_data = model[
        model.wv.vocab]  # convert the word2vec model into data for k-means clustering

    # Check that the clustering works ---------------------------------------------
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(model_data, assign_clusters=True)
    print(assigned_clusters)
    words = list(model.wv.vocab)
    for i, word in enumerate(words):
        print(word + ":" + str(assigned_clusters[i]))
    # ------------------------------------------------------------------------------

    # model_data -> get vector data
    # clustering -------------------------------------------------------------------
    kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
    kmeans.fit(model_data)  # clustering
    labels = kmeans.labels_  # label values for each data point
    centroids = kmeans.cluster_centers_  # coordinates of each cluster's centroid

    # Check the results ------------------------------------------------------------
    print("Cluster id labels for inputted data")
    print(labels)
    print("Centroids data")
    print(centroids)
def __kmeans(self, aspect, vectors, id_sentences, k=50):
    ''' Cluster sentences using the K-Means Algorithm '''
    k = min(k, len(vectors))
    vectors = [array(v) for v in vectors]
    means = vectors[:k]
    clusterer = KMeansClusterer(k, euclidean_distance, initial_means=means,
                                avoid_empty_clusters=True)
    with utils.Capturing() as output:
        clusters = clusterer.cluster(vectors, True)
    for id_cluster in range(k):
        self.__clusters[aspect][id_cluster] = {
            'importance': 0,
            'sentences': [],
            'representative_words': [],
            'max_sentence': None
        }
    for index, id_cluster in enumerate(clusters):
        self.__clusters[aspect][id_cluster]['sentences'].append(
            id_sentences[index])
    for id_cluster in range(k):
        # Delete empty clusters
        if len(self.__clusters[aspect][id_cluster]['sentences']) == 0:
            self.__clusters[aspect].pop(id_cluster)
    self.__search_representative_words(aspect)
def cluster(abstracts, mode, metric, debug=False, repeats=10):
    '''
    K-means clustering with evaluation metrics, using a custom distance
    function and the provided abstracts.
    '''
    labels = []
    vectors = []

    # create vectors and labels; k will be the number of ground-truth labels
    construct(abstracts, vectors, mode)
    k = label(abstracts, labels)

    # cluster
    clusterer = KMeansClusterer(k, metric, repeats=repeats, normalise=True,
                                avoid_empty_clusters=True)
    clusters = clusterer.cluster(vectors, assign_clusters=True, trace=debug)
    means = clusterer.means()

    print
    print "EVALUATION:"

    # compute evaluation metrics
    dist = sumdistance(vectors, clusters, means)
    pure = purity(clusters, labels, k)
    entr = entropy(clusters, labels, k)
    f, rand = f1(clusters, labels, k)

    print "Sum of distances: %f" % dist
    print "Purity: %f" % pure
    print "Entropy: %f" % entr
    print "Rand index: %f" % rand
    print "F1 measure: %f" % f
def evaluate_clustering(self):
    data = pd.read_csv(self.clustering_data, names=['Word', 'Cat'])
    full = pd.concat([data,
                      pd.DataFrame({'v' + str(i): [self.model.wv[word][i]
                                                   for word in data['Word']]
                                    for i in range(self.model.vector_size)})],
                     axis=1)
    ncats = len(set(full['Cat']))

    kclusterer = KMeansClusterer(ncats,
                                 distance=nltk.cluster.util.euclidean_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(
        np.array(full[['v' + str(i) for i in range(self.model.vector_size)]]),
        assign_clusters=True)
    data['Euclidean'] = assigned_clusters

    kclusterer = KMeansClusterer(ncats,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(
        np.array(full[['v' + str(i) for i in range(self.model.vector_size)]]),
        assign_clusters=True)
    data['Cosine'] = assigned_clusters

    self.clustering_evaluation_dataset = data
    self.summary['Clustering Purity (Euclidean)'] = self.purity('Euclidean', ncats)
    self.summary['Clustering Purity (Cosine)'] = self.purity('Cosine', ncats)
    self.summary['Clustering Entropy (Euclidean)'] = self.entropy('Euclidean')
    self.summary['Clustering Entropy (Cosine)'] = self.entropy('Cosine')
    self.summary['Clustering Rand Index (Euclidean)'] = self.rand_index('Euclidean')
    self.summary['Clustering Rand Index (Cosine)'] = self.rand_index('Cosine')
def clusterize(data, repeats=50):
    clusterer = KMeansClusterer(5, iou_dist_function, repeats=repeats,
                                avoid_empty_clusters=True)
    clusters = clusterer.cluster(data, True)  # was `vectors`, which is undefined here
    #print(clusters)
    anchors = clusterer.means()
    return anchors
def kmeans_test(model, documents):
    count = len(documents)
    vectors = []
    print("done")
    kclusterer = KMeansClusterer(20,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(model.docvecs, assign_clusters=True)
def cluster_kmean(train_file, test_file):
    """Load train and test data into data frames."""
    f_train = open(train_file, encoding="utf-8")
    train_data = json.load(f_train)
    df_train = pd.DataFrame(train_data, columns=['text'])
    f_train.close()

    f_test = open(test_file, encoding='utf-8')
    test_data = json.load(f_test)
    df_test = pd.DataFrame(test_data, columns=['text', 'labels'])
    f_test.close()

    labels = df_test.labels
    labels = list(set(sum(labels, [])))[:3]

    # Initialize the TF-IDF vectorizer
    tfidf_vect = TfidfVectorizer(stop_words="english", min_df=5)
    dtm = tfidf_vect.fit_transform(df_train['text'])

    num_clusters = 3

    # Initialize clustering
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=20)
    clusters = clusterer.cluster(dtm.toarray(), assign_clusters=True)

    centroids = np.array(clusterer.means())
    sorted_centroids = centroids.argsort()[:, ::-1]
    voc_lookup = tfidf_vect.get_feature_names()

    test_dtm = tfidf_vect.transform(df_test.text)
    predicted = [clusterer.classify(v) for v in test_dtm.toarray()]

    df_test['label_test'] = df_test['labels'].apply(lambda x: x[0])
    confusion_df = pd.DataFrame(
        list(zip(df_test["label_test"].values, predicted)),
        columns=["actual_class", "cluster"])
    df_result = pd.crosstab(index=confusion_df.cluster,
                            columns=confusion_df.actual_class)
    print(df_result)

    # map each cluster to its majority ground-truth topic
    df_clusterLabelsPredicted = list(df_result.apply(lambda x: x.idxmax(), axis=1))
    cluster_dict = dict((i, j) for i, j in enumerate(df_clusterLabelsPredicted))
    predicted_target = [cluster_dict[i] for i in predicted]
    print(metrics.classification_report(df_test["label_test"], predicted_target))
    for i in cluster_dict:
        print("Cluster %d : Topic %s" % (i, cluster_dict[i]))
def KmeansClustering(trainX, numberOfClusters, numberOfRepeats):
    # init cluster with trainX
    # example taken from https://www.nltk.org/_modules/nltk/cluster/kmeans.html#demo
    clusterer = KMeansClusterer(numberOfClusters, cosine_distance,
                                initial_means=None, repeats=numberOfRepeats)
    assigned_clusters = clusterer.cluster(trainX, assign_clusters=True)
    return clusterer, assigned_clusters
class KMeansTopics(object):

    def __init__(self, corpus, k=10):
        """
        corpus is a corpus object, e.g. an HTMLCorpusReader()
        or an HTMLPickledCorpusReader() object

        k is the number of clusters
        """
        self.k = k
        self.model = None
        self.vocab = list(
            set(normalize(corpus.words(categories=['news'])))
        )

    def vectorize(self, document):
        """
        Vectorizes a document consisting of a list of part of speech
        tagged tokens using the segmentation and tokenization methods.

        One-hot encode the set of documents
        """
        features = set(normalize(document))
        return np.array([
            token in features for token in self.vocab], np.short)

    def cluster(self, corpus):
        """
        Fits the K-Means model to the given data.
        """
        cosine = nltk.cluster.util.cosine_distance
        self.model = KMeansClusterer(
            self.k, distance=cosine, avoid_empty_clusters=True)
        self.model.cluster([
            self.vectorize(corpus.words(fileid))
            for fileid in corpus.fileids(categories=['news'])
        ])

    def classify(self, document):
        """
        Pass through to the internal model classify
        """
        return self.model.classify(self.vectorize(document))
def cluster_kmean(train_file, test_file):
    data = pd.read_json(train_file, orient='columns')
    data.columns = ["text"]

    tfidf_vect = TfidfVectorizer(min_df=5, stop_words='english')
    dtm = tfidf_vect.fit_transform(data["text"])

    num_clusters = 3
    clusterer = KMeansClusterer(num_clusters, cosine_distance, repeats=5)
    clusters = clusterer.cluster(dtm.toarray(), assign_clusters=True)

    test = pd.read_json(test_file, orient='columns')
    test.columns = ["text", "label"]

    # convert a dataframe with multiple targets to the first target
    x = test["label"]
    truth = []
    for item in x:
        truth.append(item[0])
    test["label"] = truth

    test_dtm = tfidf_vect.transform(test["text"])
    predicted = [clusterer.classify(v) for v in test_dtm.toarray()]
    confusion_df = pd.DataFrame(list(zip(test["label"].values, predicted)),
                                columns=["label", "cluster"])
    crosstab = pd.crosstab(index=confusion_df.cluster,
                           columns=confusion_df.label)
    print("using cosine: ")
    print(crosstab)
    dfmax = crosstab.idxmax(axis=1)
    print(dfmax)
    cluster_dict = {0: dfmax[0], 1: dfmax[1], 2: dfmax[2]}
    predicted_target = [cluster_dict[i] for i in predicted]
    print(metrics.classification_report(test["label"], predicted_target))

    # KMeans with 20 different centroid seeds
    num_clusters = 3
    km = KMeans(n_clusters=num_clusters, n_init=20).fit(dtm)
    clusters = km.labels_.tolist()
    predicted2 = km.predict(test_dtm)
    confusion_df2 = pd.DataFrame(list(zip(test["label"].values, predicted2)),
                                 columns=["label", "cluster"])
    crosstab2 = pd.crosstab(index=confusion_df2.cluster,
                            columns=confusion_df2.label)
    print("using Euclidean distance")
    print(crosstab2)
    dfmax = crosstab2.idxmax(axis=1)
    print(dfmax)
    cluster_dict = {0: dfmax[0], 1: dfmax[1], 2: dfmax[2]}
    predicted_target2 = [cluster_dict[i] for i in predicted2]
    print(metrics.classification_report(test["label"], predicted_target2))
    return None
def get_clusters(self, vectors):
    vectors = [numpy.array(v) for v in vectors]
    init_means = [copy(vectors[i]) for i in range(self.num_clusters)]
    clusterer = KMeansClusterer(self.num_clusters, euclidean_distance,
                                initial_means=init_means,
                                avoid_empty_clusters=True)
    clusters = clusterer.cluster(vectors, True)
    return clusters
def clustering_on_wordvecs(word_vectors, num_clusters):
    # Initialize a k-means object and use it to extract centroids
    #kmeans_clustering = KMeans(n_clusters=num_clusters, init='k-means++')
    #idx = kmeans_clustering.fit_predict(word_vectors)
    kclusterer = KMeansClusterer(num_clusters,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    # cluster the vectors passed in (was `X`, which is undefined here)
    assigned_clusters = kclusterer.cluster(word_vectors, assign_clusters=True)
    return assigned_clusters
def clustering_question(sents, sents_word2vec, NUM_CLUSTERS=15):
    kclusterer = KMeansClusterer(
        NUM_CLUSTERS,
        distance=nltk.cluster.util.cosine_distance,
        repeats=25,
        avoid_empty_clusters=True)
    assigned_clusters = kclusterer.cluster(sents_word2vec, assign_clusters=True)

    data = pd.DataFrame([], columns=['text', 'cluster', 'centroid'])
    data.loc[:, 'text'] = sents
    data.loc[:, 'cluster'] = pd.Series(assigned_clusters, index=data.index)
    data.loc[:, 'centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])

    return data, assigned_clusters
def kgen(self, num):
    num = int(num)
    clusterer = KMeansClusterer(num, distance=cosine_distance, repeats=20)
    vecs = self.model.wv[self.model.wv.vocab]
    assignments = clusterer.cluster(vecs, assign_clusters=True)
    self.vocab_to_cluster_map = dict(zip(self.model.wv.vocab, assignments))
    self.clusters = dict()
    for word, index in self.vocab_to_cluster_map.items():
        if index in self.clusters:
            # was `+= word`, which extends the list with the word's characters
            self.clusters[index].append(word)
        else:
            self.clusters[index] = [word]
def cosine_cluster(num_clusters, matrix):
    print("Running k-means using cosine distance...\n")
    matrix = np.asanyarray(matrix)
    k_means = KMeansClusterer(num_clusters, cosine_distance,
                              avoid_empty_clusters=True)
    clusters = k_means.cluster(matrix, assign_clusters=True, trace=False)
    print("Successfully found %d clusters in %d dimensions \n"
          % (num_clusters, matrix.shape[1]))
    return clusters
def get_kmeans_predicted_clusters(word_representions, Num_clusters):
    # from dictionary type to transposed dataframe
    Y = pd.DataFrame(data=word_representions).T
    X = Y.values
    # Cluster the data with NLTK's k-means (the original comment said sklearn)
    kclusterer = KMeansClusterer(Num_clusters,
                                 distance=nltk.cluster.util.euclidean_distance,
                                 repeats=25,
                                 avoid_empty_clusters=False)
    predicted_clusters = kclusterer.cluster(X, assign_clusters=True)
    return predicted_clusters
def demo_1():
    urls = [
        "www.ai-one.com",
        "http://en.wikipedia.org/wiki/Albert_Einstein",
        "http://en.wikipedia.org/wiki/USA",
        "http://en.wikipedia.org/wiki/Microsoft"
    ]
    keywords = [get_keywords(url) for url in urls]
    all_words = set(chain(*keywords))
    vectors = [vector_from_keywords(kw, all_words) for kw in keywords]
    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
def k_means_experiment(sample,
                       distance: ['euclidean_distance', 'cosine_distance'] = 'cosine_distance',
                       min_k=2, max_k=50):
    score = []
    silhouette = []
    k_assigment_cluster = []
    if distance == 'euclidean_distance':
        for k in range(min_k, max_k):
            kclusterer = KMeansClusterer(num_means=k,
                                         distance=nltk.cluster.util.euclidean_distance,
                                         avoid_empty_clusters=True, repeats=1)
            assigned_clusters = kclusterer.cluster(sample, assign_clusters=True)
            # kmeans = KMeans(n_clusters=k,random_state=5,n_jobs=-1,n_init=20,max_iter=500).fit(tickets_vec)
            # labels = np.array(kmeans.labels_)
            silhouette.append(silhouette_score(X=sample, labels=np.array(assigned_clusters)))
            score.append(davies_bouldin_score(sample, assigned_clusters))
            k_assigment_cluster.append(assigned_clusters)
    else:
        for k in range(min_k, max_k):
            #kmeans = KMeans(n_clusters=k,random_state=5,n_jobs=-1,n_init=20,max_iter=500).fit(tickets_vec)
            #assigned_clusters = kmeans.labels_
            kclusterer = KMeansClusterer(num_means=k,
                                         distance=nltk.cluster.util.cosine_distance,
                                         avoid_empty_clusters=True, repeats=1)
            assigned_clusters = kclusterer.cluster(sample, assign_clusters=True)
            silhouette.append(silhouette_score(X=sample, labels=np.array(assigned_clusters)))
            score.append(davies_bouldin_score(sample, assigned_clusters))
            k_assigment_cluster.append(assigned_clusters)

    plt.plot(np.arange(min_k, max_k), np.array(score), label='Davies-Bouldin score')
    plt.plot(np.arange(min_k, max_k), np.array(silhouette), label='silhouette score')
    plt.xlabel('number of clusters')
    plt.ylabel('score')
    plt.title('K-means Cluster Scoring')
    plt.legend()
    plt.show()
    return kclusterer, k_assigment_cluster
def nltk_kmeans(word_vectors, k):
    from nltk.cluster import KMeansClusterer
    import nltk

    #word_vectors.init_sims()
    norm_vectors = word_vectors.syn0
    kmeans = KMeansClusterer(k, nltk.cluster.util.cosine_distance, repeats=25)
    assigned_clusters = kmeans.cluster(norm_vectors, assign_clusters=True)

    clusters = defaultdict(list)
    for idx in range(0, len(word_vectors.index2word)):
        clusters[assigned_clusters[idx]].append(word_vectors.index2word[idx])
    return (clusters, kmeans)
def train(X, y, train_ratio):
    from sklearn.cluster import KMeans
    from sklearn.linear_model import LogisticRegression
    from sklearn import svm
    from sklearn.metrics import precision_score, recall_score, f1_score
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    test_ratio = 1 - train_ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=36)

    """ Classification """
    # clf = LogisticRegression(C=1000.0, random_state=0).fit(X_train, y_train)
    # clf = svm.SVC(kernel='linear', C=1e30).fit(X_train, y_train)
    # y_pred = clf.predict(X_test)
    # print(y_test)
    # print(y_pred)
    # print("accuracy: %.2f" %accuracy_score(y_test, y_pred))
    # print("Precision : %.3f" % precision_score(y_test, y_pred))
    # print("Recall : %.3f" % recall_score(y_test, y_pred))
    # print("F1-micro : %.3f" % f1_score(y_test, y_pred, average='micro'))
    # print("F1-macro : %.3f" % f1_score(y_test, y_pred, average='macro'))
    # f1_micro = f1_score(y_test, y_pred, average='micro')
    # f1_macro = f1_score(y_test, y_pred, average='macro')
    # print("F1-macro")
    # print(f1_macro)
    # print("F1-micro")
    # print(f1_micro)

    """ Clustering """
    from sklearn.metrics.cluster import normalized_mutual_info_score
    from nltk.cluster import KMeansClusterer
    import nltk
    NUM_CLUSTERS = 8
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=100, normalise=True,
                                 avoid_empty_clusters=True)
    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
    nmi = normalized_mutual_info_score(assigned_clusters, y)
    return nmi
def cluster(self, k=5, repeats=1):
    '''
    Cluster documents into k clusters using the NLTK implementation
    of K-Means clustering. The frequency of each unique word across
    an article serves as its feature vector.
    '''
    article_freq_count = {}  # frequency of each unique word in a given article
    for article in self.testing_articles:
        article_freq_count[article.id] = []
        for unique_word in self.keywords:
            # count frequency of word in article, add to frequency list
            article_freq_count[article.id].append(article.content.count(unique_word))

    # nltk k-means requires numpy array-like objects
    vectors = [array(article_freq_count[article]) for article in article_freq_count]
    clusterer = KMeansClusterer(k, cosine_distance, repeats=repeats)
    clusters = clusterer.cluster(vectors, True, trace=False)
    groups = [[] for _ in xrange(k)]

    # vector positions need to be converted back to article IDs,
    # because IDs are stripped during vector construction.
    vector_ids = {}  # maps positions in the vector to article IDs
    f = article_freq_count.copy()
    for pos in xrange(len(vectors)):
        for id in f.keys():
            # equivalent to 'if article_freq_count[id] == vectors[pos]',
            # but numpy equivalence checking is weird
            t = article_freq_count[id] == vectors[pos]
            if not False in t:
                vector_ids[pos] = id
                f.pop(id)
    for i in xrange(len(clusters)):
        groups[clusters[i]].append(vector_ids[i])
    return groups
######################################
# Cluster a BOW vector in 4 clusters #
#                                    #
# Requirements: clusterVectors       #
# Usage : %loadpy cluster.py         #
######################################
import nltk
from nltk import cluster
from nltk.cluster import cosine_distance
from nltk.cluster import KMeansClusterer

numClusters = 4

print "KMeans Clustering with %d means and using cosine distance" % numClusters
clusterer = KMeansClusterer(numClusters, cosine_distance)
clusters = clusterer.cluster(clusterVectors, assign_clusters=True, trace=False)
means = clusterer.means()
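# The script above expects a `clusterVectors` variable to already exist in the
# session (hence the %loadpy usage note). A minimal sketch of one way such a
# bag-of-words matrix might be prepared; the documents and vocabulary here are
# hypothetical, for illustration only.
import numpy as np

docs = ["the cat sat", "the dog sat", "stocks fell today", "stocks rose today"]
vocab = sorted({w for d in docs for w in d.split()})

# one row of word counts per document
clusterVectors = [np.array([d.split().count(w) for w in vocab]) for d in docs]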
exit()

with open(filename) as title_file:
    print "Reading Files"
    job_titles = [unicode(line.strip(), "utf-8") for line in title_file.readlines()]

    print "Parsing Words"
    words = get_words(job_titles)

    print "Creating Cluster Instance"
    cluster = KMeansClusterer(10, euclidean_distance, 5)
    # Alternative Clusterer - Less accurate for my use
    #cluster = GAAClusterer(20)

    print "Clustering"
    cluster.cluster([vectorspaced(title) for title in job_titles if title])

    # NOTE: This is inefficient, cluster.classify should really just be
    # called when you are classifying previously unseen examples!
    print "Classifying"
    classified_examples = [
        cluster.classify(vectorspaced(title)) for title in job_titles
    ]

    print "Saving results"
    for cluster_id, title in sorted(zip(classified_examples, job_titles)):
        filename = "results/" + str(cluster_id) + ".txt"
        out = codecs.open(filename, "a", "utf-8")  # renamed from `list`, which shadowed the builtin
        out.write(title + "\n")
        out.close()  # the original leaked one handle per title
# ### k-Means Clustering

# [Clustering](http://www.nltk.org/api/nltk.cluster.html) groups similar items together.

# The K-means clusterer starts with k arbitrarily chosen means (or centroids), then
# assigns each vector to the cluster with the closest mean. It then recalculates the
# means of each cluster as the centroid of its vector members. This process repeats
# until the cluster memberships stabilize.
# [NLTK docs on this example](https://www.nltk.org/_modules/nltk/cluster/kmeans.html)

# This example clusters int vectors, which you can think of as points on a plane.
# But you could also use clustering to group similar documents by vocabulary/topic.

# In[80]:

import numpy as np
from nltk.cluster import KMeansClusterer, euclidean_distance

vectors = [np.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
means = [[4, 3], [5, 5]]

clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
clusters = clusterer.cluster(vectors, True, trace=True)

print('Clustered:', vectors)
print('As:', clusters)
print('Means:', clusterer.means())

# **k-Means Clustering, Example-2**

# In this example we cluster an array of 6 points into 2 clusters.
# The initial centroids are randomly chosen by the clusterer, and it repeats the
# clustering 10 times with random seeds, keeping the best grouping it finds.

# In[103]:

vectors = [np.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
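# As in the NLTK k-means demo, cluster with 2 means, repeating 10 times with random seeds:
clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
clusters = clusterer.cluster(vectors, True)
print('Clustered:', vectors)
print('As:', clusters)
print('Means:', clusterer.means())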
def get_cluster(k=K):
    cluster = KMeansClusterer(k, euclidean_distance)
    cluster.cluster([vectorspaced(corpus.words(fileid))
                     for fileid in corpus.fileids()])
    return cluster
    return freq_lore.items()

num_clusters = 20
vec_len = len(common_words)
vector_words = common_words[:vec_len]
word_freqs = [get_word_freq(text) for [text, a, b] in annotated]

# build one flat frequency vector over vector_words for every champion
tmp_vector = []
for champ_freq in word_freqs:
    for word in vector_words:
        appendable = None
        for (aword, afreq) in champ_freq:
            if word == aword:
                appendable = afreq
        if appendable is None:
            tmp_vector.append(0)
        else:
            tmp_vector.append(appendable)

# slice the flat vector back into one row per champion
vector_list = [tmp_vector[i:i + vec_len] for i in range(0, len(tmp_vector), vec_len)]
word_array = numpy.array(vector_list)

clusterer = KMeansClusterer(num_clusters, euclidean_distance, repeats=10)
clusters = clusterer.cluster(word_array, True)

enum_clusters = list(enumerate(clusters))
enum_clusters.sort(key=lambda x: x[1])
clustered_champs = [(annotated[index][0], clus_num)
                    for (index, clus_num) in enum_clusters]
print('clustered_champs', clustered_champs)
    0's are inserted otherwise.

    @param response The survey response to generate a vector for
    '''
    response_components = [normalize_word(word) for word in response.split()]
    return numpy.array([
        word in response_components and not word in stopwords
        for word in words], numpy.short)


if __name__ == '__main__':
    num_clusters = DEFAULT_NUM_CLUSTERS
    if len(sys.argv) == 2:
        num_clusters = int(sys.argv[1])

    with open("reviews.txt") as survey_file:
        responses = [line.strip() for line in survey_file.readlines()]
        words = get_words(responses)

        cluster = KMeansClusterer(num_clusters, euclidean_distance,
                                  repeats=100, avoid_empty_clusters=True)
        cluster.cluster([vectorspaced(response) for response in responses if response])
        classified_examples = [
            cluster.classify(vectorspaced(response)) for response in responses
        ]
        for cluster_id, title in sorted(zip(classified_examples, responses)):
            print cluster_id, title
def kmeans_cluster(datamatrix, numofclusters=3):
    clusterer = KMeansClusterer(numofclusters, euclidean_distance)
    groups = clusterer.cluster(datamatrix, assign_clusters=True, trace=True)
    means = clusterer.means()
    return groups, means