def cluster(self,vectors):
    """Cluster ``vectors``, label ``self.dataset`` and persist the results.

    The algorithm is chosen by ``self.clustering_params['method']``:
    ``"KMeans_NLTK"`` uses ``KMeansClusterer`` with cosine distance,
    ``"KMeans"`` uses ``KMeans`` with ``n_clusters`` taken from
    ``self.clustering_params['n_clusters']``.

    Side effects:
        * writes the per-row cluster id to ``self.dataset['cluster']``;
        * sets ``self.clustered_filename``;
        * writes ``<disaster_dir>/kmeans/<method>_<n_clusters>_<timestamp>``
          with the suffixes ``.csv`` (the dataset), ``.pkl`` (the pickled
          model) and ``.vec`` (the pickled ``self.vectors``).

    Args:
        vectors: the feature vectors to cluster; one cluster id per vector
            is stored in ``self.dataset['cluster']``.

    Returns:
        A ``(self.dataset, model_path)`` tuple, where ``model_path`` is the
        path of the ``.pkl`` file.

    Raises:
        ValueError: if ``self.clustering_params['method']`` is not one of
            the supported methods.
    """
    params = self.clustering_params
    method = params['method']

    # Fail before touching the dataset or the disk. Previously an unknown
    # method only surfaced as a NameError on `kmeans` after the CSV had
    # already been written.
    if method not in ("KMeans_NLTK", "KMeans"):
        raise ValueError(
            f"Unsupported clustering method {method!r}; "
            "expected 'KMeans_NLTK' or 'KMeans'"
        )

    if method == "KMeans_NLTK":
        # NOTE(review): num_means is fixed at 20 here although the output
        # file names embed clustering_params['n_clusters'] -- confirm that
        # this mismatch is intended.
        kmeans = KMeansClusterer(
            num_means=20,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25,
            avoid_empty_clusters=True,
        )
        self.dataset['cluster'] = kmeans.cluster(vectors, assign_clusters=True)
    else:
        # NOTE(review): eval() executes whatever string is stored in the
        # config. Kept for compatibility; int() or ast.literal_eval() would
        # be safer if the value is always a plain number -- confirm the
        # source of clustering_params.
        kmeans = KMeans(n_clusters=eval(params['n_clusters']))
        kmeans.fit(vectors)
        # Label the same vectors the model was fit on (as the NLTK branch
        # does) instead of self.vectors, which may not be the same object.
        self.dataset['cluster'] = kmeans.predict(vectors)
        print(self.dataset[['cluster', 'id', 'text']])

    # Apply the space replacement to the whole name. In the original the
    # .replace() call bound only to the trailing "_<n_clusters>" fragment.
    self.clustered_filename = (
        f'{self.disaster_name}_{method}_{params["n_clusters"]}'
    ).replace(" ", "_")

    # Build the timestamped base path once so all three files share it.
    current_time = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    filename = f"{self.disaster_dir}/kmeans/{method}_{params['n_clusters']}_{current_time}"

    self.dataset.to_csv(filename + '.csv', index=False)

    # The with-statement closes each file; no explicit close() is needed.
    with open(filename + '.pkl', 'wb') as file:
        pickle.dump(kmeans, file)
    # NOTE(review): the .vec file still stores self.vectors (unchanged
    # behaviour) -- confirm it is the same data as the `vectors` argument.
    with open(filename + '.vec', 'wb') as file:
        pickle.dump(self.vectors, file)

    return self.dataset, filename + '.pkl'
# Example #2
# 0
                                 quoting=csv.QUOTE_ALL)

# NLTK group-average agglomerative clustering (GAAClusterer).
model = GAAClusterer(num_clusters=cluster_number)

# With assign_clusters=True, cluster() returns one cluster identifier per
# input vector. Use that result directly instead of discarding it and
# classifying every vector a second time.
clusters = model.cluster(vectors, assign_clusters=True)

# Assign the labels positionally. Wrapping them in a DataFrame would make
# pandas align on index labels, which yields NaN or shifted labels if `data`
# does not carry a default 0..n-1 index.
data['cluster'] = list(clusters)
data[['text', 'cluster']].to_csv('../data/text_clustered_nltk_gaac.csv',
                                 index=True,
                                 quoting=csv.QUOTE_ALL)

# scikit-learn k-means.
# n_jobs is no longer passed: KMeans deprecated the parameter in
# scikit-learn 0.23 and later removed it, so supplying it raises TypeError on
# current releases.
model = KMeans(n_clusters=cluster_number, max_iter=epochs)
model.fit(vectors)
# Persist the fitted model (presumably joblib.dump -- confirm the import).
dump(model, '../data/advanced_sklearn_kmeans.joblib')

# labels_ holds one label per fitted row. Assign it positionally; wrapping it
# in a DataFrame would make pandas align on index labels instead.
data['cluster'] = model.labels_
data[['text', 'cluster']].to_csv('../data/text_clustered_sklearn_kmeans.csv',
                                 index=True,
                                 quoting=csv.QUOTE_ALL)

# scikit-learn agglomerative clustering.
model = AgglomerativeClustering(n_clusters=cluster_number)
clusters = model.fit_predict(vectors)

# fit_predict returns one label per input row. Assign it positionally;
# wrapping it in a DataFrame would make pandas align on index labels, which
# breaks if `data` does not carry a default 0..n-1 index.
data['cluster'] = clusters
data[['text',
      'cluster']].to_csv('../data/text_clustered_sklearn_agglomerative.csv',
                         index=True,
                         quoting=csv.QUOTE_ALL)