from sklearn.cluster import Birch
import numpy as np


class BirchAlgo(object):
    labels = None
    clusters = []
    centers = None
    labels_temporary = None

    def __init__(self, threshold=0.2):
        self.birch = Birch(threshold=threshold, n_clusters=None,
                           compute_labels=True)
        self.n_cluster = None

    def aplica_birch(self, dados):
        self.birch.partial_fit(dados)
        if self.labels is None:
            self.labels = self.birch.labels_
        else:
            self.labels = np.append(self.labels, self.birch.labels_)
        self.labels_temporary = self.birch.labels_
        self.centers = self.birch.subcluster_centers_
        return self.labels_temporary

    def atualiza_kmeans(self, dados):
        self.aplica_birch(dados)
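# A minimal usage sketch for the wrapper above (hypothetical two-batch
# stream of random 2-D points, not from the original source). It shows that
# aplica_birch returns per-batch labels while self.labels accumulates them.
import numpy as np

rng = np.random.RandomState(0)
algo = BirchAlgo(threshold=0.2)
for batch in (rng.rand(20, 2), rng.rand(20, 2)):
    batch_labels = algo.aplica_birch(batch)  # labels for this batch only
print(algo.labels.shape)   # (40,): labels accumulated across both batches
print(algo.centers.shape)  # (n_subclusters, 2)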
from sklearn.cluster import Birch
from sklearn.decomposition import PCA
import numpy as np


class ClusteringObjectClassifierModel(object):
    def __init__(self):
        self.learned_classes = dict()
        self.max_classes = 10
        self.estimator = Birch(n_clusters=None, threshold=10.0)

    def online_fit(self, X, class_name):
        self.estimator.partial_fit(X)
        # np.asscalar was removed in NumPy 1.23; .item() is the replacement.
        cluster_id = self.estimator.labels_.item()
        if cluster_id not in self.learned_classes:
            print("Assigning cluster id %d to class %s" % (cluster_id, class_name))
            self.learned_classes[cluster_id] = class_name
        return self.__pca_on_cluster_centers(self.estimator.subcluster_centers_)

    def __pca_on_cluster_centers(self, cluster_centers):
        # PCA with two components needs at least two centers, so guard
        # before fitting rather than after.
        if len(cluster_centers) < 2:
            return np.zeros(1), np.zeros(1)
        pca = PCA(n_components=2)
        coords = np.atleast_2d(pca.fit_transform(cluster_centers))
        return coords[:, 0], coords[:, 1]

    def predict_class(self, X):
        if not hasattr(self.estimator, "root_"):
            return False, False
        cluster_id = self.estimator.predict(X).item()
        if cluster_id not in self.learned_classes:
            return False, False
        return self.learned_classes[cluster_id], cluster_id
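# A hedged usage sketch for the model above (invented 8-D feature vectors,
# one sample per call, matching the scalar-label assumption). The first call
# learns a cluster for "mug"; a distant sample then opens a new cluster.
import numpy as np

model = ClusteringObjectClassifierModel()
model.online_fit(np.zeros((1, 8)), "mug")
model.online_fit(np.full((1, 8), 50.0), "bottle")
name, cid = model.predict_class(np.full((1, 8), 0.1))
print(name, cid)  # expected: mug 0, given threshold=10.0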
def test_partial_fit_second_call_error_checks():
    # Second partial_fit calls will error when n_features is not consistent
    # with the first call.
    X, y = make_blobs(n_samples=100)
    brc = Birch(n_clusters=3)
    brc.partial_fit(X, y)

    msg = "X has 1 features, but Birch is expecting 2 features"
    with pytest.raises(ValueError, match=msg):
        brc.partial_fit(X[:, [0]], y)
def test_partial_fit():
    # Test that fit is equivalent to calling partial_fit multiple times.
    X, y = make_blobs(n_samples=100)
    brc = Birch(n_clusters=3)
    brc.fit(X)
    brc_partial = Birch(n_clusters=None)
    brc_partial.partial_fit(X[:50])
    brc_partial.partial_fit(X[50:])
    assert_array_almost_equal(brc_partial.subcluster_centers_,
                              brc.subcluster_centers_)

    # Test that the same global labels are obtained after calling
    # partial_fit with X=None.
    brc_partial.set_params(n_clusters=3)
    brc_partial.partial_fit(None)
    assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_)
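# A standalone sketch of the partial_fit(None) idiom the test exercises:
# with X=None, Birch inserts no new data and runs only the global clustering
# step over the subclusters already in the CF-tree.
import numpy as np
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
brc = Birch(n_clusters=None)
brc.partial_fit(X[:50])
brc.partial_fit(X[50:])
brc.set_params(n_clusters=3)
brc.partial_fit(None)  # no new data; subclusters are relabelled globally
print(np.unique(brc.subcluster_labels_))  # at most 3 distinct labels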
def birch_partial(matrix, n_cluster):
    brc = Birch(branching_factor=100, n_clusters=n_cluster,
                threshold=1.0, compute_labels=True)
    model = brc.partial_fit(matrix)
    res = model.predict(matrix)
    return res
def specBIRCH(self, n_clusters, spectralptsfile):
    """Use BIRCH clustering on spectral data only."""
    self.classifier = "Spectral-BIRCH"
    self.inptsfile = spectralptsfile
    points = self.loadPoints()
    points = points[self.validhit_bool, :]

    print("Running BIRCH clustering on spectral data only ...")
    points = StandardScaler(copy=False).fit_transform(points)
    brc = Birch(n_clusters=n_clusters)
    # Feed the points to the BIRCH gradually.
    npts = len(points)
    niter = int(npts / self.birch.pf_npts) + 1
    for i in range(niter - 1):
        brc.partial_fit(points[i * self.birch.pf_npts:(i + 1) * self.birch.pf_npts, :])
    brc.partial_fit(points[(niter - 1) * self.birch.pf_npts:, :])
    self.labels[self.validhit_bool] = brc.predict(points)
def birchCluster(zD, maxd, out='dict', N=None, start=0, stop=None):
    # The radius of the subcluster obtained by merging a new sample and the
    # closest subcluster should be smaller than the threshold; otherwise a
    # new subcluster is started. Setting this value very low promotes
    # splitting, and vice versa.
    data = zD.dictPos
    stop = len(zD.pList) if not stop else stop
    X = [[data['x'][i], data['y'][i], data['z'][i]] for i in range(start, stop)]
    brc = Birch(branching_factor=50, n_clusters=None, threshold=maxd,
                compute_labels=True)
    brc.fit(X)
    if N:
        brc.set_params(n_clusters=N)
        # np.matrix is deprecated; a plain array is equivalent here.
        brc.partial_fit(np.asarray(X))
    groups = brc.predict(X)
    if out == 'dict':
        return list2dict(zD, groups)
    elif out == 'list':
        return groups
    else:
        raise Exception("Out argument must have values 'dict' or 'list'")
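# A small illustration of the threshold behaviour described in the comment
# above (synthetic blobs; exact counts vary with the data): lower thresholds
# promote splitting and yield more subclusters.
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=0)
for t in (0.1, 1.0, 5.0):
    n_sub = len(Birch(threshold=t, n_clusters=None).fit(X_demo).subcluster_centers_)
    print(t, n_sub)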
class Birch_algo_wrapper:
    def __init__(self):
        self.wrapped = Birch(n_clusters=None, threshold=0.5,
                             branching_factor=50)

    def fit(self, data):
        return self.wrapped.fit(data)

    def fit_predict(self, data):
        self.wrapped = self.wrapped.partial_fit(data)
        return self.wrapped.predict(data)

    def predict(self, data):
        return self.wrapped.predict(data)
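# A hedged example of driving the wrapper incrementally (synthetic stream,
# not from the original source): each fit_predict call updates the CF-tree
# via partial_fit and then labels only the current batch.
import numpy as np

wrapper = Birch_algo_wrapper()
stream = np.random.RandomState(0).rand(100, 2)
for start in range(0, 100, 25):
    batch_labels = wrapper.fit_predict(stream[start:start + 25])
print(batch_labels)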
def main():
    vec = HashingVectorizer(tokenizer=preprocess, ngram_range=(3, 3),
                            analyzer='word')
    clu = Birch(n_clusters=3)
    # clu = MiniBatchKMeans(n_clusters=2)

    config = configparser.ConfigParser()
    config.read('cfg.ini')
    config = config['DEFAULT']
    api = twitter.Api(consumer_key=config['consumer_key'],
                      consumer_secret=config['consumer_secret'],
                      access_token_key=config['access_token_key'],
                      access_token_secret=config['access_token_secret'])

    queue = deque(maxlen=50)
    for n, line in enumerate(api.GetStreamFilter(
            track=['pokemon', 'dark souls', 'darksouls', 'sonic', 'hedgehog'],
            languages=['en'])):
        if n > 1000000:
            break
        elif len(queue) != 50:
            try:
                queue.append(line['text'])
                logging.warning("%s", line['text'])
            except KeyError:
                pass
        else:
            try:
                v = vec.transform(queue)
                clu = clu.partial_fit(v)
                logging.warning('TESTING\n.\n.\n.\n.')
                logging.warning("%s, %s, %s", n, clu.predict(v[-1]), queue[-1])
            except KeyError:
                pass
            queue.clear()

    # Pickle needs a binary file handle.
    pickle.dump(clu, open('cluster_model.pkl', 'wb'))
affinity_propagation_valid_performance_metrics_for_plotting[item + 1] = \
    affinity_propagation_valid_performance_metric_array[item]
affinity_propagation_test_performance_metrics_for_plotting[item + 1] = \
    affinity_propagation_test_performance_metric_array[item]

Figures.save_valid_test_performance_measures_vs_hyper_parameters_figure(
    affinity_propagation_parameter_search_space_for_plotting,
    affinity_propagation_valid_performance_metrics_for_plotting,
    affinity_propagation_test_performance_metrics_for_plotting,
    'Adjusted Mutual Information Score',
    'AffinityPropagation Clustering damping parameter',
    'Affinity_Propagation_Performance', 0, 0.5, left_horizontal_limit=0.5)

# Do BIRCH, optimizing the number of calls to partial_fit over a validation set
current_optimal_birch_number_of_calls = 1
initial_optimal_birch_clusterer = Birch()
initial_optimal_birch_clusterer.partial_fit(train_data_set)
initial_optimal_birch_clusterer.set_params(n_clusters=number_of_classes)
initial_birch_valid_predictions = initial_optimal_birch_clusterer.predict(valid_data_set)
initial_birch_test_predictions = initial_optimal_birch_clusterer.predict(test_data_set)

# Add one to the predictions to make them match the range of the labels,
# then apply the Hungarian fix.
for element in range(number_of_valid_observations):
    initial_birch_valid_predictions[element] += 1
for element in range(number_of_test_observations):
    initial_birch_test_predictions[element] += 1
initial_birch_valid_predictions = Clustering.Hungarian_Fix(
    initial_birch_valid_predictions, valid_labels).astype('int')
initial_birch_test_predictions = Clustering.Hungarian_Fix(
    initial_birch_test_predictions, test_labels).astype('int')

# Set a starting point for optimality of the initial performance metric,
# to be possibly adjusted later.
def spaBIRCH(self, n_clusters, spectralptsfile, mscfile, use_scales=None):
    """Use BIRCH clustering on spatial data only."""
    self.classifier = "Spatial-BIRCH"
    self.inptsfile = spectralptsfile
    self.mscfile = mscfile
    self.loadPoints()

    print("Running BIRCH clustering on spatial data only ...")
    mscfobj = dpu.openMSC(mscfile)
    mscheader = mscfobj.header
    nscales = len(mscheader[1])
    if use_scales is None:
        use_scales = np.arange(nscales)
    else:
        if np.any(use_scales >= nscales):
            raise RuntimeError(
                "Indices to scales out of bound, {0:d} scales in input MSC\n".format(nscales))
        if np.any(use_scales < 0):
            raise RuntimeError("Indices to scales out of bound, negative indices found")

    # Process the points in batches gradually.
    npts = mscheader[0]
    niter = int(npts / self.birch.pf_npts) + 1
    rusage_denom = 1024.
    pca_flag = False
    if pca_flag:
        # Transform the data with PCA.
        print("\tPCA of MSC spatial data ...")
        ipca = IncrementalPCA(n_components=len(use_scales))
        for i in range(niter):
            mscdata = mscfobj.read(npts=self.birch.pf_npts, use_scales=use_scales)
            mscbool = self.validhit_bool[mscdata[:, -1].astype(int) - 1]
            if np.sum(mscbool) == 0:
                if self.verbose:  # debug
                    print("\t\tno valid points, {0:d} / {1:d}".format(i, niter))
                continue
            ipca.partial_fit(mscdata[mscbool, 0:-1])
            sys.stdout.write("{0:d} / {1:d} \n".format(i, niter))
        print(np.cumsum(ipca.explained_variance_ratio_))

    # Train the standard scaler to scale the input data incrementally.
    print()
    print("\tTraining preprocessing scaler for MSC spatial data ...")
    mscfobj.next_pt_idx = 0
    scaler = StandardScaler()
    for i in range(niter):
        mscdata = mscfobj.read(npts=self.birch.pf_npts, use_scales=use_scales)
        mscbool = self.validhit_bool[mscdata[:, -1].astype(int) - 1]
        if np.sum(mscbool) == 0:
            if self.verbose:  # debug
                print("\t\tno valid points, {0:d} / {1:d}".format(i, niter))
            continue
        if pca_flag:
            scaler.partial_fit(ipca.transform(mscdata[mscbool, 0:-1]))
        else:
            scaler.partial_fit(mscdata[mscbool, 0:-1])
        mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / rusage_denom
        sys.stdout.write("{0:d} / {1:d}: {2:.2f}\n".format(i, niter, mem))

    # Train the BIRCH.
    print()
    print("\tTraining the BIRCH cluster ...")
    mscfobj.next_pt_idx = 0
    brc = Birch(n_clusters=n_clusters)
    for i in range(niter):
        mscdata = mscfobj.read(npts=self.birch.pf_npts, use_scales=use_scales)
        mscbool = self.validhit_bool[mscdata[:, -1].astype(int) - 1]
        if np.sum(mscbool) == 0:
            if self.verbose:  # debug
                print("\t\tno valid points, {0:d} / {1:d}".format(i, niter))
            continue
        if pca_flag:
            brc.partial_fit(scaler.transform(ipca.transform(mscdata[mscbool, 0:-1])))
        else:
            brc.partial_fit(scaler.transform(mscdata[mscbool, 0:-1]))
        mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / rusage_denom
        sys.stdout.write("{0:d} / {1:d}: {2:.2f}\n".format(i, niter, mem))

    # Predict the labels of the points after feeding all points to BIRCH.
    print()
    print("\tPredicting BIRCH clustering labels ...")
    # Rewind the MSC file object to read points from the beginning.
    mscfobj.next_pt_idx = 0
    for i in range(niter):
        mscdata = mscfobj.read(npts=self.birch.pf_npts, use_scales=use_scales)
        mscbool = self.validhit_bool[mscdata[:, -1].astype(int) - 1]
        if np.sum(mscbool) == 0:
            if self.verbose:  # debug
                print("\t\tno valid points, {0:d} / {1:d}".format(i, niter))
            continue
        if pca_flag:
            self.labels[mscdata[mscbool, -1].astype(int) - 1] = brc.predict(
                scaler.transform(ipca.transform(mscdata[mscbool, 0:-1])))
        else:
            self.labels[mscdata[mscbool, -1].astype(int) - 1] = brc.predict(
                scaler.transform(mscdata[mscbool, 0:-1]))
        mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / rusage_denom
        sys.stdout.write("{0:d} / {1:d}: {2:.2f}\n".format(i, niter, mem))
    mscfobj.close()
# Perform the online clustering
mbkm = MiniBatchKMeans(n_clusters=nb_clusters, batch_size=batch_size,
                       reassignment_ratio=0.001, random_state=1000)
birch = Birch(n_clusters=nb_clusters, threshold=0.2, branching_factor=350)

scores_mbkm = []
scores_birch = []
for i in range(0, nb_samples, batch_size):
    X_batch, Y_batch = X[i:i + batch_size], Y[i:i + batch_size]
    mbkm.partial_fit(X_batch)
    birch.partial_fit(X_batch)
    scores_mbkm.append(
        adjusted_rand_score(Y[:i + batch_size], mbkm.predict(X[:i + batch_size])))
    scores_birch.append(
        adjusted_rand_score(Y[:i + batch_size], birch.predict(X[:i + batch_size])))

Y_pred_mbkm = mbkm.predict(X)
Y_pred_birch = birch.predict(X)
print('Adjusted Rand score Mini-Batch K-Means: {}'.format(
    adjusted_rand_score(Y, Y_pred_mbkm)))
print('Adjusted Rand score BIRCH: {}'.format(
    adjusted_rand_score(Y, Y_pred_birch)))
class BorderPointStreamClustering:
    def __init__(self, data_frame):
        self.data_frame = data_frame
        self.birch_tree = None
        self.clustering_name = None
        self.clustering_result = None
        self.cluster_count = None

    def cluster_Birch(self, branch=10, n_Clusters=2, threshold=0.1, k=10,
                      visualize=True):
        nn_indices = calculate_k_nearest_neighbours(
            self.data_frame.get_point_only_df(), k)[1]
        self.birch_tree = Birch(branching_factor=branch, n_clusters=n_Clusters,
                                threshold=threshold, compute_labels=True)
        point_finished_name = "anytime calc finished"
        border_point_name = "border point"
        self.clustering_name = "birch streaming clustering " + str(time.process_time())
        self.data_frame.add_result_name(self.clustering_name, -1, ColType.CLUSTER_LABEL)
        self.data_frame.add_result_name(border_point_name, 0, ColType.BORDER_POINT)
        # TODO: add a dedicated ColType for this column.
        self.data_frame.add_result_name(point_finished_name, 0, ColType.UNKNOWN)
        border_degree_res_name = "border degree"
        self.data_frame.add_result_name(border_degree_res_name, -1, ColType.BORDER_DEGREE)

        step_size = 50
        max_border_degree = 70
        stops = [0.1, 0.2, 0.5]
        stopped = False
        while not stopped:
            new_data_points_batch = self.data_frame.get_data_batch(step_size)
            new_border_points_indices = []
            compute_approx_enclosing_degree_avg_for_batch(
                self.data_frame, new_data_points_batch, nn_indices, k,
                border_degree_res_name)
            for index, row in new_data_points_batch.iterrows():
                deg = self.data_frame.df.at[index, border_degree_res_name]
                if deg <= max_border_degree:
                    self.data_frame.add_result(border_point_name, index, 1)
                    new_border_points_indices.append(index)
                self.data_frame.add_result(point_finished_name, index, 1)
            new_border_points = self.data_frame.df.iloc[new_border_points_indices, :]
            new_border_points_po = self.data_frame.get_point_only_df(
                new_border_points).to_numpy()
            print(new_border_points_po)
            self.birch_tree.partial_fit(new_border_points_po)
            print(self.birch_tree.labels_)

        labels = self.birch_tree.labels_
        bp_list = self.data_frame.get_border_points_point_only_df().index.tolist()
        bp_clus = list(zip(bp_list, labels))
        self.data_frame.add_result_name(self.clustering_name, -1, ColType.CLUSTER_LABEL)
        for ind, clus in bp_clus:
            self.data_frame.add_result(self.clustering_name, ind, clus)
        self._assign_inner_points(self.clustering_name)
        self.cluster_count = len(set(self.data_frame.df[self.clustering_name].tolist()))
        self.clustering_result = self.data_frame.df[self.clustering_name].tolist()
        return self.clustering_name

    @staticmethod
    def get_degree_and_euclidean_distance_metric(modifier=1):
        def degree_and_euclidean_distance_metric(point_a, point_b):
            distance_vector_a = point_a[:len(point_a) // 2]
            angular_vector_a = point_a[len(point_a) // 2:]
            distance_vector_b = point_b[:len(point_b) // 2]
            angular_vector_b = point_b[len(point_b) // 2:]
            angular_vector_a_norm = numpy.linalg.norm(angular_vector_a.tolist())
            angular_vector_b_norm = numpy.linalg.norm(angular_vector_b.tolist())
            dp = numpy.dot(angular_vector_a.tolist(), angular_vector_b.tolist())
            upper_term = dp / (angular_vector_a_norm * angular_vector_b_norm)
            if round(upper_term, 2) == 1:
                direction = 0
            else:
                direction = numpy.degrees(numpy.arccos(upper_term))
            if numpy.isnan(direction):
                direction = 0
            distance = numpy.linalg.norm(distance_vector_a - distance_vector_b)
            linear_multiplier_max = modifier
            per_degree_multiplicator = (linear_multiplier_max - 1) / 180
            angular_weighted_distance = distance * (1 + (per_degree_multiplicator * direction))
            return angular_weighted_distance

        return degree_and_euclidean_distance_metric

    def _similarity_measure_data_pre_processing(self):
        pre_processed_degree_euclidean_data = \
            self.data_frame.get_border_points_point_only_df().values
        angular_measure_data = self.data_frame.get_border_points_direction_only_df()
        angular_measure_data_values = angular_measure_data.values
        single_array_values = angular_measure_data_values
        new_list = []
        for j in range(len(pre_processed_degree_euclidean_data)):
            x = numpy.array(single_array_values[j])
            y = pre_processed_degree_euclidean_data[j]
            new_list.append(numpy.append(y, x))
        pre_processed_degree_euclidean_data = new_list
        return pre_processed_degree_euclidean_data

    def _visualize_DBSCAN(self, cluster_name):
        n_clusters_ = len(set(self.df[cluster_name].tolist()))
        if self.dimensions == 2:
            DataVisualization.visualize_plot_2d(
                self.df, hue=cluster_name,
                palette=DataVisualization.create_categorical_palette(n_clusters_))

    def _visualize_DBSCAN_mask(self, labels, distance_measure_data_values,
                               core_sample_indices, n_clusters_):
        core_samples_mask = numpy.zeros_like(labels, dtype=bool)
        core_samples_mask[core_sample_indices] = True
        if self.dimensions == 2:
            # Black removed and is used for noise instead.
            unique_labels = set(labels)
            colors = [plt.cm.Spectral(each)
                      for each in numpy.linspace(0, 1, len(unique_labels))]
            for k, col in zip(unique_labels, colors):
                if k == -1 or k == 0:
                    # Black used for noise.
                    col = [0, 0, 0, 1]
                class_member_mask = (labels == k)
                X = numpy.asarray(distance_measure_data_values)
                xy = X[class_member_mask & core_samples_mask]
                plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                         markeredgecolor='k', markersize=8)
                xy = X[class_member_mask & ~core_samples_mask]
                plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                         markeredgecolor='k', markersize=8)
            plt.title('Estimated number of clusters: %d' % n_clusters_)
            plt.show()

    def _assign_inner_points(self, cluster_name):
        border_points = self.data_frame.get_border_points_point_only_df()
        border_points_values = border_points.values
        border_points_indices = border_points.index.values
        inner_points = self.data_frame.get_inner_points_point_only_df()
        tree = spatial.KDTree(border_points_values)
        for index, row in inner_points.iterrows():
            point_to_search = row.values
            distance, idx = tree.query(point_to_search)
            nearest_border_point = border_points_indices[idx]
            clusternr = self.data_frame.df.iloc[nearest_border_point][cluster_name]
            self.data_frame.add_result(cluster_name, index, clusternr)
def birch_pipeline(args: argparse.Namespace) -> None:
    """
    The main function of this application when birch is requested.

    Parameters
    ----------
    args: `argparse.Namespace`, required
        The argument namespace passed from the terminal.
    """
    method_parameters = parse_parameters_from_special_string(args.method_params)
    input_dict = fetch_input_dict(input_files=args.input_files)
    X = numpy.array(input_dict['vector_representation'])
    method_searchspace = args.method_searchspace
    variable_name, variable_low, variable_high = method_searchspace.split(':')
    variable_low = int(variable_low)
    variable_high = int(variable_high)
    history = []
    output_filepath = os.path.abspath(args.output_bundle)

    if args.batch_size == 0:
        for i in tqdm(range(variable_low, variable_high + 1)):
            method_parameters[variable_name] = i
            if method_parameters['n_clusters'] == 0:
                method_parameters['n_clusters'] = None
            method_model = Birch(**method_parameters).fit(X)
            history.append(
                {'search_on': variable_name,
                 'method_name': 'birch',
                 'parameters': method_parameters.copy(),
                 'input_files': args.input_files,
                 'labels': method_model.labels_.copy(),
                 'loss': compute_point_loss(X=X, labels=method_model.labels_)})
            with open(output_filepath, 'wb') as handle:
                pickle.dump({'history': history}, handle)
            del method_model
    else:
        for i in tqdm(range(variable_low, variable_high + 1)):
            method_parameters[variable_name] = i
            method_model = Birch(**method_parameters)
            random_index_permutation = numpy.random.permutation(X.shape[0])
            labels = numpy.zeros(X.shape[0])
            for epoch in range(args.num_epochs):
                print('>> (status): epoch {}/{}\n'.format(epoch, args.num_epochs))
                cursor = 0
                while (cursor + args.batch_size) <= X.shape[0]:
                    method_model.partial_fit(
                        X[random_index_permutation[cursor:(cursor + args.batch_size)], :])
                    labels[random_index_permutation[cursor:(cursor + args.batch_size)]] = \
                        method_model.labels_
                    cursor += args.batch_size
            # The loss is computed over the accumulated per-sample labels;
            # method_model.labels_ would only cover the last batch.
            history.append(
                {'search_on': variable_name,
                 'method_name': 'birch',
                 'parameters': method_parameters.copy(),
                 'input_files': args.input_files,
                 'labels': labels.copy(),
                 'loss': compute_point_loss(X=X, labels=labels)})
            with open(output_filepath, 'wb') as handle:
                pickle.dump({'history': history}, handle)
            del method_model
    print('\n>> status: all finished.\n')
n_samples_2 = 200
centers_2 = [[10, 10], [0, 1]]
X2, _ = make_blobs(n_samples=n_samples_2, centers=centers_2, cluster_std=0.2)

# Plot the new points
plt.plot([X2[a][0] for a in range(n_samples_2)],
         [X2[b][1] for b in range(n_samples_2)], '.')
plt.axis([-4, 12, -4, 12])
plt.show()

labels = brc.labels_
cluster_centers = brc.subcluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

# Add the new points to the old clustering with "partial_fit"
brc.partial_fit(X2)
labels = np.concatenate([labels, brc.labels_])
cluster_centers = brc.subcluster_centers_  # all cluster centers, old and new
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

# All the points generated (old and new)
X_tot = np.concatenate([X, X2])

# Plot the computed clusters. Some of the new points were added to an old
# cluster (around [0, 1]), and the others created a new cluster.
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
batch_size = 80

if __name__ == '__main__':
    # Create the dataset
    X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=5,
                      cluster_std=1.5, random_state=1000)

    # Create an instance of BIRCH
    birch = Birch(n_clusters=5, threshold=0.15, branching_factor=100)

    # Train the model incrementally
    X_batch = []
    Y_preds = []
    for i in range(0, nb_samples, batch_size):
        birch.partial_fit(X[i:i + batch_size])
        X_batch.append(X[:i + batch_size])
        Y_preds.append(birch.predict(X[:i + batch_size]))

    print(adjusted_rand_score(birch.predict(X), Y))

    # Show the training steps
    fig, ax = plt.subplots(5, 5, figsize=(20, 12))
    for i in range(5):
        for j in range(5):
            idx = (i * 5) + j
            for k in range(5):
                ax[i][j].scatter(X_batch[idx][Y_preds[idx] == k, 0],
                                 X_batch[idx][Y_preds[idx] == k, 1], s=3)
def mClassification(X, y, threshold, K=None):
    brc = Birch(n_clusters=K, threshold=threshold, compute_labels=True)
    return brc.partial_fit(X)
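# Hedged usage note: mClassification returns the fitted estimator rather
# than labels, so the caller reads labels_ from the returned model (the toy
# data below is illustrative only; y is accepted but unused).
import numpy as np

X_demo = np.random.RandomState(0).rand(30, 2)
brc = mClassification(X_demo, None, threshold=0.3)
print(brc.labels_[:5])               # labels for the batch just fitted
print(len(brc.subcluster_centers_))  # number of subclusters found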
if index.vocab_size < args.bsize:
    logging.info("ERROR: Batch size [{}] must not exceed vocabulary size [{}]".format(
        args.bsize, index.vocab_size))
    exit()

sparse_word_centroids = wordCentroids(db=index, vect=vectorizer)
# Maybe the sparse word_centroids matrix could be loaded into RAM to do NMF.
logging.info("Fitting Birch clustering for sparse coding ...")
birch = Birch(threshold=0.5, branching_factor=50, n_clusters=args.dim)
# MiniBatchKMeans(n_clusters=args.dim, init='k-means++', max_iter=4, batch_size=batch_size)
words = []
for i, batch in enumerate(batches(sparse_word_centroids, batch_size)):
    # buffer.append(vstack(batch))
    logging.info("Fitted the %d th batch..." % i)
    words.append(batch[0])
    birch.partial_fit(batch[1])
words = list(chain(*words))

for i, batch in enumerate(batches(sparse_word_centroids, batch_size)):
    if i == 0:
        # word_embeddings = batch[1].dot(csr_matrix(birch.subcluster_centers_).T)
        word_embeddings = birch.transform(batch[1])
    else:
        # word_embeddings = vstack([word_embeddings, batch[1].dot(csr_matrix(birch.subcluster_centers_).T)])
        word_embeddings = np.vstack([word_embeddings, birch.transform(batch[1])])

# word_embeddings.shape = (vocab_size, args.dim)
logging.info("DB Vocabulary size %d ..." % index.vocab_size)
logging.info("Vectorizer vocabulary size %d ..." % len(vectorizer.vocabulary_.keys()))
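# A cautionary sketch (toy data, not from the original source): despite the
# shape comment above, Birch.transform returns one column per *subcluster*
# centre (distances to subcluster_centers_), so the embedding width is the
# number of subclusters found, which need not equal args.dim.
import numpy as np
from sklearn.cluster import Birch

Z = np.random.RandomState(0).rand(40, 5)
b = Birch(threshold=0.5, n_clusters=10).fit(Z)
print(b.transform(Z).shape, len(b.subcluster_centers_))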