def cluster(self):
    """Run DBSCAN over the prepared data and log the resulting statistics."""
    log = self.logger
    log.info("Clustering ... (DBSCAN)")
    gpu_mem(0)
    self.clusterer.fit(self.data)
    # Summarize the fit: highest label id and number of core samples found.
    n_labels = self.clusterer.labels_.max()
    n_core = len(self.clusterer.core_sample_indices_)
    log.info("Clusters: %d, Core samples: %d", n_labels, n_core)
def fit(self, data):
    """Fit the wrapped UMAP model on *data* and return self (fluent style).

    Args:
        data: training samples; assumed to expose ``.shape[0]`` for a row
            count (e.g. a cudf/numpy array-like) — TODO confirm with callers.

    Returns:
        self, so calls can be chained.
    """
    t1 = process_time()
    self.umap.fit(data)
    t2 = process_time()
    # Log the row count of the input itself instead of reaching into the
    # fitted model's internal ``X_m`` attribute: consistent with
    # fit_transform()/reduce(), and robust against cuml internals changing.
    self.logger.debug("Fit: %d samples, %d sec", data.shape[0], t2 - t1)
    gpu_mem(0)
    return self
def fit_transform(self, data):
    """Fit UMAP on *data* and return the reduced (embedded) representation."""
    started = process_time()
    embedded = self.umap.fit_transform(data)
    elapsed = process_time() - started
    self.logger.debug("UMAP Fit: %d samples, %d sec", data.shape[0], elapsed)
    gpu_mem(0)
    return embedded
def load_dataset_to_gpu(filename, sampling=True, train_size=1000000, test_size=None):
    """Read an embeddings dataset and move the training split onto the GPU.

    Args:
        filename: path of the dataset to read.
        sampling: whether to down-sample the input (previously this flag was
            accepted but silently ignored — ``read`` was always called with
            ``sampling=True``).
        train_size: number of training samples to keep when sampling.
        test_size: optional test split size, forwarded to ``read`` (was also
            previously ignored and hard-coded to None).

    Returns:
        A GPU dataframe holding the training data.
    """
    if sampling:
        logging.info("Down-sampling input data to %d", train_size)
    # Forward the caller's arguments instead of hard-coding them.
    train_data, _test_data = read(filename, sampling=sampling,
                                  train_size=train_size, test_size=test_size)
    gdf = to_gpu(train_data)
    gpu_mem(0)
    return gdf
def reduce(self, data, as_df=True):
    """Project *data* into the reduced space learned by UMAP.

    Returns the transformed result as-is (a dataframe) when ``as_df`` is
    True, otherwise converted via ``as_matrix()``.
    """
    start = process_time()
    transformed = self.umap.transform(data)
    self.logger.debug("Reduce: %d samples, %d sec",
                      data.shape[0], process_time() - start)
    gpu_mem(0)
    return transformed if as_df else transformed.as_matrix()
def __init__(self, ndims=5, nn=25, eps=0.1, minSamples=25, coreSamples=True, verbose=False):
    """Configure the reduction + clustering pipeline.

    Args:
        ndims: target dimensionality for the UMAP reducer.
        nn: number of UMAP neighbors.
        eps: DBSCAN neighborhood radius.
        minSamples: DBSCAN minimum samples per core point.
        coreSamples: whether DBSCAN should compute core-sample indices.
        verbose: verbosity flag forwarded to DBSCAN.
    """
    self.logger = logging.getLogger("T5Clustering")
    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                        level=logging.DEBUG)
    # UMAP-based dimensionality-reduction stage.
    self.reducer = DimReducer(n_components=ndims, n_neighbors=nn)
    gpu_mem(0)
    # Data slots; filled in later by the loading/splitting steps.
    self.train_data = None
    self.test_data = None
    self.data = None
    # GPU DBSCAN; core-sample indices are retained so cluster() can report them.
    self.clusterer = DBSCAN(eps=eps,
                            min_samples=minSamples,
                            verbose=verbose,
                            calc_core_sample_indices=coreSamples,
                            output_type='cudf')
def cluster(gdf, eps, minSamples, ndims=5):
    """Cluster *gdf* on the GPU with cuml DBSCAN; log and record metrics.

    Args:
        gdf: input data (GPU dataframe) to cluster.
        eps: DBSCAN neighborhood radius.
        minSamples: minimum samples for a core point.
        ndims: dimensionality of the input embedding, used only in the debug
            log line. Previously this name was read as an undefined
            free variable (NameError unless a module global existed);
            it is now an explicit, backward-compatible parameter.

    Returns:
        The fitted DBSCAN model (``labels_`` / ``core_sample_indices_``
        populated). Previously the result was discarded.
    """
    # CPU clustering alternative, kept for reference:
    # clusterer = hdbscan.HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom')
    # clusterer.fit(data)
    logging.info("cuml.SCANDB Clustering - eps=%0.3f, samples=%d", eps, minSamples)
    # GPU clustering
    result = DBSCAN(eps=eps, min_samples=minSamples, verbose=False,
                    calc_core_sample_indices=True, output_type='cudf').fit(gdf)
    metric1 = result.labels_.max()
    metric2 = len(result.core_sample_indices_)
    mlflow.log_metric('labels', metric1)
    mlflow.log_metric('core_samples', metric2)
    gpu_mem(0)
    logging.debug("cuml.DBSCAN - dims: %d, max distance: %0.3f, min samples: %d, labels: %d, core_samples: %d",
                  ndims, eps, minSamples, metric1, metric2)
    return result
# Both import methods supported
from cuml.cluster import DBSCAN
from src.DimReducer import DimReducer
from src.utils.file_utils import read
from src.utils.gpu_utils import gpu_mem
import logging

logger = logging.getLogger(__name__)
# BUG FIX: basicConfig is a module-level function; Logger instances have no
# such attribute, so the original `logger.basicConfig(...)` raised
# AttributeError at import time.
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                    level=logging.DEBUG)

gpu_mem(0)
train_data, test_data = read(
    "/mnt/m2-1/cw09b.dh.1k.spam.70.dochits.tier.5.t5-base.embeddings.0.npz",
    sampling=True, train_size=1000000, test_size=None)

# reduce dimensions
ndims = 5
nn = 25
# BUG FIX: the message reports the target dimensionality; the original passed
# the neighbor count `nn` (25) instead of `ndims` (5).
logger.info("Dimensionality reduction: 768 to %d... (UMAP)", ndims)
reducer = DimReducer(n_components=ndims, n_neighbors=nn)
data = reducer.fit_transform(train_data)

# cluster
eps = 0.05
minSamples = 25
logger.info("Clustering ... (DBSCAN)")
def dim_reduce(gdf, ndims=5, n_neighbors=15):
    """Reduce *gdf* to *ndims* dimensions with UMAP and return the embedding."""
    umap_reducer = DimReducer(n_components=ndims, n_neighbors=n_neighbors)
    reduced = umap_reducer.fit_transform(gdf)
    gpu_mem(0)
    return reduced