예제 #1
0
 def cluster(self):
     """Fit the DBSCAN clusterer on ``self.data`` and log a short summary.

     Calls ``gpu_mem(0)`` before fitting, then logs the number of clusters
     found and the number of core samples reported by the clusterer.
     Returns nothing; results stay on ``self.clusterer``.
     """
     self.logger.info("Clustering ... (DBSCAN)")
     gpu_mem(0)
     self.clusterer.fit(self.data)

     # DBSCAN labels are zero-based (noise points are labelled -1), so the
     # highest label is one less than the number of clusters.
     n_clusters = self.clusterer.labels_.max() + 1
     n_core_samples = len(self.clusterer.core_sample_indices_)
     self.logger.info("Clusters: %d, Core samples: %d",
                      n_clusters,
                      n_core_samples)
예제 #2
0
 def fit(self, data):
     """Fit the wrapped UMAP model on ``data`` and log the CPU time taken.

     Args:
         data: Training samples; must expose ``shape`` (rows are samples),
             as the sibling ``fit_transform``/``reduce`` methods also assume.

     Returns:
         ``self``, so calls can be chained.
     """
     started = process_time()
     self.umap.fit(data)
     elapsed = process_time() - started
     # Take the sample count from the input rather than from an attribute
     # of the fitted estimator, matching fit_transform() and reduce().
     self.logger.debug("Fit: %d samples, %d sec", data.shape[0], elapsed)
     gpu_mem(0)
     return self
예제 #3
0
 def fit_transform(self, data):
     """Fit the wrapped UMAP model on ``data`` and return the embedding.

     The process time spent in ``self.umap.fit_transform`` is logged at
     DEBUG level together with the number of input rows, and
     ``gpu_mem(0)`` is called before returning.

     Args:
         data: Input samples; must expose ``shape`` (rows are samples).

     Returns:
         Whatever ``self.umap.fit_transform(data)`` returns.
     """
     started = process_time()
     embedding = self.umap.fit_transform(data)
     elapsed = process_time() - started

     n_samples = data.shape[0]
     self.logger.debug("UMAP Fit: %d samples, %d sec", n_samples, elapsed)
     gpu_mem(0)
     return embedding
예제 #4
0
def load_dataset_to_gpu(filename, sampling=True, train_size=1000000, test_size=None):
    """Read a dataset from ``filename`` and move its training split to the GPU.

    Args:
        filename: Path handed to ``read``.
        sampling: Forwarded to ``read`` as its ``sampling`` argument.
        train_size: Forwarded to ``read`` as its ``train_size`` argument.
        test_size: Forwarded to ``read`` as its ``test_size`` argument.

    Returns:
        The result of ``to_gpu`` applied to the training split. The test
        split returned by ``read`` is discarded.
    """
    # Only announce down-sampling when it was actually requested.
    if sampling:
        logging.info("Down-sampling input data to %d", train_size)

    # Forward the caller's options instead of hard-coded values; the
    # defaults reproduce the previous fixed behaviour.
    train_data, _ = read(filename,
                         sampling=sampling,
                         train_size=train_size,
                         test_size=test_size)
    gdf = to_gpu(train_data)
    gpu_mem(0)
    return gdf
예제 #5
0
 def reduce(self, data, as_df=True):
     """Project ``data`` with the already-fitted UMAP model.

     The process time spent in ``self.umap.transform`` is logged at DEBUG
     level together with the number of input rows, and ``gpu_mem(0)`` is
     called before returning.

     Args:
         data: Samples to transform; must expose ``shape``.
         as_df: When true, return the transform result unchanged;
             otherwise return ``result.as_matrix()``.

     Returns:
         The transformed data, optionally converted via ``as_matrix()``.
     """
     started = process_time()
     reduced = self.umap.transform(data)
     elapsed = process_time() - started
     self.logger.debug("Reduce: %d samples, %d sec", data.shape[0], elapsed)
     gpu_mem(0)

     # NOTE(review): as_matrix() has been dropped from recent dataframe
     # libraries -- confirm it exists on the type transform() returns here.
     return reduced if as_df else reduced.as_matrix()
예제 #6
0
 def __init__(self,
              ndims=5,
              nn=25,
              eps=0.1,
              minSamples=25,
              coreSamples=True,
              verbose=False):
     """Set up logging, the dimensionality reducer and the DBSCAN clusterer.

     Args:
         ndims: Passed to ``DimReducer`` as ``n_components``.
         nn: Passed to ``DimReducer`` as ``n_neighbors``.
         eps: Passed to ``DBSCAN`` as ``eps``.
         minSamples: Passed to ``DBSCAN`` as ``min_samples``.
         coreSamples: Passed to ``DBSCAN`` as ``calc_core_sample_indices``.
         verbose: Passed to ``DBSCAN`` as ``verbose``.
     """
     self.logger = logging.getLogger("T5Clustering")
     # Note: this configures the root logger as a side effect of construction.
     log_format = '%(asctime)s %(levelname)s: %(message)s'
     logging.basicConfig(format=log_format, level=logging.DEBUG)

     self.reducer = DimReducer(n_components=ndims, n_neighbors=nn)
     gpu_mem(0)

     # Data slots start empty; nothing in this constructor fills them.
     self.train_data = self.test_data = self.data = None

     dbscan_options = {
         'eps': eps,
         'min_samples': minSamples,
         'verbose': verbose,
         'calc_core_sample_indices': coreSamples,
         'output_type': 'cudf',
     }
     self.clusterer = DBSCAN(**dbscan_options)
예제 #7
0
def cluster(gdf, eps, minSamples):
    """Run GPU DBSCAN on ``gdf`` and record summary metrics.

    Logs the maximum label and the number of core samples to MLflow
    (metrics ``labels`` and ``core_samples``), calls ``gpu_mem(0)``, and
    writes a DEBUG summary line. Returns nothing.

    Args:
        gdf: Input samples handed to ``DBSCAN.fit``; must expose a
            two-dimensional ``shape`` (rows are samples, columns are
            dimensions).
        eps: Passed to ``DBSCAN`` as ``eps``.
        minSamples: Passed to ``DBSCAN`` as ``min_samples``.
    """
    logging.info("cuml.DBSCAN Clustering - eps=%0.3f, samples=%d", eps, minSamples)

    # GPU clustering (a CPU hdbscan.HDBSCAN variant was used here previously).
    result = DBSCAN(eps=eps,
                    min_samples=minSamples,
                    verbose=False,
                    calc_core_sample_indices=True,
                    output_type='cudf').fit(gdf)

    metric1 = result.labels_.max()
    metric2 = len(result.core_sample_indices_)

    mlflow.log_metric('labels', metric1)
    mlflow.log_metric('core_samples', metric2)

    gpu_mem(0)

    # Read the dimensionality from the input itself instead of relying on a
    # module-level ``ndims`` that may not exist where this function is used.
    ndims = gdf.shape[1]
    logging.debug("cuml.DBSCAN - dims: %d, max distance: %0.3f, min samples: %d, labels: %d, core_samples: %d",
                  ndims, eps, minSamples, metric1, metric2)
예제 #8
0
# Both import methods supported
from cuml.cluster import DBSCAN

from src.DimReducer import DimReducer
from src.utils.file_utils import read
from src.utils.gpu_utils import gpu_mem

import logging

logger = logging.getLogger(__name__)

# basicConfig is a function of the logging module, not a Logger method;
# calling it on the logger instance raises AttributeError.
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                    level=logging.DEBUG)
gpu_mem(0)

# Load the embeddings; only the training split is used below.
train_data, test_data = read(
    "/mnt/m2-1/cw09b.dh.1k.spam.70.dochits.tier.5.t5-base.embeddings.0.npz",
    sampling=True,
    train_size=1000000,
    test_size=None)

# Reduce dimensions: ndims is the target dimensionality, nn the number of
# neighbours handed to the reducer.
ndims = 5
nn = 25
# Report the target dimensionality (ndims), not the neighbour count.
logger.info("Dimensionality reduction: 768 to %d... (UMAP)", ndims)
reducer = DimReducer(n_components=ndims, n_neighbors=nn)
data = reducer.fit_transform(train_data)

# Clustering parameters
eps = 0.05
minSamples = 25
logger.info("Clustering ... (DBSCAN)")
예제 #9
0
def dim_reduce(gdf, ndims=5, n_neighbors=15):
    """Reduce ``gdf`` with a freshly constructed ``DimReducer``.

    Args:
        gdf: Input samples handed to ``DimReducer.fit_transform``.
        ndims: Passed to ``DimReducer`` as ``n_components``.
        n_neighbors: Passed to ``DimReducer`` as ``n_neighbors``.

    Returns:
        The result of ``fit_transform``; ``gpu_mem(0)`` is called before
        returning.
    """
    embedding = DimReducer(n_components=ndims,
                           n_neighbors=n_neighbors).fit_transform(gdf)
    gpu_mem(0)
    return embedding