Exemplo n.º 1
0
def fast_knn(
    X,
    *,
    n_clusters: int = 5,
    n_neighbors: Optional[int] = None,
    graph_mode='distance',
    cluster_mode='spectral',
    algorithm='brute',
    n_jobs: Optional[int] = None,
    random_state: int = 1,
    framework: Literal['auto', 'cuml', 'sklearn'] = 'auto',
) -> NearestNeighbors:
    """Fit a nearest-neighbors estimator augmented with clustering helpers.

    Parameters
    ----------
    X : `ndarray` or tuple of (X, y)
        Training data to fit on.
    n_clusters : int, default=5
        Number of clusters stored on the estimator for the attached
        cluster-prediction helpers.
    n_neighbors : int, optional
        The top K closest datapoints you want the algorithm to return.
        Defaults to ``n_clusters`` when not given.
        Currently, this value must be < 1024.
    graph_mode : {'distance', 'connectivity'}, default='distance'
        This mode decides which values `kneighbors_graph` will return:
        - 'connectivity' : will return the connectivity matrix with ones and
          zeros (for 'SpectralClustering').
        - 'distance' : will return the distances between neighbors according
          to the given metric (for 'DBSCAN').
    cluster_mode : {'dbscan', 'spectral', 'isomap', 'kmeans'}, default='spectral'
        This mode decides how to generate cluster prediction from the
        neighbors graph.
    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:
        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.
        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.
    n_jobs : int, optional
        Number of parallel jobs for the sklearn backend (ignored by cuML).
    random_state : int, default=1
        Seed stored on the estimator for downstream cluster prediction.
    framework : {'auto', 'cuml', 'sklearn'}, default='auto'
        Which backend implementation to use (resolved by `_check_cuml`).

    Returns
    -------
    NearestNeighbors
        A fitted estimator with `kneighbors_graph`, `transform`,
        `fit_transform` and `predict` rebound as instance methods.

    Raises
    ------
    ValueError
        If `graph_mode` is not 'distance' or 'connectivity'.
    """
    n_clusters = int(n_clusters)
    if n_neighbors is None:
        n_neighbors = n_clusters
    ## normalize and validate string options up front; raise instead of
    ## `assert`, which is silently stripped under `python -O`
    graph_mode = str(graph_mode).strip().lower()
    if graph_mode not in ('distance', 'connectivity'):
        raise ValueError(
            f"graph_mode must be 'distance' or 'connectivity', got {graph_mode!r}")
    cluster_mode = str(cluster_mode).strip().lower()
    ## build the estimator kwargs explicitly instead of capturing
    ## `dict(locals())`, which silently breaks when locals are added
    kwargs = {'n_neighbors': n_neighbors,
              'algorithm': algorithm,
              'n_jobs': n_jobs}
    use_cuml = _check_cuml(framework)
    if use_cuml:
        from cuml.neighbors import NearestNeighbors as KNN
        # cuML's estimator accepts neither of these sklearn-only options.
        kwargs.pop('n_jobs')
        kwargs.pop('algorithm')
    else:
        KNN = NearestNeighbors
    ## fitting
    knn = KNN(**kwargs)
    knn.fit(X)
    knn._fitid = id(X)  # remember which object was fitted (identity check)
    ## stash configuration read by the bound helper methods below
    knn._random_state = random_state
    knn._n_clusters = n_clusters
    knn._graph_mode = graph_mode
    knn._cluster_mode = cluster_mode
    if use_cuml:
        # sklearn sets this attribute during fit(); cuML does not, but the
        # graph helpers rely on it.
        knn.n_samples_fit_ = X.shape[0]
    ## rebind the helper implementations as methods of this instance
    knn.kneighbors_graph = types.MethodType(nn_kneighbors_graph, knn)
    knn.transform = types.MethodType(nn_transform, knn)
    knn.fit_transform = types.MethodType(nn_fit_transform, knn)
    knn.predict = types.MethodType(nn_predict, knn)
    return knn
Exemplo n.º 2
0
# In[14]:

# Store the CNN out-of-fold predictions as a column for scoring below.
train['oof_cnn'] = preds

if COMPUTE_CV:
    # Row-wise F1 of the 'oof_cnn' predictions; getMetric presumably
    # returns a per-row scoring callable for DataFrame.apply — TODO
    # confirm against its definition.
    train['f1'] = train.apply(getMetric('oof_cnn'), axis=1)
    print('CV score for baseline =', train.f1.mean())

# # title TFIDF

# In[15]:

# from sklearn.feature_extraction.text import TfidfVectorizer
# Binary TF-IDF over the title text, vocabulary capped at 25k terms.
model = TfidfVectorizer(stop_words=None, binary=True, max_features=25000)
# NOTE(review): .toarray() densifies the sparse matrix — memory is
# O(n_rows * 25000) floats; fine for this dataset size, but verify.
text_embeddings = model.fit_transform(train_gf.title).toarray()
print('text embeddings shape', text_embeddings.shape)

# In[16]:

# Accumulator for the per-chunk similarity results.
preds = []
CHUNK = 1024 * 4

print('Finding similar titles...')
# Ceiling division: number of CHUNK-sized batches covering every row.
CTS = -(-len(train) // CHUNK)
for j in range(CTS):

    a = j * CHUNK
    b = (j + 1) * CHUNK
    b = min(b, len(train))