Example #1
import time

import numpy as np
from sklearn.neighbors import LSHForest, NearestNeighbors


def preprocess(prod, C, p, algo, nEst=10, nCand=40, feasibles=None):
    t0 = time.time()

    if algo == 'special_case_LSH':
        print("\tLSH DB Special init...")
        db = LSHForest(n_estimators=nEst, n_candidates=nCand, n_neighbors=C)
    elif algo == 'general_case_LSH':
        print("\tLSH DB General init...")
        db = LSHForest(n_estimators=nEst, n_candidates=nCand, n_neighbors=1)
    elif algo == 'special_case_exact':
        print("\tExact DB Special init...")
        db = NearestNeighbors(n_neighbors=C,
                              metric='cosine',
                              algorithm='brute')
    else:
        print("\tExact DB General init...")
        db = NearestNeighbors(n_neighbors=1,
                              metric='cosine',
                              algorithm='brute')

    if algo == 'special_case_LSH' or algo == 'special_case_exact':
        U = np.eye(prod)
        normConst = np.sqrt(2 + np.max(p)**2)
        ptsTemp = np.concatenate(
            (U * np.array(p[1:]), U), axis=1) * 1.0 / normConst
        # print ptsTemp,ptsTemp.shape,1.0/normConst
        feasibles = [0] * ptsTemp.shape[0]  # dummy, so len(feasibles) matches the point count used below
    else:
        normConst = C * np.sqrt(1 + np.max(p)**2)
        ptsTemp = np.zeros((len(feasibles), 2 * prod))
        for idx, feasible in enumerate(feasibles):
            ptsTemp[idx] = np.concatenate(
                (np.array(p[1:]) * feasible, feasible)) * 1.0 / normConst

    # MIPS-to-NN transformation: append sqrt(1 - ||x||^2) so all points are unit-norm
    lastCol = np.linalg.norm(ptsTemp, axis=1)**2
    lastCol = np.sqrt(1 - lastCol)
    pts = np.concatenate((ptsTemp, lastCol.reshape((len(feasibles), 1))),
                         axis=1)

    # for e,fe in enumerate(feasibles):
    #   print e,np.linalg.norm(p[1:]*feasibles[e]/normConst),np.linalg.norm(pts[e])

    db.fit(pts)

    build_time = time.time() - t0
    print "\t\tIndex build time: ", build_time

    return db, build_time, normConst  #,pts
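
The last-column augmentation above is the standard MIPS-to-NN reduction: once every point is scaled to lie inside the unit ball, appending sqrt(1 - ||x||^2) makes each point unit-norm, so a cosine nearest-neighbor query returns the maximum-inner-product winner. A minimal sketch on synthetic points (not the function's own data) verifying the unit norms:

import numpy as np

rng = np.random.RandomState(0)
pts_temp = rng.rand(4, 6) * 0.3                 # scaled so every norm < 1
last_col = np.sqrt(1 - np.linalg.norm(pts_temp, axis=1) ** 2)
pts = np.hstack([pts_temp, last_col[:, None]])  # augmented points
print(np.linalg.norm(pts, axis=1))              # all 1.0 up to rounding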
Example #2
def Classify(nlp, keywords,
             categories):  # keywords - list; categories - dict: {name: vector}
    counterDict = Counter(keywords)  # count duplicates so each word vector is looked up once
    sumVector = numpy.zeros(nlp.vocab.vectors_length)

    # Sanity check: compare the whole-text vector with the keyword sum
    text = ' '.join(keywords)

    for word, repCount in counterDict.items():  # sum the word vectors
        curVect = nlp(word).vector
        sumVector += (curVect * repCount)

    vec = nlp(text).vector
    sim = cosine_similarity(vec, sumVector)
    print("Sim: " + str(sim))

    catArray = numpy.array(list(categories.values()))
    catKeys = list(categories.keys())
    #tree = KDTree(catArray, metric='pyfunc', func=cosine_similarity)
    #dist, ind = tree.query(sumVector, k=TOP_N_COUNT) #.reshape(-1, 1)

    print("Creating LSHForest...")

    lshf = LSHForest(n_candidates=70, n_estimators=30, n_neighbors=TOP_N_COUNT)
    lshf.fit(catArray)
    print("LSHForest was created")

    print("Getting neighbors...")
    distances, indices = lshf.kneighbors(sumVector.reshape((1, -1)))
    print("Got neighbors.")

    for dist, curIndex in zip(distances[0], indices[0]):
        print("Found category: " + str(catKeys[curIndex]))
        print("with distance: " + str(dist))
Example #3
def score(factors):
    verifyCount = 3
    X, y = Sets.trainingSet
    test_set, databases = Sets.testSet
    X = FactorizeVectors(X, factors)
    test_set = FactorizeVectors(test_set, factors)
    correctionAverage = 0
    for i in range(verifyCount):
        best_predictions = 0
        clf = LSHForest(n_estimators=10, n_candidates=10)
        clf.fit(X)

        correct = 0
        total = 0

        for j in range(len(test_set)):
            total += 1
            actual = databases[j]
            distances, indices = clf.kneighbors(test_set[j], n_neighbors=5)
            predicted = GetPrediction(y, distances[0], indices[0])
            if (actual == predicted):
                correct += 1

        if (correct > best_predictions):
            best_predictions = correct
        correctionAverage += best_predictions
    correctionAverage = float(correctionAverage)/verifyCount
    return correctionAverage
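
`GetPrediction` is not shown here; a purely hypothetical reconstruction of a distance-weighted vote it might perform over the returned neighbors:

from collections import Counter

def get_prediction(y, distances, indices):
    # Hypothetical sketch: weight each neighbor's label by inverse distance
    # and return the label with the largest total weight.
    votes = Counter()
    for d, i in zip(distances, indices):
        votes[y[i]] += 1.0 / (d + 1e-9)
    return votes.most_common(1)[0][0]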
Example #4
 def __init__(self, lsh_init=None):
     if lsh_init is None:
         self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
     else:
         self._lsh_forest = lsh_init
     self.iw = None
     self.m = None
Example #5
    def fit_model(self, data, n_estimators, n_neighbours):

        LSHf = LSHForest(random_state=42,
                         n_estimators=n_estimators,
                         n_neighbors=n_neighbours)
        LSHf.fit(data)
        return LSHf
Example #6
def persist_attraction_similarities_to_db():
    # build LSHForest model for reduced dimension dataset
    svd = TruncatedSVD(n_components=10, n_iter=7)
    red_dim_itemuserdf = svd.fit_transform(itemuserdf)
    item_user_model = LSHForest()
    item_user_model.fit(red_dim_itemuserdf)

    # persist attractions similarities to db
    K=20        # query for K neighbors
    k=10        # return k neighbors
    for i in range(itemuserdf.shape[0]):
        distance, indices = item_user_model.kneighbors(
            red_dim_itemuserdf[i].reshape(1, -1), n_neighbors=K
        )
        weights = 1 - distance
        for j in range(k):
            if i != indices[0][j]:
                e = SimilarAttractions(
                    attraction_id=Attraction.objects.filter(
                        app_id=int(i)).values('attraction_id')[0]['attraction_id'],
                    similar_attraction_id=Attraction.objects.filter(
                        app_id=int(indices[0][j])).values('attraction_id')[0]['attraction_id'],
                    similarity=weights[0][j],
                    ts=timezone.now()
                )
                e.save()
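
The `weights = 1 - distance` step relies on LSHForest reporting cosine distances, so subtracting from one recovers cosine similarity. A minimal self-contained sketch (random stand-in data; requires scikit-learn < 0.21, where LSHForest still exists):

import numpy as np
from sklearn.neighbors import LSHForest

rng = np.random.RandomState(0)
M = rng.rand(50, 10)                        # stand-in item-factor matrix
model = LSHForest(random_state=0).fit(M)
dist, idx = model.kneighbors(M[:1], n_neighbors=5)
print(1 - dist)                             # cosine similarities of neighbors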
Example #7
def train():
    # Build the matching corpus: 398,872 samples
    sku_names_texts = get_train_datas()
    sku_names_jieba = get_text_jieba(sku_names_texts)
    sku_names_with_spaces = []
    for sku_names in sku_names_jieba:
        sku_names_with_spaces.append(' '.join(sku_names))

    # Test data: 1,000 samples
    keywords_texts = get_test_datas()
    keywords_jieba = get_text_jieba(keywords_texts)
    keywords_with_spaces = []
    for keywords in keywords_jieba:
        keywords_with_spaces.append(' '.join(keywords))

    tfidf_vec = TfidfVectorizer(min_df=3, max_features=None, ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1)
    x_train = tfidf_vec.fit_transform(sku_names_with_spaces)

    lshf = LSHForest(random_state=42)
    #lshf.fit(np.array(x_train))
    lshf.fit(x_train)

    for i, kw in enumerate(keywords_with_spaces):
        x_test = tfidf_vec.transform([kw])
        distances, indices = lshf.kneighbors(x_test.toarray(), n_neighbors=1)
        idx = indices[0][0]
        print(i, "||", keywords_texts[i], "||", sku_names_texts[idx])

        with open("result/lsh_v1_results.txt", 'a', encoding='utf8') as wf:
            wf.write(str(i) + "||" + keywords_texts[i] + "||" + sku_names_texts[idx] + "\n")
Example #8
 def fit_lsh(self):
     self.lsh = LSHForest(random_state=12345)
     train_data = [
         self.encode_sentence(self.indexed_background[i], True)
         for i in range(len(self.indexed_background))
     ]
     self.lsh.fit(train_data)
Example #9
    def get_nearest_neighbor_iterable(self,
                                      graphlist,
                                      start_graphs,
                                      start_is_subset=True):

        # vectorize all
        graphlist = list(graphlist)
        graphlist_ = copy.deepcopy(graphlist)
        X = self.vectorizer.transform_single(graphlist_)

        start_graphs = list(start_graphs)
        graphlist_ = copy.deepcopy(start_graphs)
        Y = self.vectorizer.transform_single(graphlist_)

        forest = LSHForest()
        forest.fit(X)
        #http://scikit-learn.org/stable/modules/neighbors.html
        distances, indices = forest.kneighbors(Y, n_neighbors=2)

        # we just assume that this is short...
        index = 0
        if start_is_subset:
            index += 1

        # matches = (X_index, Y_index, distance)
        matches = [(indices[i, index], i, distances[i, index])
                   for i in range(len(indices))]
        matches.sort()

        # Earlier version, kept for reference:
        # for index, graph in enumerate(selection_iterator(graphlist, [a[0] for a in matches])):
        #     yield ((graph, start_graphs[matches[index][1]], X[matches[index][0]]))
        # Rewritten below for clarity; the matches list could even be dropped
        # in favor of using indices directly.
        for Xi, Yi, dist in matches:
            yield ((start_graphs[Yi], graphlist[Xi], X[Xi]))
Example #10
def predict(login, file):
    login_features = mfcc(login, file)
    lshf = LSHForest(random_state=42)
    gmm = joblib.load(path + '/speaker_models/' + login + '.pkl')
    ubm = joblib.load(path + '/speaker_models/' + 'ubm.pkl')
    model = joblib.load(path + '/speaker_models/' + login + 'Model.pkl')
    gmm_likelihood_score = gmm.score(login_features)
    ubm_likelihood_score = ubm.score(login_features)
    likelihood_score = gmm_likelihood_score - ubm_likelihood_score
    login_features = [j for i in login_features for j in i]
    if len(model) > len(login_features):
        array = model[:len(login_features)]
        lshf.fit([array])
        distances, indices = lshf.kneighbors([login_features], n_neighbors=2)
        dist = pairwise_distances_argmin_min([array], [login_features])
    else:
        array = login_features[:len(model)]
        lshf.fit([array])
        distances, indices = lshf.kneighbors([model], n_neighbors=2)
        dist = pairwise_distances_argmin_min([array], [model])
    result = {}
    result['score'] = [likelihood_score, distances]
    result['distance'] = dist
    if likelihood_score > 0:
        result['Message'] = 'Authenticated'
    else:
        result['Message'] = 'Not Authenticated'   
    return result
Example #11
    def optimise(self, num_train_points, num_val_points, parameters):

        max_accuracy = -1
        optimal_estimators = -1
        optimal_n_neighbours = -1

        for item in self.get_generator(parameters):

            LSHf = LSHForest(random_state=42,
                             n_estimators=item['n_est'],
                             n_neighbors=item['n_neigh'])
            LSHf.fit(self.train.images[:num_train_points])
            distances, indices = LSHf.kneighbors(
                self.validation.images[:num_val_points], n_neighbors=5)

            accuracy, positions = self.model_accuracy(indices,
                                                      is_optimising=True)

            if accuracy > max_accuracy:
                max_accuracy = accuracy
                optimal_estimators = item['n_est']
                optimal_n_neighbours = item['n_neigh']

#         print(optimal_n_neighbours_predict)
        return max_accuracy, optimal_estimators, optimal_n_neighbours
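
`self.get_generator` is not shown; a hypothetical reconstruction that yields one `{'n_est', 'n_neigh'}` dict per combination of candidate values:

from itertools import product

def get_generator(parameters):
    # Hypothetical sketch: parameters = {'n_est': [10, 20], 'n_neigh': [5, 10]}
    for n_est, n_neigh in product(parameters['n_est'], parameters['n_neigh']):
        yield {'n_est': n_est, 'n_neigh': n_neigh}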
Example #12
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize the tweets and fit the forest:
        vect2 = CountVectorizer(stop_words=self.common_twitter_handles)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Query the forest for near-duplicates:
        n_neighbors = []
        neighbors_indices = []
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0:
                print "%r tweets analyzed out of %r for this batch" % (
                    len(n_neighbors), self.batch_size)
            neighbors = tree2.radius_neighbors(x, radius=.4)[1]
            n_neighbors.append(len(neighbors[0]))
            neighbors_indices.append(neighbors)

        neighbors_indices = [
            x for x in range(len(neighbors_indices))
            if len(neighbors_indices[x][0]) > 2
        ]

        return neighbors_indices
Example #13
    def __init__(self,
                 use_lsh_forest=False,
                 n_neighbors=20,
                 max_iterations=300,
                 count_concepts=False,
                 number_of_concepts=0,
                 count_terms=False,
                 training_validation_split=0.8,
                 algorithm_id='7',
                 l2r_metric="ERR@k",
                 n_jobs=1,
                 translation_probability=False,
                 **kwargs):

        self.n_neighbors = n_neighbors
        if use_lsh_forest:
            nn = LSHForest(n_neighbors=n_neighbors, **kwargs)
        else:
            nn = NearestNeighbors(n_neighbors=n_neighbors, **kwargs)
        self.knn = BatchKNeighbors(nn)
        self.y = None
        self.max_iterations = max_iterations
        self.count_concepts = count_concepts
        self.count_terms = count_terms
        self.number_of_concepts = number_of_concepts
        self.training_validation_split = training_validation_split
        self.algorithm_id = algorithm_id
        self.l2r_metric = l2r_metric
        self.n_jobs = n_jobs
        self.translation_probability = translation_probability
Example #14
def test_fit():
    """Checks whether `fit` method sets all attribute values correctly."""
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators)
    lshf.fit(X)

    # _input_array = X
    assert_array_equal(X, lshf._fit_X)
    # A hash function g(p) for each tree
    assert_equal(n_estimators, len(lshf.hash_functions_))
    # Hash length = 32
    assert_equal(32, lshf.hash_functions_[0].components_.shape[0])
    # Number of trees_ in the forest
    assert_equal(n_estimators, len(lshf.trees_))
    # Each tree has entries for every data point
    assert_equal(n_samples, len(lshf.trees_[0]))
    # Original indices after sorting the hashes
    assert_equal(n_estimators, len(lshf.original_indices_))
    # Each set of original indices in a tree has entries for every data point
    assert_equal(n_samples, len(lshf.original_indices_[0]))
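
The `components_` checked above are one tree's random projection directions; conceptually, a point's 32-bit hash key is the sign pattern of its projections. A hedged sketch continuing the test's `lshf` and `X`:

components = lshf.hash_functions_[0].components_  # shape (32, n_features)
bits = (X[0] @ components.T) > 0                  # boolean 32-vector: X[0]'s key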
Example #15
def test_partial_fit():
    """Checks whether inserting array is consitent with fitted data.

    `partial_fit` method should set all attribute values correctly.
    """
    n_samples = 12
    n_samples_partial_fit = 3
    n_features = 2
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    X_partial_fit = rng.rand(n_samples_partial_fit, n_features)

    lshf = LSHForest()

    # Test unfitted estimator
    lshf.partial_fit(X)
    assert_array_equal(X, lshf._fit_X)

    lshf.fit(X)

    # Insert wrong dimension
    assert_raises(ValueError, lshf.partial_fit,
                  np.random.randn(n_samples_partial_fit, n_features - 1))

    lshf.partial_fit(X_partial_fit)

    # _fit_X should grow by the number of inserted samples
    assert_equal(lshf._fit_X.shape[0], n_samples + n_samples_partial_fit)
    # original_indices_ entries should grow by the number of inserted samples
    assert_equal(len(lshf.original_indices_[0]),
                 n_samples + n_samples_partial_fit)
    # each tree should grow by the number of inserted samples
    assert_equal(len(lshf.trees_[1]), n_samples + n_samples_partial_fit)
Example #16
def test_neighbors_accuracy_with_n_estimators():
    """Checks whether accuracy increases as `n_estimators` increases."""
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = LSHForest(n_candidates=500, n_estimators=t)
        lshf.fit(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)]
            neighbors = lshf.kneighbors(query,
                                        n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Accuracy should be non-decreasing as n_estimators increases
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
Example #17
def knn_indices_func_approx(
        rep_pts: FloatTensor,  # (N, pts, dim)
        pts: FloatTensor,  # (N, x, dim)
        K: int,
        D: int) -> LongTensor:  # (N, pts, K)
    """
    Approximate CPU-based Indexing function based on K-Nearest Neighbors search.
    :param rep_pts: Representative points.
    :param pts: Point cloud to get indices from.
    :param K: Number of nearest neighbors to collect.
    :param D: "Spread" of neighboring points.
    :return: Array of indices, P_idx, into pts such that pts[n][P_idx[n], :]
    is the set of k-nearest neighbors for the representative points in pts[n].
    """
    if rep_pts.is_cuda:
        rep_pts = rep_pts.cpu()
    if pts.is_cuda:
        pts = pts.cpu()
    rep_pts = rep_pts.data.numpy()
    pts = pts.data.numpy()

    region_idx = []

    for n, p in enumerate(rep_pts):
        P_particular = pts[n]
        lshf = LSHForest(n_estimators=20,
                         n_candidates=100,
                         n_neighbors=D * K + 1)
        lshf.fit(P_particular)
        indices = lshf.kneighbors(p, return_distance=False)
        region_idx.append(indices[:, 1::D])

    # Stack per-cloud indices and return them as a LongTensor, matching the
    # (N, pts, K) shape promised in the signature.
    region_idx = torch.from_numpy(np.stack(region_idx, axis=0))
    return region_idx
Example #18
import sys
import time

from sklearn.neighbors import LSHForest


def runForestLSHSizeAnalysis(argsdict, data, inlbl, fPath, fName, fileN, i):
    start = time.time()
    tree = LSHForest(random_state=42)
    tree.fit(data)
    end = time.time()

    return sys.getsizeof(tree), (end - start)
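
Note that `sys.getsizeof(tree)` measures only the shallow estimator object, not the fitted arrays it references; a pickle round-trip gives a closer (still approximate) footprint. A sketch:

import pickle

deep_size = len(pickle.dumps(tree))  # bytes for the whole fitted index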
Example #19
    def __init__(self, params: Dict[str, Any]):
        # Location of corpus to use for background knowledge search. This corpus is assumed to be
        # gzipped, one sentence per line.
        self.corpus_path = params.pop('corpus_path', None)

        # Number of background sentences to collect for each input.
        self.num_background = params.pop('num_background', 10)
        # Wait this many epochs before running differentiable search. This lets you train with the
        # base memory network code using external background knowledge for a time, then, once the
        # encoder is trained sufficiently, you can turn on the differentiable search.
        self.num_epochs_delay = params.pop('num_epochs_delay', 10)

        # Number of epochs we wait in between re-encoding the corpus.
        # TODO(matt): consider only re-encoding at early stopping, instead of a
        # number-of-epoch-based parameter.
        self.num_epochs_per_encoding = params.pop('num_epochs_per_encoding', 2)

        # Only meaningful if you are loading a model.  When loading, should we load a pickled LSH,
        # or should we re-initialize the LSH from the input corpus?  Note that if you give a corpus
        # path, and you load a saved LSH that was constructed from a _different_ corpus, you could
        # end up with really weird behavior.
        self.load_saved_lsh = params.pop('load_saved_lsh', False)

        # Now that we've popped our parameters, we can call the superclass constructor.
        super(DifferentiableSearchMemoryNetwork, self).__init__(params)

        # And then set some member variables.
        self._sentence_encoder_model = self.__build_sentence_encoder_model()
        self.lsh = LSHForest(random_state=12345)
        self.instance_index = {}  # type: Dict[int, str]
Example #20
def CreateAndconfigureLSHForest(categories):  # categories - dict: {name: vector}
    print("Creating LSHForest...")
    catArray = numpy.array(list(categories.values()))
    lshf = LSHForest(n_candidates=70, n_estimators=30, n_neighbors=TOP_N_COUNT)
    lshf.fit(catArray)
    print("LSHForest was created")
    return lshf
Example #21
import time

import numpy as np
from sklearn.neighbors import LSHForest


def lof(X, k, outlier_threshold=1.5, verbose=False):
    """Local Outlier Factor using approximate kNN (LSHForest)."""
    start = time.time()
    lshf = LSHForest(random_state=42)
    lshf.fit(X)

    distance, index = lshf.kneighbors(X, n_neighbors=k)
    distance, index = distance[:, 1:], index[:, 1:]
    radius = distance[:, -1]

    """Calculate LRD."""
    LRD = np.mean(np.maximum(distance, radius[index]), axis=1)
    r = 1. / np.array(LRD)

    """Calculate outlier score."""
    outlier_score = np.sum(r[index], axis=1) / np.array(r, dtype=np.float16)
    outlier_score *= 1. / k

    # print ('Compute time: %g seconds.' % ((time.time() - start)))

    if verbose:
        print("Recording all outliers with outlier score greater than %s."
              % outlier_threshold)

    outliers = []
    """ Could parallelize this for loop, but really not worth the overhead...
        Would get insignificant performance gain."""
    for i, score in enumerate(outlier_score):
        if score > outlier_threshold:
            outliers.append([i, X[i], score])

    if verbose:
        print("Detected outliers:")
        print(outliers)

    return outliers
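
This hand-rolls a simplified LOF on top of approximate neighbors. Since scikit-learn 0.19 an exact implementation ships with the library; a minimal sketch of the equivalent call:

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

X = np.random.rand(100, 8)
lof = LocalOutlierFactor(n_neighbors=20, metric='cosine')
labels = lof.fit_predict(X)              # -1 flags outliers
scores = -lof.negative_outlier_factor_   # larger means more anomalous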
Example #22
def test_hash_functions():
    """Checks randomness of hash functions.

    Variance and mean of each hash function (projection vector)
    should be different from flattened array of hash functions.
    If hash functions are not randomly built (seeded with
    same value), variances and means of all functions are equal.
    """
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators,
                     random_state=rng.randint(0,
                                              np.iinfo(np.int32).max))
    lshf.fit(X)

    hash_functions = []
    for i in range(n_estimators):
        hash_functions.append(lshf.hash_functions_[i].components_)

    for i in range(n_estimators):
        assert_not_equal(np.var(hash_functions),
                         np.var(lshf.hash_functions_[i].components_))

    for i in range(n_estimators):
        assert_not_equal(np.mean(hash_functions),
                         np.mean(lshf.hash_functions_[i].components_))
Example #23
def test_distances():
    """Checks whether returned neighbors are from closest to farthest."""
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)
        # Returned neighbors should be from closest to farthest.
        assert_true(np.all(np.diff(distances[0]) >= 0))

        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_true(np.all(np.diff(distances[0]) >= 0))
Example #24
def hash_movie_similarity(um, num_neighbors=6):
    lsh = LSHForest(random_state=470957)
    lsh.fit(um.T)

    # Query num_neighbors + 1 and drop the first column so items don't match themselves
    dist, ind = lsh.kneighbors(um.T, n_neighbors=num_neighbors+1, return_distance=True)
    sim = 1 - dist
    return sim[:,1:], ind[:,1:]
Example #25
 def _fit(self, xs):
     """ Fit index
     :param xs: list of sample vectors
     :return:
     """
     self.index = LSHForest(
         n_estimators=self.parameters.get('n_estimators', 20))
     self.index.fit(xs)
Example #26
def test_radius_neighbors_boundary_handling():
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Build an exact nearest neighbors model as reference model to ensure
    # consistency between exact and approximate methods
    nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # Build a LSHForest model with hyperparameter values that always guarantee
    # exact results on this toy dataset.
    lsfh = LSHForest(min_hash_match=0, n_candidates=n_points,
                     random_state=42).fit(X)

    # define a query aligned with the first axis
    query = [[1., 0.]]

    # Compute the exact cosine distances of the query to the four points of
    # the dataset
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # The first point is almost aligned with the query (very small angle),
    # the cosine distance should therefore be almost null:
    assert_almost_equal(dists[0], 0, decimal=5)

    # The second point forms an angle of 45 degrees with the query vector
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))

    # The third point is orthogonal to the query vector, hence at a distance
    # of exactly one:
    assert_almost_equal(dists[2], 1)

    # The last point is almost collinear but with opposite sign to the query
    # therefore it has a cosine 'distance' very close to the maximum possible
    # value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    # If we query with a radius of one, all the samples except the last sample
    # should be included in the results. This means that the third sample
    # is lying on the boundary of the radius query:
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1, 2])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1, 2])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-1])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-1])

    # If we perform the same query with a slightly lower radius, the third
    # point of the dataset that lay on the boundary of the previous query
    # is now rejected:
    eps = np.finfo(np.float64).eps
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1 - eps)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1 - eps)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-2])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-2])
Example #27
def test_radius_neighbors():
    """Checks whether Returned distances are less than `radius`

    At least one point should be returned when the `radius` is set
    to mean distance from the considering point to other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with the `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query,
                                          radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries, return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)

    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Distances of exact neighbors are less than or equal to the approximate ones
    assert_true(
        np.all(
            np.less_equal(np.sort(distances_exact[0]),
                          np.sort(distances_approx[0]))))
Example #28
def test_candidates():
    """Checks whether candidates are sufficient.

    This should handle the cases when number of candidates is 0.
    User should be warned when number of candidates is less than
    requested number of neighbors.
    """
    X_train = np.array(
        [[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1], [6, 10, 2]],
        dtype=np.float32)
    X_test = np.array([7, 10, 3], dtype=np.float32)

    # For zero candidates
    lshf = LSHForest(min_hash_match=32)
    lshf.fit(X_train)

    message = ("Number of candidates is not sufficient to retrieve"
               " %i neighbors with"
               " min_hash_match = %i. Candidates are filled up"
               " uniformly from unselected"
               " indices." % (3, 32))
    assert_warns_message(UserWarning,
                         message,
                         lshf.kneighbors,
                         X_test,
                         n_neighbors=3)
    distances, neighbors = lshf.kneighbors(X_test, n_neighbors=3)
    assert_equal(distances.shape[1], 3)

    # For candidates less than n_neighbors
    lshf = LSHForest(min_hash_match=31)
    lshf.fit(X_train)

    message = ("Number of candidates is not sufficient to retrieve"
               " %i neighbors with"
               " min_hash_match = %i. Candidates are filled up"
               " uniformly from unselected"
               " indices." % (5, 31))
    assert_warns_message(UserWarning,
                         message,
                         lshf.kneighbors,
                         X_test,
                         n_neighbors=5)
    distances, neighbors = lshf.kneighbors(X_test, n_neighbors=5)
    assert_equal(distances.shape[1], 5)
Example #29
 def BuildModel(self, data, labels):
   # Create and train the classifier.
   lshf = LSHForest(n_estimators=self.n_estimators,
                    min_hash_match=self.min_hash_match,
                    n_candidates=self.n_candidates,
                    radius_cutoff_ratio=self.radius_cutoff_ratio,
                    radius=self.radius,
                    n_neighbors=self.n_neighbors)
   lshf.fit(data)
   return lshf
Example #30
 def test_real_model(self):
     """
     Test that model name works for sklearn estimators
     """
     model1 = LassoCV()
     model2 = LSHForest()
     model3 = KMeans()
     self.assertEqual(get_model_name(model1), 'LassoCV')
     self.assertEqual(get_model_name(model2), 'LSHForest')
     self.assertEqual(get_model_name(model3), 'KMeans')
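
All of the examples above target `sklearn.neighbors.LSHForest`, which was deprecated in scikit-learn 0.19 and removed in 0.21. On current versions, exact brute-force cosine search is the simplest drop-in for small datasets; a minimal sketch:

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.rand(1000, 64)
nn = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute').fit(X)
distances, indices = nn.kneighbors(X[:1])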