Example #1
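These snippets exercise the old `sklearn.neighbors.LSHForest` estimator, which was deprecated in scikit-learn 0.19 and removed in 0.21, so they only run against those older releases. As a rough sketch, the test snippets assume imports along the following lines (exact paths varied across 0.x versions):

import numpy as np
import scipy.sparse as sp
from sklearn.neighbors import LSHForest, NearestNeighbors
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils.testing import (
    assert_equal, assert_true, assert_raises, assert_greater,
    assert_almost_equal, assert_array_equal, assert_array_almost_equal,
    assert_array_less, ignore_warnings)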
def test_radius_neighbors_boundary_handling():
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Build an exact nearest neighbors model as a reference model to ensure
    # consistency between the exact and approximate methods
    nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # Build an LSHForest model with hyperparameter values that always guarantee
    # exact results on this toy dataset.
    lsfh = LSHForest(min_hash_match=0, n_candidates=n_points,
                     random_state=42).fit(X)

    # define a query aligned with the first axis
    query = [[1., 0.]]

    # Compute the exact cosine distances of the query to the four points of
    # the dataset
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # The first point is almost aligned with the query (very small angle),
    # the cosine distance should therefore be almost null:
    assert_almost_equal(dists[0], 0, decimal=5)

    # The second point forms a 45 degree angle with the query vector
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))

    # The third point is orthogonal to the query vector, hence at a distance
    # of exactly one:
    assert_almost_equal(dists[2], 1)

    # The last point is almost collinear but with opposite sign to the query,
    # therefore it has a cosine 'distance' very close to the maximum possible
    # value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    # If we query with a radius of one, all the samples except the last sample
    # should be included in the results. This means that the third sample
    # is lying on the boundary of the radius query:
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1, 2])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1, 2])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-1])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-1])

    # If we perform the same query with a slightly lower radius, the third
    # point of the dataset that lay on the boundary of the previous query
    # is now rejected:
    eps = np.finfo(np.float64).eps
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1 - eps)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1 - eps)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-2])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-2])
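For reference, the cosine geometry the assertions above rely on can be checked by hand. A minimal standalone sketch (not part of the original test):

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

X = np.array([[0.999, 0.001], [0.5, 0.5], [0., 1.], [-1., 0.001]])
query = np.array([[1., 0.]])

# Cosine distance is 1 - cosine similarity; the manual computation matches
# pairwise_distances(..., metric='cosine').
cos_sim = X.dot(query.T).ravel() / (
    np.linalg.norm(X, axis=1) * np.linalg.norm(query))
print(1 - cos_sim)  # approximately [0., 0.293, 1., 2.]
print(pairwise_distances(query, X, metric='cosine').ravel())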
Example #2
def test_radius_neighbors():
    """Checks whether Returned distances are less than `radius`

    At least one point should be returned when the `radius` is set
    to mean distance from the considering point to other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with the `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query,
                                          radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries, return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)

    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Distances to exact neighbors are less than or equal to the approximate ones
    assert_true(
        np.all(
            np.less_equal(np.sort(distances_exact[0]),
                          np.sort(distances_approx[0]))))
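The final assertion rests on the fact that the approximate query can only miss true neighbors, never report closer ones, so after sorting, each approximate distance is bounded below by its exact counterpart. A toy illustration with made-up numbers:

import numpy as np

dists_exact = np.sort([0.05, 0.12, 0.30])  # all true neighbors in the radius
dists_approx = np.sort([0.05, 0.30])       # LSH missed the 0.12 neighbor

n = len(dists_approx)
assert np.all(np.less_equal(dists_exact[:n], dists_approx))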
Example #5
def test_distances():
    """Checks whether returned neighbors are from closest to farthest."""
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)
        # Returned neighbors should be from closest to farthest.
        assert_true(np.all(np.diff(distances[0]) >= 0))

        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_true(np.all(np.diff(distances[0]) >= 0))
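The `np.diff(distances[0]) >= 0` idiom used above is simply an elementwise "sorted in ascending order" check:

import numpy as np

d = np.array([0.0, 0.1, 0.1, 0.4])
assert np.all(np.diff(d) >= 0)             # non-decreasing: passes
assert not np.all(np.diff(d[::-1]) >= 0)   # reversed: fails the check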
Example #9
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words=self.common_twitter_handles)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Build tree:
        n_neighbors = []
        neighbors_indices = []
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0:
                print "%r tweets analyzed out of %r for this batch" % (
                    len(n_neighbors), self.batch_size)
            neighbors = tree2.radius_neighbors(x, radius=.4)[1]
            n_neighbors.append(len(neighbors[0]))
            neighbors_indices.append(neighbors)

        neighbors_indices = [
            x for x in range(len(neighbors_indices))
            if len(neighbors_indices[x][0]) > 2
        ]

        return neighbors_indices
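The method above depends on class state (`self.common_twitter_handles`, `self.batch_size`). The same near-duplicate idea as a self-contained sketch, with placeholder names and a placeholder radius:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import LSHForest


def find_spam_indices(tweets, radius=0.4, min_cluster=3):
    """Return indices of tweets with at least `min_cluster` near-duplicates
    (themselves included) within the given cosine radius."""
    vect = CountVectorizer()
    X = vect.fit_transform(tweets)
    forest = LSHForest().fit(X)

    spam = []
    for i in range(X.shape[0]):
        # Indices of tweets within the cosine radius of tweet i
        neighbors = forest.radius_neighbors(X[i], radius=radius)[1][0]
        if len(neighbors) >= min_cluster:
            spam.append(i)
    return spam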
Example #10
def test_sparse_input():
    # note: Fixed random state in sp.rand is not supported in older scipy.
    #       The test should succeed regardless.
    X1 = sp.rand(50, 100)
    X2 = sp.rand(10, 100)
    forest_sparse = LSHForest(radius=1, random_state=0).fit(X1)
    forest_dense = LSHForest(radius=1, random_state=0).fit(X1.A)

    d_sparse, i_sparse = forest_sparse.kneighbors(X2, return_distance=True)
    d_dense, i_dense = forest_dense.kneighbors(X2.A, return_distance=True)
    assert_array_equal(d_sparse, d_dense)
    assert_array_equal(i_sparse, i_dense)

    d_sparse, i_sparse = forest_sparse.radius_neighbors(X2,
                                                        return_distance=True)
    d_dense, i_dense = forest_dense.radius_neighbors(X2.A,
                                                     return_distance=True)
    assert_equal(d_sparse.shape, d_dense.shape)
    for a, b in zip(d_sparse, d_dense):
        assert_array_equal(a, b)
    for a, b in zip(i_sparse, i_dense):
        assert_array_equal(a, b)
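One detail worth noting: `.A` on a SciPy sparse matrix is shorthand for `.toarray()`, so the sparse and dense forests above are fit on identical data:

import scipy.sparse as sp

X = sp.rand(3, 5, density=0.5)
assert (X.A == X.toarray()).all()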
Example #12
def single_batch(tweet_db):
    """Performs an approximate nearest neighbors search on tweets in the database
    passed to it. The database must be a list of tweets (text of the tweets only).
    Returns the mean number of neighbors (nearly-identical tweets) that a given
    tweet has, the tweets that are considered neighbors (i.e. spam), the number
    of tweets that are spam (number of tweets with at least 1 other neighbor),
    and the amount of time that it took to run the search on the database."""

    # Vectorize and fit tree:
    timer = time.time()
    vect2 = TfidfVectorizer()
    X2 = vect2.fit_transform(tweet_db)
    tree2 = LSHForest()
    tree2.fit(X2)
    print "that took %2f seconds" % (time.time() - timer)

    # Build tree:
    timer = time.time()
    n_neighbors = []
    neighbors_indices = []
    for x in vect2.transform(tweet_db):
        if len(n_neighbors) % 100 == 0:
            print(len(n_neighbors))
        neighbors = tree2.radius_neighbors(x, radius=.3)[1]
        n_neighbors.append(len(neighbors[0]))
        neighbors_indices.append(neighbors)
    tree_build_time = (time.time() - timer)

    # Find indices of tweets that have more than two near-duplicates (note
    # that list.index returns the first occurrence for repeated counts):
    l = [n_neighbors.index(x) for x in n_neighbors if x > 2]

    # Collect the text of the tweets that are part of close clusters:
    len_l = len(set(l))
    actual_neighbors = []
    for x in set(l):
        for neigh in neighbors_indices[x][0]:
            actual_neighbors.append(tweet_db[neigh])

    return np.mean(
        n_neighbors
    ), actual_neighbors, len_l, tree_build_time, neighbors_indices
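Hypothetical usage of `single_batch` (the tweet texts here are placeholders); it returns the five values listed in the docstring:

db = ["buy followers now", "buy followers now!!", "just had lunch",
      "buy followers now", "nice weather today", "buy followers nowww"]
mean_n, spam_tweets, n_clusters, query_secs, all_indices = single_batch(db)
print("mean neighbors per tweet: %.2f" % mean_n)
print("tweets in close clusters: %d (%.1fs to query)"
      % (n_clusters, query_secs))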
Example #14
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words=self.custom_stop_words)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Build tree:
        n_neighbors = []
        neighbors_indices = []
        working_batch_size = len(tweets)
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0:
                print "%r tweets analyzed out of %r for this batch" % (
                    len(n_neighbors), working_batch_size)
            # Only record real neighbor counts for tweets with more than two
            # distinct tokens; shorter tweets get a dummy entry below.
            neighbors = tree2.radius_neighbors(x, radius=self.sensitivity)[1]
            if x.getnnz() > 2:
                n_neighbors.append(len(neighbors[0]))
                neighbors_indices.append(neighbors)
            else:
                n_neighbors.append(1)
                neighbors_indices.append(np.array([np.array([0])]))

        neighbors_indices = [
            x for x in range(len(neighbors_indices))
            if len(neighbors_indices[x][0]) > 2
        ]

        return neighbors_indices
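The `x.getnnz() > 2` guard counts distinct vocabulary terms present in the row vector rather than raw words, which is what the comment above refers to:

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(["apple banana cherry date"])
row = vect.transform(["apple banana banana"])
print(row.getnnz())  # 2 distinct terms, despite 3 words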
Example #15
def test_radius_neighbors():
    # Checks whether returned distances are less than `radius`.
    # At least one point should be returned when `radius` is set to the mean
    # distance from the query point to the other points in the database.
    # Moreover, this test compares the radius neighbors of LSHForest with
    # those of `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query,
                                          radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1, ))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries, return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries, ))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries, ))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # Radius-based queries do not sort the result points and the order
    # depends on the method, the random_state and the dataset order. Therefore
    # we need to sort the results ourselves before performing any comparison.
    sorted_dists_exact = np.sort(distances_exact[0])
    sorted_dists_approx = np.sort(distances_approx[0])

    # Distances to exact neighbors are less than or equal to approximate
    # counterparts as the approximate radius query might have missed some
    # closer neighbors.
    assert_true(np.all(np.less_equal(sorted_dists_exact, sorted_dists_approx)))
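The object-dtype assertions exist because radius queries are ragged: each query may match a different number of points, so the results cannot form a rectangular array. A minimal sketch with the exact estimator:

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(12, 2)
nn = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)
dists, inds = nn.radius_neighbors(X[:3], radius=0.05)

print(dists.dtype, inds.dtype)      # object object
print([d.shape[0] for d in dists])  # per-query neighbor counts vary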
Example #16
class Deduper_NN(object):
    '''
    DESIGN of this class
    I need to re-evaluate whether or not I want the state of the model/vector
    space being saved. In the event that I don't, I should just kill the
    self.model = model.fit() stuff and pass parameters from one function to
    another.

    *Figure out a rigorous statistical way to measure the quality of the NN
    tree. Would it be whatever's skewed to the left? ... how can you
    guarantee that it's clustered well?

    methods
    -------
    train
        - model type
    predict
    preprocess
        - various stuff
    '''

    metrics = [
        'cosine', 
        'euclidean',
        'dice', 
        'jaccard', 
        'braycurtis',
        'canberra', 
    ]
    
    vector_space = None

    def read_in_the_file(self, file_name):
        
        #read in the subject file
        with open(file_name) as f:
            self.orig_file = [line.strip() for line in f]
    
    def build_vectorizer(self, corpus, model_type='bag of words', ngrams=1, tokenizer='char'):
        '''
        *add word2vec
        '''
        
        #think of params
        params = {
            'analyzer': tokenizer,
            'ngram_range' : (1, ngrams)
        }
    
        if model_type == 'bag of words':
            vectorizer = CountVectorizer(**params)
        elif model_type == 'tfidf':
            vectorizer = TfidfVectorizer(**params)
        else:
            raise ValueError('unknown model_type: %r' % model_type)
        
        self.vector_space = vectorizer.fit_transform(corpus) 
        self.vectorizer = vectorizer 
    
    def find_all_duplicates(self):
        
        #find all duplicates
        all_dups_dict = {idx : self.predict(line) for idx, line in enumerate(self.orig_file)}
        return all_dups_dict
    
    def fit_model(self, model_type='brute', params=None):
        '''
        fits model operating under the assumption that there's a vector
        space already built
        '''
        params = params or {}

        if model_type == 'brute':
            self.model = NearestNeighbors(algorithm='brute', **params)
        elif model_type == 'lsh':
            self.model = LSHForest(**params)
        # elif model_type == 'annoy':
        #     self.model = Annoy(**params)

        self.model.fit(self.vector_space)
        print(self.model)

    def predict(self, new_data_pt, radius_threshold=.25):
        '''
        not sure how to find the optimal threshold here
        '''
        #careful to note that it takes a single string and converts to a list object of strings
        pt = self.vectorizer.transform([new_data_pt])
        
        #how to find optimal radius?
        distance_from_origin, indices = self.model.radius_neighbors(pt, radius=radius_threshold)
        
        #unpacking
        distance_from_origin = distance_from_origin[0]
        indices = indices[0]

        grabbing_the_lines_from_file = [self.orig_file[index] for index in indices]

        return grabbing_the_lines_from_file
    
    def grid_search(self):
        '''
        I: target string
        O: prints all combinations of comparisons
        
        * this goes in the master deduper class
        '''
        
        #preprocessing variables
            #spaces or no spaces
            #combinations thereof.
        
        vector_space_params = {
            #fit the vector-space
            #char-grams, words
                #unigrams, bigrams, trigrams

            #or some combination thereof; to do this we need to output and concat
            
            'model_type' : ['bag of words', 'tfidf'], #add lsi and word2vec
            'ngrams' : [1,2,3,4],
            'tokenizer' : ['char', 'word'],
        }
        
        
        
        #model selection
        model_params = {
            #add annoy later
            #build out a wrapper for the class to make it more like scikit-learn

            #add lsh later
            #need to build a separate parameters dict for it.

        
            'model_type' : [ 'brute']
            #fill the rest in later
        }
        
         
        #distances
        metrics = [
            # work for sparse input
            'cosine', 
            'euclidean',
            'l1',
            'l2',
            'manhattan',

            # do not work for sparse input
            # 'dice', 
            # 'jaccard', 
            # 'braycurtis',
            # 'canberra', 
            # 'mahalanobis', # this is supposed to be the shit for outlier detection
        ]
        
        
        all_params = {
            'preprocessing': None,
            'vector_space': vector_space_params,
            'nn_algo': model_params,
        }
        
        

        for nn_algo in all_params['nn_algo']['model_type']:
            for vector_space_model in all_params['vector_space']['model_type']:
                for gram in all_params['vector_space']['ngrams']:
                    for type_of_tokenizer in all_params['vector_space']['tokenizer']:
                        for dist_metric in metrics:
                            
                            nn_model_params = {
                                # 'model_type' : nn_algo,
                                'metric' : dist_metric,
                            }

                            vectorizer_params = {
                                'model_type' : vector_space_model,
                                'tokenizer' : type_of_tokenizer,
                                'ngrams' : gram
                            }

                            self.build_vectorizer(self.orig_file, **vectorizer_params)
                            self.fit_model(nn_algo, nn_model_params)
                            hist_arr = self.make_hist()
                            print_prof_data()
                            clear_prof_data()
                            self.plot_histogram(hist_arr)

                            
                
        
        #how do you gauge the quality of matches?
        
        pass
    
    #since this isn't a nn search model it belongs in the biggest deduper
    def brute_force_deduper(self, list_of_strings, comparison_algo, threshold=None):
        '''
        I: self explanatory
        O: dictionary {string: sorted list of matches}
        '''
        big_bag = {}
        #to deep copy or not to deep copy

        for index, s1 in enumerate(list_of_strings):
            small_bag = self.get_all_comparisons(
                s1, list_of_strings[index + 1:], comparison_algo, threshold)
            big_bag[s1] = sorted(small_bag, key=lambda x: x[0], reverse=True)

        return big_bag
    
    @profile
    def make_hist(self):
        '''
        these queries take a while

        *add timer bit
        '''
        import sys

        #could use a pre-allocated numpy array since the size is known up front
        hist_bag = []
        
        print('set size -- ', self.vector_space.shape[0])

        for l, observation in enumerate(self.vector_space):

            #just a way to keep track of where it's at
            if l % 30 == 0: 
                sys.stdout.write(str(l))
            

            dist, idx = self.model.kneighbors(observation, n_neighbors=2)
            dist, idx = dist[0], idx[0]

            # The query points come from the training set, so the first
            # neighbor returned is the point itself at distance ~0; record
            # the distance to the second-nearest neighbor instead.
            # remove_this_arg = [k for k, i in enumerate(idx) if i == index]
            # dist = [i for k, i in enumerate(dist) if i != remove_this_arg[0]]

            hist_bag.append(dist[1])


        return pd.Series(hist_bag)

    def plot_histogram(self, histogram_arr, text_pos=None):
        
        figure = plt.figure(figsize=(10,5))
        plt.hist(histogram_arr, bins=50, alpha=0.75) 
        plt.title("not scaled") 
        if text_pos:
            self.distribution_stats_text_label(text_pos[0], text_pos[1], histogram_arr)
        plt.show()


    def distribution_stats_text_label(self, position_x, position_y, data):
        label_position_decrement = 0.08 * position_y
        plt.text(position_x, position_y, "Skewness: {0:.2f}".format(skew(data))) 
        plt.text(position_x, position_y - label_position_decrement, "Mean: {0:.2f}".format(data.mean())) 
        plt.text(position_x, position_y - 2 * label_position_decrement, "Std: {0:.2f}".format(data.std())) 
        return None

    def get_all_comparisons(self, main_str, strings, comparison_algo, threshold=None):
        '''
        I: string, list of strings, string comparison algo e.g. Levenshtein, threshold
        O: list of tuples (match score, string)
        
        Takes a target string and compares it to the rest of strings in the list
        USE --

        get_all_comparisons('check', ['check1'], fuzz.ratio) 
        >>> [(91, 'check1')]
        '''
        match_bag = []

        for str_ in strings:
            match_rating = comparison_algo(main_str, str_)

            if threshold:
                if match_rating > threshold:
                    match_bag.append((match_rating, str_))
            else:
                match_bag.append((match_rating, str_))

        return match_bag
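A hypothetical end-to-end use of the class above ('subjects.txt' is a placeholder file with one record per line):

deduper = Deduper_NN()
deduper.read_in_the_file('subjects.txt')
deduper.build_vectorizer(deduper.orig_file, model_type='tfidf',
                         ngrams=2, tokenizer='char')
deduper.fit_model('brute', {'metric': 'cosine'})

# Strings within cosine radius 0.25 of the first record:
print(deduper.predict(deduper.orig_file[0], radius_threshold=0.25))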