Example No. 1
def test_partial_fit():
    """Checks whether inserting array is consitent with fitted data.

    `partial_fit` method should set all attribute values correctly.
    """
    n_samples = 12
    n_samples_partial_fit = 3
    n_features = 2
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    X_partial_fit = rng.rand(n_samples_partial_fit, n_features)

    lshf = LSHForest()

    # Test unfitted estimator
    lshf.partial_fit(X)
    assert_array_equal(X, lshf._fit_X)

    lshf.fit(X)

    # Insert wrong dimension
    assert_raises(ValueError, lshf.partial_fit,
                  np.random.randn(n_samples_partial_fit, n_features - 1))

    lshf.partial_fit(X_partial_fit)

    # size of _fit_X = n_samples + n_samples_partial_fit after insertion
    assert_equal(lshf._fit_X.shape[0],
                 n_samples + n_samples_partial_fit)
    # size of original_indices_[0] = n_samples + n_samples_partial_fit
    assert_equal(len(lshf.original_indices_[0]),
                 n_samples + n_samples_partial_fit)
    # size of trees_[1] = n_samples + n_samples_partial_fit
    assert_equal(len(lshf.trees_[1]),
                 n_samples + n_samples_partial_fit)
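
The test above exercises `LSHForest.partial_fit`. Below is a minimal standalone sketch of the same workflow, assuming an older scikit-learn (< 0.21) where `LSHForest` is still available; names and sizes are illustrative only.

import numpy as np
from sklearn.neighbors import LSHForest  # removed in scikit-learn 0.21

rng = np.random.RandomState(0)
X_initial = rng.rand(12, 2)
X_extra = rng.rand(3, 2)

lshf = LSHForest(random_state=0)
lshf.fit(X_initial)        # build the forest on the first batch
lshf.partial_fit(X_extra)  # insert three more points into the existing trees

# the index now covers all 15 points
print(lshf._fit_X.shape)   # (15, 2)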
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words = self.custom_stop_words)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Build tree:
        n_neighbors = []
        neighbors_indices = []
        working_batch_size = len(tweets)
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0: print "%r tweets analyzed out of %r for this batch" % (len(n_neighbors), working_batch_size)
            # Only search for neighbours of tweets with more than two distinct terms.
            if x.getnnz() > 2:
                neighbors = tree2.radius_neighbors(x, radius=self.sensitivity)[1]
                n_neighbors.append(len(neighbors[0]))
                neighbors_indices.append(neighbors)
            else:
                n_neighbors.append(1)
                neighbors_indices.append(np.array([np.array([0])]))

        neighbors_indices = [x for x in range(len(neighbors_indices)) if len(neighbors_indices[x][0]) > 2]

        return neighbors_indices
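
The method above relies on class state (`self.custom_stop_words`, `self.sensitivity`). A self-contained sketch of the same idea, flagging tweets that have several near-identical neighbours within a fixed cosine radius, is given below; the function name and thresholds are hypothetical, and an older scikit-learn (< 0.21) is assumed.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import LSHForest  # removed in scikit-learn 0.21

def flag_near_duplicates(tweets, radius=0.4, min_cluster=3):
    """Return indices of tweets that have at least `min_cluster`
    near-identical neighbours (including themselves)."""
    vect = CountVectorizer()
    X = vect.fit_transform(tweets)
    forest = LSHForest()
    forest.fit(X)
    flagged = []
    for i, x in enumerate(X):
        # indices of all fitted tweets within the cosine radius of tweet i
        neighbors = forest.radius_neighbors(x, radius=radius)[1]
        if len(neighbors[0]) >= min_cluster:
            flagged.append(i)
    return flagged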
Example No. 3
def text_hist():
    """
    Calculate histogram of text of images
    """
    with open('data/sift_names.pkl', 'rb') as f:
        names = cPickle.load(f)
    with open('data/sift_hist.pkl', 'rb') as f:
        sift_hists = cPickle.load(f)
    filenames = []
    for name in names:
        name = name.replace('img', 'descr')
        name = name.replace('.jpg', '.txt')
        filenames.append('shopping/images/' + name)
    vectorizer = CountVectorizer(input='filename', token_pattern=r"(?u)\w+", ngram_range=(1, 1), min_df=2)
    xall_transformed = vectorizer.fit_transform(filenames).tocsr()
    preprocessing.normalize(xall_transformed, copy=False)

    lamb = .5
    hists = scipy.sparse.hstack([xall_transformed * lamb, sift_hists * (1-lamb)]).toarray()
    preprocessing.normalize(hists, copy=False)
    model = LSHForest()
    model.fit(hists)
    with open('data/text_hist.pkl', 'wb') as f:
        cPickle.dump(xall_transformed, f)
    with open('data/vectorizer.pkl', 'wb') as f:
        cPickle.dump(vectorizer, f)
    with open('data/lshforest_combine.pkl', 'wb') as f:
        cPickle.dump(model, f)
Example No. 4
def test_neighbors_accuracy_with_n_estimators():
    """Checks whether accuracy increases as `n_estimators` increases."""
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = LSHForest(n_candidates=500, n_estimators=t)
        lshf.fit(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)]
            neighbors = lshf.kneighbors(query,
                                        n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Sorted accuracies should be equal to original accuracies
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
Example No. 5
    def get_nearest_neighbor_iterable(self, graphlist, start_graphs, start_is_subset=True):

        # vectorize all
        graphlist = list(graphlist)
        graphlist_ = copy.deepcopy(graphlist)
        X = self.vectorizer.transform_single(graphlist_)

        start_graphs = list(start_graphs)
        graphlist_ = copy.deepcopy(start_graphs)
        Y = self.vectorizer.transform_single(graphlist_)

        forest = LSHForest()
        forest.fit(X)
        #http://scikit-learn.org/stable/modules/neighbors.html
        distances, indices = forest.kneighbors(Y, n_neighbors=2)

        # we just assume that this is short...
        index = 0
        if start_is_subset:
            index += 1
        
        #matches= ( X_index ,Y_index, distance  )
        matches = [(indices[i, index], i, distances[i, index]) for i in range(len(indices))]
        matches.sort()

        # this looks super confusing....
        #for index, graph in enumerate(selection_iterator(graphlist, [a[0] for a in matches])):
        #    yield ((graph, start_graphs[matches[index][1]], X[matches[index][0]]))
        # so i wrote this:,,, you may even get rid of the matches variable i think.. and use indices directly
        for Xi, Yi, dist in matches:
            yield ((start_graphs[Yi], graphlist[Xi], X[Xi]))
Example No. 6
def lof(X, k, outlier_threshold=1.5, verbose=False):
    """Knn with KD trees"""
    start = time.time()
    lshf = LSHForest(random_state=42)
    lshf.fit(X)

    distance, index = lshf.kneighbors(X, n_neighbors=k)
    distance, index = distance[:, 1:], index[:, 1:]
    radius = distance[:, -1]

    """Calculate LRD."""
    LRD = np.mean(np.maximum(distance, radius[index]), axis=1)
    r = 1. / np.array(LRD)

    """Calculate outlier score."""
    outlier_score = np.sum(r[index], axis=1) / np.array(r, dtype=np.float16)
    outlier_score *= 1. / k

    # print ('Compute time: %g seconds.' % ((time.time() - start)))

    if verbose: print("Recording all outliers with outlier score greater than %s." \
                      % (outlier_threshold))

    outliers = []
    """ Could parallelize this for loop, but really not worth the overhead...
        Would get insignificant performance gain."""
    for i, score in enumerate(outlier_score):
        if score > outlier_threshold:
            outliers.append([i,X[i], score])

    if verbose:
        print("Detected outliers:")
        print(outliers)

    return outliers
Example No. 7
class EmbeddingNetworkBuilder:
    """ Basically a wrapper around sklearns LSH forest """
    def __init__(self, lsh_init=None):
        if lsh_init is None:
            self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
        else:
            self._lsh_forest = lsh_init
        self.iw = None
        self.m = None

    def fit_lsh_forest(self, embedding):
        self._lsh_forest.fit(embedding.m)
        self._embedding = embedding

    def extract_nn_network(self, nn=20):
        dir_graph_mat = self._lsh_forest.kneighbors_graph(X=self._embedding.m,
                                                          n_neighbors=nn + 1)
        return dir_graph_mat

    def make_undirected(self, dir_graph_mat):
        nodes = set(range(dir_graph_mat.shape[0]))
        edges = set([])
        for node_i in range(dir_graph_mat.shape[0]):
            for node_j in dir_graph_mat[node_i].nonzero()[1]:
                edges.add((node_i, node_j))
        return nodes, edges

    def get_forest(self):
        return self._lsh_forest

    def get_node_to_word(self):
        return self.iw
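
A hypothetical usage sketch for the wrapper above; `_ToyEmbedding` is a stand-in for whatever object exposes the embedding matrix as an attribute `m` (assuming scikit-learn < 0.21).

import numpy as np

class _ToyEmbedding:
    """Stand-in: anything exposing a 2-D matrix attribute `m`."""
    def __init__(self, m):
        self.m = m

emb = _ToyEmbedding(np.random.RandomState(0).rand(50, 16))
builder = EmbeddingNetworkBuilder()
builder.fit_lsh_forest(emb)
graph = builder.extract_nn_network(nn=5)       # sparse kNN graph, 50 x 50
nodes, edges = builder.make_undirected(graph)  # collect (i, j) edge pairs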
Example No. 8
class EmbeddingNetworkBuilder:
    """ Basically a wrapper around sklearns LSH forest """

    def __init__(self, lsh_init=None):
        if lsh_init is None:
            self._lsh_forest = LSHForest(n_estimators=25, n_candidates=1000)
        else:
            self._lsh_forest = lsh_init 
        self.iw = None
        self.m = None

    def fit_lsh_forest(self, embedding):
        self._lsh_forest.fit(embedding.m)
        self._embedding = embedding

    def extract_nn_network(self, nn=20):
        dir_graph_mat = self._lsh_forest.kneighbors_graph(X=self._embedding.m, n_neighbors=nn+1)
        return dir_graph_mat

    def make_undirected(self, dir_graph_mat):
        nodes = set(range(dir_graph_mat.shape[0]))
        edges = set([])
        for node_i in range(dir_graph_mat.shape[0]):
            for node_j in dir_graph_mat[node_i].nonzero()[1]:
                edges.add((node_i, node_j))
        return nodes, edges

    def get_forest(self):
        return self._lsh_forest
    
    def get_node_to_word(self):
        return self.iw
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words = self.common_twitter_handles)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Build tree:
        n_neighbors = []
        neighbors_indices = []
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0: print "%r tweets analyzed out of %r for this batch" % (len(n_neighbors), self.batch_size)
            neighbors = tree2.radius_neighbors(x, radius = .4)[1]
            n_neighbors.append(len(neighbors[0]))
            neighbors_indices.append(neighbors)

        neighbors_indices = [x for x in range(len(neighbors_indices)) if len(neighbors_indices[x][0]) > 2]

        return neighbors_indices
Example No. 10
def test_hash_functions():
    """Checks randomness of hash functions.

    Variance and mean of each hash function (projection vector)
    should be different from flattened array of hash functions.
    If hash functions are not randomly built (seeded with
    same value), variances and means of all functions are equal.
    """
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators,
                     random_state=rng.randint(0, np.iinfo(np.int32).max))
    lshf.fit(X)

    hash_functions = []
    for i in range(n_estimators):
        hash_functions.append(lshf.hash_functions_[i].components_)

    for i in range(n_estimators):
        assert_not_equal(np.var(hash_functions),
                         np.var(lshf.hash_functions_[i].components_))

    for i in range(n_estimators):
        assert_not_equal(np.mean(hash_functions),
                         np.mean(lshf.hash_functions_[i].components_))
Example No. 11
class Index(BaseIndex):
    """ LSH Forest Index
    """

    name = 'lsh_forest'

    def _fit(self, xs):
        """ Fit index
        :param xs: list of sample vectors
        :return:
        """
        self.index = LSHForest(
            n_estimators=self.parameters.get('n_estimators', 20))
        self.index.fit(xs)

    def _query(self, sample, k=5, **kwargs):
        """ Query index
        :param sample: Sample
        :param k:
        :param kwargs:
        :return:
        """
        x, _ = self.transform([sample])
        distances, idxs = self.index.kneighbors(x, n_neighbors=k + 1)
        neighbors = []
        for idx, d in zip(idxs[0], distances[0]):
            hashval = self.ys[idx]
            neighbors.append({
                'hashval': hashval,
                'similarity': min(1 - float(d), 1.0)
            })
        return neighbors
Example No. 12
def search_neighbors(request):
	designs = Design.objects.all()

	image_list = []
	for design in designs:
		image_list.append(str(design.uid) + ".png")

	d_geometry = settings.D_GEOMETRY
	designed_images = np.empty((len(image_list), d_geometry[0]*d_geometry[1]*3), dtype="float32")
	for i in range(len(image_list)):
		designed_images[i] = img2numpy_arr(settings.DESIGN_PATH + image_list[i]).reshape(d_geometry[0]*d_geometry[1]*3)
	designed_images /= 255
	
	lshf = LSHForest(random_state=42)
	lshf.fit(designed_images) 

	num = int(request.GET['num'])
	input_fname = str(request.GET['input'])
	input_image = img2numpy_arr(settings.DESIGN_PATH + input_fname)
	input_image = input_image.reshape(1, -1)/255
	_, indices = lshf.kneighbors(input_image, n_neighbors=num)

	similar_images = []
	for i in list(indices.reshape(-1)):
		similar_images.append({ 
			"image": str(designs[i].uid) + ".png", 
			"text": str(designs[i].history_text), 
			"like": int(designs[i].like),
			"filtered": str(designs[i].filtered)
		})

	return JsonResponse({
		"results": similar_images
	})
def Classify(nlp, keywords,
             categories):  #keywords  - list; categories - dict: {name; vector}
    counterDict = Counter(keywords)  #optimization for keywords duplicates
    sumVector = numpy.zeros(nlp.vocab.vectors_length)

    #temp
    text = ' '.join(keywords)

    for word, repCount in counterDict.items():  #summarizing word vectors
        curVect = nlp(word).vector
        sumVector += (curVect * repCount)

    vec = nlp(text).vector
    sim = cosine_similarity(vec, sumVector)
    print("Sim: " + str(sim))

    catArray = numpy.array(list(categories.values()))
    catKeys = list(categories.keys())
    #tree = KDTree(catArray, metric='pyfunc', func=cosine_similarity)
    #dist, ind = tree.query(sumVector, k=TOP_N_COUNT) #.reshape(-1, 1)

    print("Creating LSHForest...")

    lshf = LSHForest(n_candidates=70, n_estimators=30, n_neighbors=TOP_N_COUNT)
    lshf.fit(catArray)
    print("LSHForest was created")

    print("Getting neighbors...")
    distances, indices = lshf.kneighbors(sumVector.reshape((1, -1)))
    print("Got neighbors.")

    for curIndex in numpy.nditer(indices):
        print("Found category: " + str(catKeys[curIndex]))
        print("with distance: " + str(distances))
Example No. 14
    def single_batch(self, tweets):
        """Performs an approximate nearest neighbors search on tweets in the database
        passed to it. The database must be a list of tweets (text of the tweets only).
        
        Returns the indices of tweets with nearby neighbors (i.e. spam tweets).
        These indices correspond to indices within the batch of tweets fed to
        this function."""

        # Vectorize and fit tree:
        vect2 = CountVectorizer(stop_words=self.common_twitter_handles)
        X2 = vect2.fit_transform(tweets)
        tree2 = LSHForest()
        tree2.fit(X2)

        # Build tree:
        n_neighbors = []
        neighbors_indices = []
        for x in vect2.transform(tweets):
            if len(n_neighbors) % 100 == 0:
                print "%r tweets analyzed out of %r for this batch" % (
                    len(n_neighbors), self.batch_size)
            neighbors = tree2.radius_neighbors(x, radius=.4)[1]
            n_neighbors.append(len(neighbors[0]))
            neighbors_indices.append(neighbors)

        neighbors_indices = [
            x for x in range(len(neighbors_indices))
            if len(neighbors_indices[x][0]) > 2
        ]

        return neighbors_indices
Example No. 15
def Main():
    trainingSet, people = LoadTrainingSet()
    # Uncomment when running from console:
    # colorama.init()
    if loadPreviousResults:
        previouslyLearnedVectors, previouslyLearnedPeople = LoadPreviouslyLearnedResults()
        trainingSet.extend(previouslyLearnedVectors)
        people.extend(previouslyLearnedPeople)
    else:
        client.drop_database(Constants.PreviousResultsDb)

    chartsForest = LSHForest(n_neighbors = ChartsNeighbors, n_estimators = ChartsEstimators, n_candidates = ChartsCandidates)
    chartsForest.fit(trainingSet)

    peopleForest = LSHForest(n_neighbors = PeopleNeighbors, n_estimators = PeopleEstimators, n_candidates = PeopleCandidates)
    peopleForest.fit(people)

    while True:
        try:
            featureVector, person = GetNewInput()
            ShowCurrentPatient(person)
            warnings = DumbDiagnoser.GetDumbDiagnosis(featureVector, person)
            diagnosis, closestChartsPeople = Diagnose(chartsForest, featureVector)
            closestPeople = GetClosestPeople(peopleForest, person)
            ShowWarnings(warnings)
            ShowResults(diagnosis, closestChartsPeople, closestPeople)
            Learn(chartsForest, featureVector, peopleForest, person, diagnosis)
        except EOFError:
            print('Exiting')
            client.close()
            break
        except NoSuchRecordException as details:
            print(details)
        finally:
            print
Example No. 16
    def get_heap_and_forest(self, griter, k):
        '''
        so we create the heap and the forest...
        heap is (dist to hyperplane, count, graph)
        and the forest is just a nearest neighbor index from sklearn
        '''

        graphs = list(griter)
        graphs2 = copy.deepcopy(graphs)
        # transform does mess up the graph objects
        X = self.vectorizer.transform(graphs)

        forest = LSHForest()
        forest.fit(X)
        print 'got forest'

        heap = []
        for vector, graph in zip(X, graphs2):
            graph2 = nx.Graph(graph)
            heapq.heappush(heap, (
                self.sampler.estimator.predict_proba(self.sampler.vectorizer.transform_single(graph2))[0][1],
                # score ~ dist from hyperplane
                k + 1,  # making sure that the counter is high so we don't output the start graphs at the end
                graph))  # at last the actual graph

        print 'got heap'
        distances, unused = forest.kneighbors(X, n_neighbors=2)
        distances = [a[1] for a in distances]  # the second element should be the dist we want
        avg_dist = distances[len(distances) / 2]  # sum(distances)/len(distances)
        print 'got dist'

        return heap, forest, avg_dist
Example No. 17
def persist_attraction_similarities_to_db():
    # build LSHForest model for reduced dimension dataset
    svd = TruncatedSVD(n_components=10, n_iter=7)
    red_dim_itemuserdf = svd.fit_transform(itemuserdf)
    item_user_model = LSHForest()
    item_user_model.fit(red_dim_itemuserdf)

    # persist attractions similarities to db
    K=20        # query for K neighbors
    k=10        # return k neighbors
    for i in range(itemuserdf.shape[0]):
        distance, indices = item_user_model.kneighbors(
            red_dim_itemuserdf[i].reshape(1, -1), n_neighbors=K
        )
        weights = 1 - distance
        for j in range(k):
            if i != indices[0][j]:
                e = SimilarAttractions(
                    attraction_id=Attraction.objects.filter(
                        app_id=int(i)).values('attraction_id')[0]['attraction_id'],
                    similar_attraction_id=Attraction.objects.filter(
                        app_id=int(indices[0][j])).values('attraction_id')[0]['attraction_id'],
                    similarity=weights[0][j],
                    ts=timezone.now()
                )
                e.save()
Example No. 18
def score(factors):
    verifyCount = 3
    X, y = Sets.trainingSet
    test_set, databases = Sets.testSet
    X = FactorizeVectors(X, factors)
    test_set = FactorizeVectors(test_set, factors)
    correctionAverage = 0
    for i in range(verifyCount):
        best_predictions = 0
        clf = LSHForest(n_estimators = 10, n_candidates = 10)
        clf.fit(X)

        correct = 0
        total = 0

        for j in range(len(test_set)):
            total += 1
            actual = databases[j]
            distances, indices = clf.kneighbors(test_set[j], n_neighbors=5)
            predicted = GetPrediction(y, distances[0], indices[0])
            if (actual == predicted):
                correct += 1

        if (correct > best_predictions):
            best_predictions = correct
        correctionAverage += best_predictions
    correctionAverage = float(correctionAverage)/verifyCount
    return correctionAverage
Example No. 19
def test_partial_fit():
    """Checks whether inserting array is consitent with fitted data.

    `partial_fit` method should set all attribute values correctly.
    """
    n_samples = 12
    n_samples_partial_fit = 3
    n_features = 2
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    X_partial_fit = rng.rand(n_samples_partial_fit, n_features)

    lshf = LSHForest()

    # Test unfitted estimator
    lshf.partial_fit(X)
    assert_array_equal(X, lshf._fit_X)

    lshf.fit(X)

    # Insert wrong dimension
    assert_raises(ValueError, lshf.partial_fit,
                  np.random.randn(n_samples_partial_fit, n_features - 1))

    lshf.partial_fit(X_partial_fit)

    # size of _fit_X = n_samples + n_samples_partial_fit after insertion
    assert_equal(lshf._fit_X.shape[0], n_samples + n_samples_partial_fit)
    # size of original_indices_[0] = n_samples + n_samples_partial_fit
    assert_equal(len(lshf.original_indices_[0]),
                 n_samples + n_samples_partial_fit)
    # size of trees_[1] = n_samples + n_samples_partial_fit
    assert_equal(len(lshf.trees_[1]), n_samples + n_samples_partial_fit)
Example No. 20
def test_hash_functions():
    """Checks randomness of hash functions.

    Variance and mean of each hash function (projection vector)
    should be different from flattened array of hash functions.
    If hash functions are not randomly built (seeded with
    same value), variances and means of all functions are equal.
    """
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators,
                     random_state=rng.randint(0,
                                              np.iinfo(np.int32).max))
    lshf.fit(X)

    hash_functions = []
    for i in range(n_estimators):
        hash_functions.append(lshf.hash_functions_[i].components_)

    for i in range(n_estimators):
        assert_not_equal(np.var(hash_functions),
                         np.var(lshf.hash_functions_[i].components_))

    for i in range(n_estimators):
        assert_not_equal(np.mean(hash_functions),
                         np.mean(lshf.hash_functions_[i].components_))
Example No. 21
def test_distances():
    """Checks whether returned neighbors are from closest to farthest."""
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)
        # Returned neighbors should be from closest to farthest.
        assert_true(np.all(np.diff(distances[0]) >= 0))

        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_true(np.all(np.diff(distances[0]) >= 0))
Example No. 22
def test_distances():
    """Checks whether returned neighbors are from closest to farthest."""
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)
        # Returned neighbors should be from closest to farthest.
        assert_true(np.all(np.diff(distances[0]) >= 0))

        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_true(np.all(np.diff(distances[0]) >= 0))
Example No. 23
def test_fit():
    """Checks whether `fit` method sets all attribute values correctly."""
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators)
    lshf.fit(X)

    # _input_array = X
    assert_array_equal(X, lshf._fit_X)
    # A hash function g(p) for each tree
    assert_equal(n_estimators, len(lshf.hash_functions_))
    # Hash length = 32
    assert_equal(32, lshf.hash_functions_[0].components_.shape[0])
    # Number of trees_ in the forest
    assert_equal(n_estimators, len(lshf.trees_))
    # Each tree has entries for every data point
    assert_equal(n_samples, len(lshf.trees_[0]))
    # Original indices after sorting the hashes
    assert_equal(n_estimators, len(lshf.original_indices_))
    # Each set of original indices in a tree has entries for every data point
    assert_equal(n_samples, len(lshf.original_indices_[0]))
Example No. 24
def train():
    # Build the matching corpus: 398872 samples
    sku_names_texts = get_train_datas()
    sku_names_jieba = get_text_jieba(sku_names_texts)
    sku_names_with_spaces = []
    for sku_names in sku_names_jieba:
        sku_names_with_spaces.append(' '.join(sku_names))

    # Test data: 1000 samples
    keywords_texts = get_test_datas()
    keywords_jieba = get_text_jieba(keywords_texts)
    keywords_with_spaces = []
    for keywords in keywords_jieba:
        keywords_with_spaces.append(' '.join(keywords))

    tfidf_vec = TfidfVectorizer(min_df=3, max_features=None, ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1)
    x_train = tfidf_vec.fit_transform(sku_names_with_spaces)

    lshf = LSHForest(random_state=42)
    #lshf.fit(np.array(x_train))
    lshf.fit(x_train)

    for i, kw in enumerate(keywords_with_spaces):
        x_test = tfidf_vec.transform([kw])
        distances, indices = lshf.kneighbors(x_test.toarray(), n_neighbors=1)
        idx = indices[0][0]
        print(i, "||", keywords_texts[i], "||", sku_names_texts[idx])

        with open("result/lsh_v1_results.txt", 'a', encoding='utf8') as wf:
            wf.write(str(i) + "||" + keywords_texts[i] + "||" + sku_names_texts[idx] + "\n")
Example No. 25
def predict(login, file):
    login_features = mfcc(login, file)
    lshf = LSHForest(random_state=42)
    gmm = joblib.load(path + '/speaker_models/' + login + '.pkl')
    ubm = joblib.load(path + '/speaker_models/' + 'ubm.pkl')
    model = joblib.load(path + '/speaker_models/' + login + 'Model.pkl')
    gmm_likelihood_score = gmm.score(login_features)
    ubm_likelihood_score = ubm.score(login_features)
    likelihood_score = gmm_likelihood_score - ubm_likelihood_score
    login_features = [j for i in login_features for j in i]
    if len(model) > len(login_features):
        array = model[:len(login_features)]
        lshf.fit([array])
        distances, indices = lshf.kneighbors([login_features], n_neighbors=2)
        dist = pairwise_distances_argmin_min([array], [login_features])
    else:
        array = login_features[:len(model)]
        lshf.fit([array])
        distances, indices = lshf.kneighbors([model], n_neighbors=2)
        dist = pairwise_distances_argmin_min([array], [model])
    result = {}
    result['score'] = [likelihood_score, distances]
    result['distance'] = dist
    if likelihood_score > 0:
        result['Message'] = 'Authenticated'
    else:
        result['Message'] = 'Not Authenticated'   
    return result
Example No. 26
    def optimise(self, num_train_points, num_val_points, parameters):

        max_accuracy = -1
        optimal_estimators = -1
        optimal_n_neighbours = -1

        for item in self.get_generator(parameters):

            LSHf = LSHForest(random_state=42,
                             n_estimators=item['n_est'],
                             n_neighbors=item['n_neigh'])
            LSHf.fit(self.train.images[:num_train_points])
            distances, indices = LSHf.kneighbors(
                self.validation.images[:num_val_points], n_neighbors=5)

            accuracy, positions = self.model_accuracy(indices,
                                                      is_optimising=True)

            if accuracy > max_accuracy:
                max_accuracy = accuracy
                optimal_estimators = item['n_est']
                optimal_n_neighbours = item['n_neigh']

#         print(optimal_n_neighbours_predict)
        return max_accuracy, optimal_estimators, optimal_n_neighbours
Example No. 27
    def fit_model(self, data, n_estimators, n_neighbours):

        LSHf = LSHForest(random_state=42,
                         n_estimators=n_estimators,
                         n_neighbors=n_neighbours)
        LSHf.fit(data)
        return LSHf
Example No. 28
def knn_indices_func_approx(
        rep_pts: FloatTensor,  # (N, pts, dim)
        pts: FloatTensor,  # (N, x, dim)
        K: int,
        D: int) -> LongTensor:  # (N, pts, K)
    """
    Approximate CPU-based Indexing function based on K-Nearest Neighbors search.
    :param rep_pts: Representative points.
    :param pts: Point cloud to get indices from.
    :param K: Number of nearest neighbors to collect.
    :param D: "Spread" of neighboring points.
    :return: Array of indices, P_idx, into pts such that pts[n][P_idx[n],:]
    is the set k-nearest neighbors for the representative points in pts[n].
    """
    if rep_pts.is_cuda:
        rep_pts = rep_pts.cpu()
    if pts.is_cuda:
        pts = pts.cpu()
    rep_pts = rep_pts.data.numpy()
    pts = pts.data.numpy()

    region_idx = []

    for n, p in enumerate(rep_pts):
        P_particular = pts[n]
        lshf = LSHForest(n_estimators=20,
                         n_candidates=100,
                         n_neighbors=D * K + 1)
        lshf.fit(P_particular)
        indices = lshf.kneighbors(p, return_distance=False)
        region_idx.append(indices[:, 1::D])
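    # NOTE: the snippet is truncated here; presumably the per-cloud index arrays
    # collected in region_idx are stacked and converted back to a torch LongTensor,
    # e.g. something like: return torch.from_numpy(np.stack(region_idx))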
def CreateAndconfigureLSHForest(categories): # categories - dict: {name; vector}
    print("Creating LSHForest...")
    catArray = numpy.array(list(categories.values()))
    lshf = LSHForest(n_candidates=70, n_estimators=30, n_neighbors=TOP_N_COUNT)
    lshf.fit(catArray)
    print("LSHForest was created")
    return lshf
Example No. 30
def build_index(data, n_estimators=20, n_candidates=100, n_neighbors=10, seed=0):
    lshf = LSHForest(n_estimators=n_estimators, n_candidates=n_candidates,
                     n_neighbors=n_neighbors, random_state=seed)
    t0 = time()
    lshf.fit(data)
    duration = time() - t0
    return lshf, duration
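
A hypothetical usage example for `build_index`, assuming scikit-learn < 0.21:

import numpy as np

data = np.random.RandomState(0).rand(1000, 32)
index, build_time = build_index(data, n_estimators=10, n_candidates=50)
print("built LSH forest in %.3f s" % build_time)
distances, indices = index.kneighbors(data[:5], n_neighbors=3)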
    def get_nearest_neighbor_iterable(self,
                                      graphlist,
                                      start_graphs,
                                      start_is_subset=True):

        # vectorize all
        graphlist = list(graphlist)
        graphlist_ = copy.deepcopy(graphlist)
        X = self.vectorizer.transform_single(graphlist_)

        start_graphs = list(start_graphs)
        graphlist_ = copy.deepcopy(start_graphs)
        Y = self.vectorizer.transform_single(graphlist_)

        forest = LSHForest()
        forest.fit(X)
        #http://scikit-learn.org/stable/modules/neighbors.html
        distances, indices = forest.kneighbors(Y, n_neighbors=2)

        # we just assume that this is short...
        index = 0
        if start_is_subset:
            index += 1

        #matches= ( X_index ,Y_index, distance  )
        matches = [(indices[i, index], i, distances[i, index])
                   for i in range(len(indices))]
        matches.sort()

        # this looks super confusing....
        #for index, graph in enumerate(selection_iterator(graphlist, [a[0] for a in matches])):
        #    yield ((graph, start_graphs[matches[index][1]], X[matches[index][0]]))
        # so i wrote this:,,, you may even get rid of the matches variable i think.. and use indices directly
        for Xi, Yi, dist in matches:
            yield ((start_graphs[Yi], graphlist[Xi], X[Xi]))
Example No. 32
def runForestLSHSizeAnalysis(argsdict, data, inlbl, fPath, fName, fileN, i):
    start = time.time()
    tree = LSHForest(random_state=42)
    tree.fit(data)
    end = time.time()

    return sys.getsizeof(tree), (end - start)
Example No. 33
def test_neighbors_accuracy_with_n_estimators():
    """Checks whether accuracy increases as `n_estimators` increases."""
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = LSHForest(n_candidates=500, n_estimators=t)
        lshf.fit(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)]
            neighbors = lshf.kneighbors(query, n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Sorted accuracies should be equal to original accuracies
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
Example No. 34
def test_fit():
    """Checks whether `fit` method sets all attribute values correctly."""
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(n_estimators=n_estimators)
    lshf.fit(X)

    # _input_array = X
    assert_array_equal(X, lshf._fit_X)
    # A hash function g(p) for each tree
    assert_equal(n_estimators, len(lshf.hash_functions_))
    # Hash length = 32
    assert_equal(32, lshf.hash_functions_[0].components_.shape[0])
    # Number of trees_ in the forest
    assert_equal(n_estimators, len(lshf.trees_))
    # Each tree has entries for every data point
    assert_equal(n_samples, len(lshf.trees_[0]))
    # Original indices after sorting the hashes
    assert_equal(n_estimators, len(lshf.original_indices_))
    # Each set of original indices in a tree has entries for every data point
    assert_equal(n_samples, len(lshf.original_indices_[0]))
class ScikitLearnLsh(NearestNeighborAlgorithm):
    """
    This ``NearestNeighborAlgorithm`` uses scikit-learn's implementation of a locality sensitive
    hash to find approximate nearest neighbors.

    Parameters
    ----------
    random_state: int, optional (default=12345)
        Used to initialize the LSHForest, so that runs are consistent.
    """
    def __init__(self, params: Dict[str, Any]):
        random_state = params.pop('random_state', 12345)
        self.lsh = LSHForest(random_state=random_state)

    def fit(self, vectors: List[numpy.array]):
        logger.info("Fitting LSH with %d vectors", len(vectors))
        self.lsh.fit(vectors)

    def get_neighbors(self, query_vector: numpy.array,
                      num_neighbors: int) -> List[Tuple[int, float]]:
        if len(query_vector.shape) == 1:
            query_vector = [query_vector]
        logger.info("Getting neighbors for %d vectors", len(query_vector))
        scores, neighbor_indices = self.lsh.kneighbors(
            query_vector, n_neighbors=num_neighbors)
        logger.info("Neighbors retrieved")
        result = [
            zip(neighbor_indices[i], scores[i])
            for i in range(len(neighbor_indices))
        ]
        if len(result) == 1:
            result = result[0]
        return result
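
A hypothetical usage sketch for `ScikitLearnLsh`, assuming the module's own imports (numpy, logging, typing) are in place and an older scikit-learn (< 0.21):

vectors = [numpy.random.RandomState(i).rand(16) for i in range(100)]
lsh = ScikitLearnLsh({'random_state': 0})
lsh.fit(vectors)
for index, distance in lsh.get_neighbors(vectors[0], num_neighbors=5):
    print(index, distance)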
Example No. 36
 def create_tree(self,listNames,variableName):
     # Build the LSHForest only once, for the main database
     lshf = LSHForest(n_estimators=50,n_candidates=500)
     TF, tfidfs = self.create_TDIDF(self.tokenize(listNames))
     lshf.fit(tfidfs)        
     pickle.dump(lshf,open("{0}/{1}_lshf.dump".format(self.folderSaveData,variableName),"wb+"))
     pickle.dump(listNames,open("{0}/{1}_listNames.dump".format(self.folderSaveData,variableName),"wb+"))
     pickle.dump(TF,open("{0}/{1}_TF.dump".format(self.folderSaveData,variableName),"wb+"))
Example No. 37
def hash_movie_similarity(um, num_neighbors=6):
    lsh = LSHForest(random_state=470957)
    lsh.fit(um.T)

    # Don't compare to self: request one extra neighbor, then drop the first column.
    dist, ind = lsh.kneighbors(um.T, n_neighbors=num_neighbors+1, return_distance=True)
    sim = 1 - dist
    return sim[:,1:], ind[:,1:]
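
A hypothetical usage sketch, where `um` is a user-by-movie ratings matrix (rows are users, columns are movies):

import numpy as np

um = np.random.RandomState(0).rand(200, 40)
similarities, neighbor_ids = hash_movie_similarity(um, num_neighbors=6)
print(similarities.shape, neighbor_ids.shape)  # (40, 6) for both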
Example No. 38
def test_radius_neighbors():
    """Checks whether Returned distances are less than `radius`

    At least one point should be returned when the `radius` is set
    to mean distance from the considering point to other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with the `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query,
                                          radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries, return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)

    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Distances of exact neighbors is less than or equal to approximate
    assert_true(
        np.all(
            np.less_equal(np.sort(distances_exact[0]),
                          np.sort(distances_approx[0]))))
Example No. 39
class LSH_KNN:
    def __init__(self, weights='uniform', **kwargs):
        self.n_neighbors = kwargs['n_neighbors']
        self.lsh = LSHForest(**kwargs)
        self.weights = weights

    def fit(self, X, y):
        self.y = y
        self.X = X
        self.lsh.fit(X)

    def predict_top_n(self, test_X, n):
        _, indices = self.lsh.kneighbors(test_X, n_neighbors=self.n_neighbors)
        votes = np.zeros((len(test_X), n))
        for i in range(len(indices)):
            votes[i] = np.bincount([self.y[j] for j in indices[i]]).argsort()[-n:][::-1]
        return votes.astype(int)

    def predict_proba(self, test_X, return_dists=False):
        # SMOOTHING PARAMETER TO PREVENT 0 PROBA; https://stats.stackexchange.com/questions/83600/how-to-obtain-the-class-conditional-probability-when-using-knn-classifier
        s = 0.1
        _, neighbor_indices = self.lsh.kneighbors(test_X, n_neighbors=self.n_neighbors)
        dists = []
        proba = np.zeros((len(test_X), np.amax(self.y) + 1))
        for test_point in range(len(neighbor_indices)):
            if self.weights == 'uniform':
                weights = np.ones(len(neighbor_indices[test_point]))
            elif self.weights == 'distance':
                weights = [1 / self.dist(test_X[test_point], self.X[j]) for j in neighbor_indices[test_point]]
            weighted_class_counts = np.bincount([self.y[j] for j in neighbor_indices[test_point]], weights=weights, minlength=np.amax(self.y)+1)
            proba[test_point] = np.true_divide(weighted_class_counts + s, np.sum(weighted_class_counts) + len(weighted_class_counts)*s)
            if return_dists:
                test_point_dists = {}
                for neighbor_index in neighbor_indices[test_point]:
                    if self.y[neighbor_index] not in test_point_dists:
                        test_point_dists[self.y[neighbor_index]] = []
                    test_point_dists[self.y[neighbor_index]].append(self.dist(test_X[test_point], self.X[neighbor_index]))
                dists.append(test_point_dists)
        if return_dists:
            return proba, dists
        return proba

    def predict(self, test_X):
        _, neighbor_indices = self.lsh.kneighbors(test_X, n_neighbors=self.n_neighbors)
        result = np.zeros(len(test_X))
        for test_point in range(len(neighbor_indices)):
            if self.weights == 'uniform':
                weights = np.ones(len(neighbor_indices[test_point]))
            elif self.weights == 'distance':
                weights = [1 / self.dist(test_X[test_point], self.X[j]) for j in neighbor_indices[test_point]]
            weighted_class_counts = np.bincount([self.y[j] for j in neighbor_indices[test_point]], weights=weights)
            result[test_point] = np.argmax(weighted_class_counts)
        return result.astype(int)

    def dist(self, a, b):
        return np.linalg.norm(a - b)
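
A hypothetical usage sketch for `LSH_KNN` on a toy two-class problem (scikit-learn < 0.21):

import numpy as np

rng = np.random.RandomState(0)
X_train = rng.rand(100, 8)
y_train = rng.randint(0, 2, size=100)
X_test = rng.rand(10, 8)

clf = LSH_KNN(weights='uniform', n_neighbors=5)
clf.fit(X_train, y_train)
print(clf.predict(X_test))        # hard labels
print(clf.predict_proba(X_test))  # smoothed class probabilities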
Example No. 40
def test_radius_neighbors():
    """Checks whether Returned distances are less than `radius`

    At least one point should be returned when the `radius` is set
    to mean distance from the considering point to other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with the `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)

    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Distances of exact neighbors is less than or equal to approximate
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))
Example No. 41
def trainLSH(train, test, val):
    n_feat = train[0].size
    train_data = train[:, :-1]
    train_labels = train[:, n_feat - 1]
    val_data = val[:, :-1]
    val_labels = val[:, n_feat - 1]
    lshf = LSHForest(random_state=42)
    lshf.fit(train_data)
    countarrLSH = lshFunct(test, val, n_feat, lshf, train_labels)
    return countarrLSH
Example No. 42
 def BuildModel(self, data, labels):
   # Create and train the classifier.
   lshf = LSHForest(n_estimators = self.n_estimators,
                    min_hash_match = self.min_hash_match,
                    n_candidates = self.n_candidates,
                    radius_cutoff_ratio = self.radius_cutoff_ratio,
                    radius = self.radius,
                    n_neighbors = self.n_neighbors)
   lshf.fit(data)
   return lshf
Example No. 43
class LSHForestSearch:
    def __init__(self, features, k):
        self.lshf = LSHForest(n_estimators=1, n_candidates=1,
                     n_neighbors=k)
        self.k = k
        
        self.lshf.fit(features)
        
    def search(self, features):
        
        return self.lshf.kneighbors(features, return_distance=False, n_neighbors=self.k)    
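
A hypothetical usage sketch for `LSHForestSearch`: index 1000 random 64-dimensional feature vectors and query the first five.

import numpy as np

features = np.random.RandomState(0).rand(1000, 64)
searcher = LSHForestSearch(features, k=3)
print(searcher.search(features[:5]))  # (5, 3) array of neighbour indices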
Example No. 44
def findNN(rawData_review):
    # fetch user_id and review text + xstar
    data_review = rawData_review.map(lambda x: (x[1], x[3] + " " + str(x[5]) + "star"))

    # knn parameter
    k = 10

    # nltk
    nltk.download('stopwords')
    stop_words = stopwords.words('english')

    # find top N users similar to user
    # go through train_review and generate word cloud for each user
    userReviewText = data_review.reduceByKey(strjoin)
    userReviewText = userReviewText.map(lambda x: (x[0], filterStopwords(x[1], stop_words)))

    # find corpus
    # if os.path.exists("feature.pkl"):
    #    # Load it
    #    transformer = pickle.load(open("transformer.pkl", "rb"))
    #    features = pickle.load(open("feature.pkl", "rb"))
    # else:
    # (id, text)
    corpusFullMap = userReviewText.map(lambda x: (x[0], x[1])).collect()
    corpusFullMap = zip(*corpusFullMap)

    # get the partial map of (idx, id) in the form of dict and (text)
    idMap = dict(enumerate(corpusFullMap[0]))
    textMap = corpusFullMap[1]

    transformer = TfidfVectorizer(min_df=20, ngram_range=(1, 3), analyzer='word', max_features=1000)
    features = transformer.fit_transform(textMap)
    # Save transformer
    # with open('transformer.pkl', 'wb') as f:
    #    pickle.dump(transformer, f, pickle.HIGHEST_PROTOCOL)
    # Save features
    # with open('feature.pkl', 'wb') as f:
    #    pickle.dump(features, f, pickle.HIGHEST_PROTOCOL)

    # initialize lsh
    lshf = LSHForest(random_state=42)
    lshf.fit(features)

    # find tf-idf of word vector
    userReviewVec = userReviewText.map(lambda x: (x[0], tfidf(x[1], transformer, lshf, k, idMap)))

    # test load and save of rdd

    # if os.path.exists("result"):
    #    shutil.rmtree("result")

    # userReviewVec.map(lambda x: (str(x[0]), " ".join(map(str, x[1][0])), " ".join(map(str, x[1][1]))))\
    #    .saveAsTextFile('result')
    return userReviewVec, lshf, transformer, idMap, stop_words
Example No. 45
def build_LSH_Forest(orig_frames):
    '''
    Inputs: The list of feature encodings for the full movie.
    Outputs: A locality-specific hashing (LSH) forest object (as implemented in the
    scikit-learn.neighbors module)
    Purpose: Efficiently creates a neighbor-based system so that single frames can be
    placed near similar frames in terms of their mutual encodings emerging from the 
    VGG16 network model.
    '''
    lshf = LSHForest(n_estimators=20, n_candidates=1000, random_state=42)
    lshf.fit(orig_frames)
    return lshf
Example No. 46
class LHSForestEngine:

    def __init__(self):
        self.engine = LSHForest(random_state=42)
        self.name = "LHS"

    def fit(self, data):
        self.engine.fit(data)

    def dist(self, data):
        distances, indices = self.engine.kneighbors(data, n_neighbors=1)
        return distances.ravel()
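
A hypothetical usage sketch for `LHSForestEngine`: fit reference data, then get the distance from each query row to its single nearest neighbour.

import numpy as np

engine = LHSForestEngine()
engine.fit(np.random.RandomState(0).rand(500, 10))
print(engine.dist(np.random.RandomState(1).rand(5, 10)))  # 5 distances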
Example No. 47
 def calculate_duplication_number(self,text_list):
     print "length is ", len(text_list)
     tf_vectorizer = CountVectorizer(stop_words=None,analyzer='word',ngram_range=(5,5))
     #print text_list
     tf = tf_vectorizer.fit_transform(text_list)
     #print tf_vectorizer.get_feature_names()
     print tf[0]
     #print tf[123]
     lshf = LSHForest()
     #print tf
     lshf.fit(tf)
     distance,index = lshf.kneighbors(tf,n_neighbors=1)
     print distance, index
Example No. 48
  def metric(self):
    totalTimer = Timer()
    with totalTimer:
      model = LSHForest(**self.build_opts)
      model.fit(self.data[0])

      distances,indices = model.kneighbors(self.data[1],
        n_neighbors=self.n_neighbors)

    metric = {}
    metric["runtime"] = totalTimer.ElapsedTime()

    return metric
Example No. 49
def startQuery():
    while True:

      try:
          ipt = raw_input('Directory of query:')
      except ImportError:
          print 'invalid type'
      else:
          query = ipt
      if query == 'exit()':
          break

      print 'loading query...'
      try:
          token = get_tokens_by_dir(query)
      except IOError:
          print 'invalid file name'
      else:
##########################################query preprocessing
           print 'query pre-processing...'
           stopped_tokens = [i for i in token if not i in en_stop]
           p_stemmer = PorterStemmer()
           stemed_tokens = []
           for i in stopped_tokens:
               try:
                   temp_token = str(p_stemmer.stem(i))
                   stemed_tokens.append(temp_token)
               except IndexError:
                   pass
           tokens = [stemed_tokens]
######################################################################################
           dictionary_new = corpora.Dictionary(tokens)
           corpus_new = [dictionary_new.doc2bow(text) for text in tokens]
           QUERY_TOPIC = np.zeros([1,num_topic]) ## topic vector for query

           new_topics = LDA[corpus_new]


           for i in new_topics[0]:
               print(i)
               QUERY_TOPIC[0,i[0]] = i[1] ##assign new topics to query doc-topic matrix

           print 'fetching results for you...'
           lshf = LSHForest(random_state=42)
           lshf.fit(DOC_TOPICS) ##fit the local sensitive hash forest with training data POINT_SET
           dist,indices=lshf.kneighbors(QUERY_TOPIC,n_neighbors=20)
           print indices
Example No. 50
def fit_lshf(data):
    logger.info('Fitting  LSHForest...')
    from sklearn.neighbors import LSHForest
    lshf = LSHForest(
        n_estimators=20,
        min_hash_match=4,
        n_candidates=200,
        n_neighbors=2,
        radius=1.0,
        radius_cutoff_ratio=0.9,
        random_state=None,
    )
    lshf.fit(data)
    return lshf
Example No. 51
def test_kneighbors():
    """Checks whether desired number of neighbors are returned.

    It is guaranteed to return the requested number of neighbors
    if `min_hash_match` is set to 0. Returned distances should be
    in ascending order.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(min_hash_match=0)
    # Test unfitted estimator
    assert_raises(ValueError, lshf.kneighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        neighbors = lshf.kneighbors(query, n_neighbors=n_neighbors,
                                    return_distance=False)
        # Desired number of neighbors should be returned.
        assert_equal(neighbors.shape[1], n_neighbors)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.kneighbors(queries,
                                           n_neighbors=1,
                                           return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # Test only neighbors
    neighbors = lshf.kneighbors(queries, n_neighbors=1,
                                return_distance=False)
    assert_equal(neighbors.shape[0], n_queries)
    # Test random point(not in the data set)
    query = rng.randn(n_features)
    lshf.kneighbors(query, n_neighbors=1,
                    return_distance=False)
    # Test n_neighbors at initialization
    neighbors = lshf.kneighbors(query, return_distance=False)
    assert_equal(neighbors.shape[1], 5)
    # Test `neighbors` has an integer dtype
    assert_true(neighbors.dtype.kind == 'i',
                msg="neighbors are not in integer dtype.")
Example No. 52
def test_graphs():
    """Smoke tests for graph methods."""
    n_samples_sizes = [5, 10, 20]
    n_features = 3
    rng = np.random.RandomState(42)

    for n_samples in n_samples_sizes:
        X = rng.rand(n_samples, n_features)
        lshf = LSHForest(min_hash_match=0)
        lshf.fit(X)

        kneighbors_graph = lshf.kneighbors_graph(X)
        radius_neighbors_graph = lshf.radius_neighbors_graph(X)

        assert_equal(kneighbors_graph.shape[0], n_samples)
        assert_equal(kneighbors_graph.shape[1], n_samples)
        assert_equal(radius_neighbors_graph.shape[0], n_samples)
        assert_equal(radius_neighbors_graph.shape[1], n_samples)
Example No. 53
def lshf_scikit(data, n_neighbors=4,
               n_estimators=10,
               min_hash_match=4,
               n_candidates=10,
               random_state=None):
   n_neighbors += 1

   # initialize nearest neighbor model
   nbrs = LSHForest(n_neighbors=n_neighbors,
                    n_estimators=n_estimators,
                    min_hash_match=min_hash_match,
                    n_candidates=n_candidates,
                    random_state=random_state)

   # fit nearest neighbor model to the data
   nbrs.fit(data)

   # return the distances and indices
   return nbrs.kneighbors(data)
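
A hypothetical usage sketch for `lshf_scikit`: distances and indices of the 4 nearest neighbours (plus the point itself) for every row.

import numpy as np

data = np.random.RandomState(0).rand(200, 12)
distances, indices = lshf_scikit(data, n_neighbors=4)
print(distances.shape, indices.shape)  # (200, 5) for both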
Example No. 54
def test_distances():
    """Checks whether returned neighbors are from closest to farthest."""
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)

        # Returned neighbors should be from closest to farthest, that is
        # increasing distance values.
        assert_true(np.all(np.diff(distances[0]) >= 0))
Example No. 55
class LSHForestSearch:
    def __init__(self, docs):
        self.lshf = LSHForest(n_estimators=1, n_candidates=1,
                              n_neighbors=1)
        self.dv = DictVectorizer()
        dicts = []
        for d in docs:
            dicts.append(dict([(w, 1) for w in d]))
        self.dv.fit(dicts)
        features = self.dv.transform(dicts)
        # floats are faster
        # features = csr_matrix(features, dtype=int)
        self.lshf.fit(features)
        
    def search(self, docs):
        dicts = []
        for d in docs:
            dicts.append(dict([(w, 1) for w in d]))
        features = self.dv.transform(dicts)
        # floats are faster
        # features = csr_matrix(features, dtype=int)
        return self.lshf.kneighbors(features, return_distance=False)    
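
A small, hedged usage sketch for the class above: documents are passed as iterables of tokens, and `search` returns the indices of the nearest stored documents (on very small corpora LSHForest may warn that the number of candidates is low).

docs = [["spam", "ham", "eggs"], ["cat", "dog", "fish"], ["spam", "spam", "ham"]]
index = LSHForestSearch(docs)
print(index.search([["spam", "eggs"]]))   # e.g. array([[0]])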
def single_batch(tweet_db):
    """Performs an approximate nearest neighbors search on tweets in the database
    passed to it. The database must be a list of tweets (text of the tweets only).
    Returns the mean number of neighbors (nearly-identical tweets) that a given
    tweet has, the tweets that are considered neighbors (i.e. spam), the number
    of tweets that are spam (number of tweets with at least 1 other neighbor),
    and the amount of time that it took to run the search on the database."""

    # Vectorize and fit tree:
    timer = time.time()
    vect2 = TfidfVectorizer()
    X2 = vect2.fit_transform(tweet_db)
    tree2 = LSHForest()
    tree2.fit(X2)
    print "that took %2f seconds" % (time.time()-timer)

    # Build tree:
    timer = time.time()
    n_neighbors = []
    neighbors_indices = []
    for x in vect2.transform(tweet_db):
        if len(n_neighbors) % 100 == 0: print len(n_neighbors)
        neighbors = tree2.radius_neighbors(x, radius = .3)[1]
        n_neighbors.append(len(neighbors[0]))
        neighbors_indices.append(neighbors)
    tree_build_time = (time.time() - timer)

    # Find indices of tweets with more than 2 near-identical neighbours:
    l = [i for i, x in enumerate(n_neighbors) if x > 2]

    # Get indices of the tweets that are parts of close clusters:
    len_l = len(set(l))
    actual_neighbors = []
    for x in set(l):
        for neigh in neighbors_indices[x][0]:
            actual_neighbors.append(tweet_db[neigh])

    return np.mean(n_neighbors), actual_neighbors, len_l, tree_build_time, neighbors_indices
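
A hedged usage sketch for the function above, with toy near-duplicate strings standing in for real tweets:

tweets = ["win a free phone now", "win a free phone now!!",
          "lunch at noon?", "win a free phone now"] * 10
mean_nn, spam_tweets, n_spam, build_time, nn_idx = single_batch(tweets)
print "mean neighbours: %r, spam clusters: %r" % (mean_nn, n_spam)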
Example No. 57
def test_candidates():
    """Checks whether candidates are sufficient.

    This should handle the cases when number of candidates is 0.
    User should be warned when number of candidates is less than
    requested number of neighbors.
    """
    X_train = np.array([[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1],
                        [6, 10, 2]], dtype=np.float32)
    X_test = np.array([7, 10, 3], dtype=np.float32)

    # For zero candidates
    lshf = LSHForest(min_hash_match=32)
    lshf.fit(X_train)

    message = ("Number of candidates is not sufficient to retrieve"
               " %i neighbors with"
               " min_hash_match = %i. Candidates are filled up"
               " uniformly from unselected"
               " indices." % (3, 32))
    assert_warns_message(UserWarning, message, lshf.kneighbors,
                         X_test, n_neighbors=3)
    distances, neighbors = lshf.kneighbors(X_test, n_neighbors=3)
    assert_equal(distances.shape[1], 3)

    # For candidates less than n_neighbors
    lshf = LSHForest(min_hash_match=31)
    lshf.fit(X_train)

    message = ("Number of candidates is not sufficient to retrieve"
               " %i neighbors with"
               " min_hash_match = %i. Candidates are filled up"
               " uniformly from unselected"
               " indices." % (5, 31))
    assert_warns_message(UserWarning, message, lshf.kneighbors,
                         X_test, n_neighbors=5)
    distances, neighbors = lshf.kneighbors(X_test, n_neighbors=5)
    assert_equal(distances.shape[1], 5)
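
Outside the test suite, the same warning can be captured (or silenced) with the standard warnings module; a minimal sketch assuming the same overly strict `min_hash_match`:

import warnings
import numpy as np
from sklearn.neighbors import LSHForest

X = np.random.RandomState(0).rand(5, 3).astype(np.float32)
lshf = LSHForest(min_hash_match=32).fit(X)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    lshf.kneighbors(X[:1], n_neighbors=3)
print([w.category.__name__ for w in caught])   # expect ['UserWarning']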
Example No. 58
def filter_data(folder, same_dist_threshold):
    lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=10)
    data, pic_list = load_unknown_data(folder)
    for index in range(len(pic_list)):
        pic_list[index] = os.path.join(folder, pic_list[index])
    new_data = np.transpose(data, (0,3,1,2))
    # print 'load deepid model'
    # model, get_Conv_FeatureMap = load_deepid_model(deepid_model_file, deepid_weight_file)
    start = time()
    data_feature = get_Conv_FeatureMap([new_data,0])[0]
    # print 'data_feature.shape :', data_feature.shape
    # print 'pic_list :', pic_list

    lshf.fit(data_feature, pic_list)
    need_remove_list = set()
    no_same_feature_list = []
    for index_i in range(len(data_feature)):
        if len(no_same_feature_list) == 0:
            no_same_feature_list.append(data_feature[index_i:index_i+1])
        else:
            tmp = lshf.kneighbors(data_feature[index_i:index_i+1], n_neighbors=10, return_distance=True)
            tmp = zip(tmp[0][0], tmp[1][0])
            for index_j in range(len(tmp)):
                if tmp[index_j][1] == index_i:
                    continue
                if tmp[index_j][0] < same_dist_threshold:
                    need_remove_list.add(pic_list[index_i])
    for path in need_remove_list:
        try:
            os.remove(path)
        except OSError:
            print 'error path :', path
            continue

    end = time()
    print 'filter time :', (end - start)
    return len(no_same_feature_list)
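
The deduplication idea above, sketched without the DeepID feature extractor (which is not shown in this snippet): any vector whose nearest non-self neighbour lies within a distance threshold is flagged as a near-duplicate. The helper below is illustrative only.

import numpy as np
from sklearn.neighbors import LSHForest

def near_duplicate_indices(features, threshold=0.1):
    lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=2)
    lshf.fit(features)
    distances, _ = lshf.kneighbors(features, n_neighbors=2)
    # column 0 is (usually) the query itself at distance ~0,
    # column 1 its closest other sample (cosine distance)
    return np.where(distances[:, 1] < threshold)[0]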
Example No. 59
def score(factors):
    verifyCount = 50
    neighbors = 5
    estimators = 10
    candidates = 10
    X, y = Sets.trainingSet
    test_set, databases = Sets.testSet
    X = Common.FactorizeVectors(X, factors)
    test_set = Common.FactorizeVectors(test_set, factors)
    vectorAndDatabaseList = list(zip(test_set, databases))
    best_neighbor, best_candidates, best_estimator, best_predictions = 0, 0, 0, 0
    correctionAverage = 0
    for i in range(verifyCount):
        best_predictions = 0
        clf = LSHForest(n_neighbors=neighbors, n_estimators=estimators, n_candidates=candidates)
        clf.fit(X)

        correct = 0
        total = 0

        for vectorAndDb in vectorAndDatabaseList:
            total += 1
            actual = vectorAndDb[1]
            #predicted = clf.predict(vectorAndDb[0])[0]
            distances, indices = clf.kneighbors(vectorAndDb[0], n_neighbors=neighbors)
            predicted = GetPrediction(y, distances[0], indices[0])
            if (actual == predicted):
                correct += 1
            #print('Actual: ' + actual +', predicted: ' + predicted)

        if (correct > best_predictions):
            best_predictions = correct
            best_neighbor, best_candidates, best_estimator = neighbors, candidates, estimators
        correctionAverage += best_predictions
    correctionAverage = float(correctionAverage)/verifyCount
    return correctionAverage, best_neighbor, best_candidates, best_estimator
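
`GetPrediction` is not defined in this snippet; a plausible stand-in (purely illustrative, not the original helper) is a simple majority vote over the labels of the returned neighbours:

from collections import Counter

def GetPrediction(y, distances, indices):
    # majority vote among the labels of the nearest neighbours
    labels = [y[i] for i in indices]
    return Counter(labels).most_common(1)[0][0]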
Example No. 60
class SMOTE(OverSampler):
    """Class to perform over-sampling using SMOTE.

    This object is an implementation of SMOTE - Synthetic Minority
    Over-sampling Technique, and the variations Borderline SMOTE 1, 2 and
    SVM-SMOTE.

    Parameters
    ----------
    ratio : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio corresponds to the number
        of samples in the minority class over the number of samples
        in the majority class.

    random_state : int or None, optional (default=None)
        Seed for random number generation.

    verbose : bool, optional (default=True)
        Whether or not to print information about the processing.

    k : int, optional (default=5)
        Number of nearest neighbours used to construct synthetic samples.

    m : int, optional (default=10)
        Number of nearest neighbours to use to determine if a minority sample
        is in danger.

    out_step : float, optional (default=0.5)
        Step size when extrapolating.

    kind : str, optional (default='regular')
        The type of SMOTE algorithm to use, one of the following options:
        'regular', 'borderline1', 'borderline2', 'svm'.

    nn_method : str, optional (default='exact')
        The nearest neighbors method to use which can be either: 'approximate'
        or 'exact'. 'approximate' will use LSH Forest while 'exact' will be an
        exact search.

    Attributes
    ----------
    ratio_ : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio corresponds to the number
        of samples in the minority class over the number of samples
        in the majority class.

    rs_ : int or None, optional (default=None)
        Seed for random number generation.

    min_c_ : str or int
        The identifier of the minority class.

    max_c_ : str or int
        The identifier of the majority class.

    stats_c_ : dict of str/int : int
        A dictionary in which the number of occurrences of each class is
        reported.

    Notes
    -----
    See the original papers: [1]_, [2]_, [3]_ for more details.

    It does not support multiple classes automatically, but can be called
    multiple times.

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE:
       synthetic minority over-sampling technique," Journal of artificial
       intelligence research, 321-357, 2002.

    .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
       over-sampling method in imbalanced data sets learning," Advances in
       intelligent computing, 878-887, 2005.

    .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling for
       imbalanced data classification," International Journal of Knowledge
       Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2001.

    """

    def __init__(self, ratio='auto', random_state=None, verbose=True,
                 k=5, m=10, out_step=0.5, kind='regular', nn_method='exact',
                 n_jobs=-1, **kwargs):
        """Initialisation of SMOTE object.

        Parameters
        ----------
        ratio : str or float, optional (default='auto')
            If 'auto', the ratio will be defined automatically to balance
            the dataset. Otherwise, the ratio corresponds to the
            number of samples in the minority class over the number of
            samples in the majority class.

        random_state : int or None, optional (default=None)
            Seed for random number generation.

        verbose : bool, optional (default=True)
            Whether or not to print information about the
            processing.

        k : int, optional (default=5)
            Number of nearest neighbours used to construct synthetic
            samples.

        m : int, optional (default=10)
            Number of nearest neighbours to use to determine if a minority
            sample is in danger.

        out_step : float, optional (default=0.5)
            Step size when extrapolating.

        kind : str, optional (default='regular')
            The type of SMOTE algorithm to use, one of the following
            options: 'regular', 'borderline1', 'borderline2', 'svm'.

        nn_method : str, optional (default='exact')
            The nearest neighbors method to use which can be either:
            'approximate' or 'exact'. 'approximate' will use LSH Forest while
            'exact' will be an exact search.

        n_jobs : int, optional (default=-1)
            Number of threads used to run the algorithm when possible.

        """
        super(SMOTE, self).__init__(ratio=ratio,
                                    random_state=random_state,
                                    verbose=verbose)

        # Check the number of threads to use
        self.n_jobs = n_jobs

        # --- The type of smote
        # This object can perform regular smote over-sampling, borderline 1,
        # borderline 2 and svm smote. Since the algorithms are fairly simple
        # they share most methods.
        possible_kind = ('regular', 'borderline1', 'borderline2', 'svm')
        if kind in possible_kind:
            self.kind = kind
        else:
            raise ValueError('Unknown kind for SMOTE algorithm.')

        # --- Verbose
        # Control whether or not status and progress information should be
        # printed during processing.
        self.verbose = verbose

        # --- Nearest Neighbours for synthetic samples
        # The smote algorithm uses the k-th nearest neighbours of a minority
        # sample to generate new synthetic samples.
        self.k = k

        # --- NN object
        # Import the NN object from scikit-learn library. Since in the smote
        # variations we must first find samples that are in danger, we
        # initialize the NN object differently depending on the method chosen
        if kind == 'regular':
            # Regular smote does not look for samples in danger, instead it
            # creates synthetic samples directly from the k-th nearest
            # neighbours without any filtering
            if nn_method == 'exact':
                self.nearest_neighbour_ = NearestNeighbors(n_neighbors=k + 1,
                                                           n_jobs=self.n_jobs)
            elif nn_method == 'approximate':
                self.nearest_neighbour_ = LSHForest(n_estimators=50,
                                                    n_candidates=500,
                                                    n_neighbors=k+1)
        else:
            # Borderline1, 2 and SVM variations of smote must first look for
            # samples that could be considered noise and samples that live
            # near the boundary between the classes. Therefore, before
            # creating synthetic samples from the k-th nns, it first looks
            # for the m nearest neighbors to decide whether or not a sample is
            # noise or near the boundary.
            if nn_method == 'exact':
                self.nearest_neighbour_ = NearestNeighbors(n_neighbors=m + 1,
                                                           n_jobs=self.n_jobs)
            elif nn_method == 'approximate':
                self.nearest_neighbour_ = LSHForest(n_estimators=50,
                                                    n_candidates=500,
                                                    n_neighbors=m+1)

            # --- Nearest Neighbours for noise and boundary (in danger)
            # Before creating synthetic samples we must first decide if
            # a given entry is noise or in danger. We use m nns in this step
            self.m = m

        # --- SVM smote
        # Unlike the borderline variations, the SVM variation uses the support
        # vectors to decide which samples are in danger (near the boundary).
        # Additionally it also introduces extrapolation for samples that are
        # considered safe (far from boundary) and interpolation for samples
        # in danger (near the boundary). The level of extrapolation is
        # controlled by the out_step.
        if kind == 'svm':
            # Store extrapolation size
            self.out_step = out_step

            # Store SVM object with any parameters
            self.svm_ = SVC(**kwargs)

    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(SMOTE, self).fit(X, y)

        return self

    def _in_danger_noise(self, samples, y, kind='danger'):
        """Estimate if a set of sample are in danger or not.

        Parameters
        ----------
        samples : ndarray, shape (n_samples, n_features)
            The samples to check if either they are in danger or not.

        y : ndarray, shape (n_samples, )
            The true label in order to check the neighbour labels.

        kind : str, optional (default='danger')
            The type of classification to use. Can be either:

            - If 'danger', check if samples are in danger,
            - If 'noise', check if samples are noise.

        Returns
        -------
        output : ndarray, shape (n_samples, )
            A boolean array where True refers to samples in danger or noise.

        """

        # Find the NN for each samples
        # Exclude the sample itself
        x = self.nearest_neighbour_.kneighbors(samples,
                                               return_distance=False)[:, 1:]

        # Count how many NN belong to the minority class
        # Find the class corresponding to the label in x
        nn_label = (y[x] != self.min_c_).astype(int)
        # Compute the number of majority samples in the NN
        n_maj = np.sum(nn_label, axis=1)

        if kind == 'danger':
            # Samples are in danger for m/2 <= m' < m
            return np.bitwise_and(n_maj >= float(self.m) / 2.,
                                  n_maj < self.m)
        elif kind == 'noise':
            # Samples are noise for m = m'
            return n_maj == self.m
        else:
            raise ValueError('Unknown string for parameter kind.')

    def _make_samples(self, X, y_type, nn_data, nn_num, n_samples,
                     step_size=1.):
        """A support function that returns artificial samples constructed along
        the line connecting nearest neighbours.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Points from which the new samples will be created.

        y_type : str or int
            The minority target value, just so the function can return the
            target values for the synthetic variables with correct length in
            a clear format.

        nn_data : ndarray, shape (n_samples_all, n_features)
            Data set carrying all the neighbours to be used

        nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
            The nearest neighbours of each sample in nn_data.

        n_samples : int
            The number of samples to generate.

        step_size : float, optional (default=1.)
            The step size to create samples.

        Returns
        -------
        X_new : ndarray, shape (n_samples_new, n_features)
            Synthetically generated samples.

        y_new : ndarray, shape (n_samples_new, )
            Target values for synthetic samples.

        """

        # Check the consistency of X
        X = check_array(X)

        # A matrix to store the synthetic samples
        X_new = np.zeros((n_samples, X.shape[1]))

        # Set seeds
        np.random.seed(self.rs_)
        seeds = np.random.randint(low=0,
                                  high=100*len(nn_num.flatten()),
                                  size=n_samples)

        # Randomly pick samples to construct neighbours from
        np.random.seed(self.rs_)
        samples = np.random.randint(low=0,
                                    high=len(nn_num.flatten()),
                                    size=n_samples)

        # Loop over the NN matrix and create new samples
        for i, n in enumerate(samples):
            # NN lines relate to original sample, columns to its
            # nearest neighbours
            row, col = divmod(n, nn_num.shape[1])

            # Take a step of random size (0,1) in the direction of the
            # n nearest neighbours
            np.random.seed(seeds[i])
            step = step_size * np.random.uniform()

            # Construct synthetic sample
            X_new[i] = X[row] - step * (X[row] -
                                        nn_data[nn_num[row, col]])

        # The returned target vector is simply a repetition of the
        # minority label
        y_new = np.array([y_type] * len(X_new))

        if self.verbose:
            print("Generated {} new samples ...".format(len(X_new)))

        return X_new, y_new
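
    # Worked micro-example of the interpolation above (illustrative comment
    # only): with X[row] = [2.0, 2.0], a neighbour at [4.0, 0.0] and a random
    # step of 0.25, the synthetic sample is
    #     [2.0, 2.0] - 0.25 * ([2.0, 2.0] - [4.0, 0.0]) = [2.5, 1.5],
    # i.e. a point a quarter of the way along the segment towards the
    # neighbour.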

    def transform(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(SMOTE, self).transform(X, y)

        # Define the number of samples to create.
        # We handle only the two-class problem for the moment.
        if self.ratio_ == 'auto':
            num_samples = (self.stats_c_[self.maj_c_] -
                           self.stats_c_[self.min_c_])
        else:
            num_samples = ((self.ratio_ * self.stats_c_[self.maj_c_]) -
                           self.stats_c_[self.min_c_])

        # Start by separating minority class features and target values.
        X_min = X[y == self.min_c_]

        # If regular SMOTE is to be performed
        if self.kind == 'regular':

            # Print if verbose is true
            if self.verbose:
                print('Finding the {} nearest neighbours...'.format(self.k))

            # Look for k-th nearest neighbours, excluding, of course, the
            # point itself.
            self.nearest_neighbour_.fit(X_min)

            # Matrix with k-th nearest neighbours indexes for each minority
            # element.
            nns = self.nearest_neighbour_.kneighbors(
                X_min,
                return_distance=False)[:, 1:]

            # Print status if verbose is true
            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # --- Generating synthetic samples
            # Use static method make_samples to generate minority samples
            X_new, y_new = self._make_samples(X_min,
                                             self.min_c_,
                                             X_min,
                                             nns,
                                             num_samples,
                                             1.0)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            X_resampled = np.concatenate((X, X_new), axis=0)
            y_resampled = np.concatenate((y, y_new), axis=0)

            return X_resampled, y_resampled

        if self.kind == 'borderline1' or self.kind == 'borderline2':

            if self.verbose:
                print("Finding the {} nearest neighbours...".format(self.m))

            # Find the NNs for all samples in the data set.
            self.nearest_neighbour_.fit(X)

            if self.verbose:
                print("done!")

            # Boolean array with True for minority samples in danger
            danger_index = self._in_danger_noise(X_min, y, kind='danger')

            # If all minority samples are safe, return the original data set.
            if not any(danger_index):
                if self.verbose:
                    print('There are no samples in danger. No borderline '
                          'synthetic samples created.')

                # All are safe, nothing to be done here.
                return X, y

            # If we got here, it is because some samples are in danger; we
            # need to find the NNs among the minority class to create the new
            # synthetic samples.
            #
            # We start by changing the number of NNs to consider from m + 1
            # to k + 1
            self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour_.fit(X_min)

            # k nearest neighbours of each sample in danger, among the minority class
            nns = self.nearest_neighbour_.kneighbors(
                X_min[danger_index],
                return_distance=False)[:, 1:]

            # B1 and B2 types diverge here!!!
            if self.kind == 'borderline1':
                # Create synthetic samples for borderline points.
                X_new, y_new = self._make_samples(X_min[danger_index],
                                                 self.min_c_,
                                                 X_min,
                                                 nns,
                                                 num_samples)

                # Concatenate the newly generated samples to the original
                # dataset
                X_resampled = np.concatenate((X, X_new), axis=0)
                y_resampled = np.concatenate((y, y_new), axis=0)

                # Reset the k-neighbours to m+1 neighbours
                self.nearest_neighbour_.set_params(**{'n_neighbors': self.m+1})

                return X_resampled, y_resampled

            else:
                # Split the number of synthetic samples between only minority
                # (type 1), or minority and majority (with reduced step size)
                # (type 2).
                np.random.seed(self.rs_)

                # The fraction is sampled from a beta distribution centered
                # around 0.5 with variance ~0.01
                fractions = betavariate(alpha=10, beta=10)

                # Only minority
                X_new_1, y_new_1 = self._make_samples(X_min[danger_index],
                                                     self.min_c_,
                                                     X_min,
                                                     nns,
                                                     int(fractions *
                                                         (num_samples + 1)),
                                                     step_size=1.)

                # Only majority with smaller step size
                X_new_2, y_new_2 = self._make_samples(X_min[danger_index],
                                                     self.min_c_,
                                                     X[y != self.min_c_],
                                                     nns,
                                                     int((1 - fractions) *
                                                         num_samples),
                                                     step_size=0.5)

                # Concatenate the newly generated samples to the original
                # data set
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)

                # Reset the k-neighbours to m+1 neighbours
                self.nearest_neighbour_.set_params(**{'n_neighbors': self.m+1})

                return X_resampled, y_resampled

        if self.kind == 'svm':
            # The SVM smote model fits a support vector machine
            # classifier to the data and uses the support vector to
            # provide a notion of boundary. Unlike regular smote, where
            # such notion relies on proportion of nearest neighbours
            # belonging to each class.

            # Fit SVM to the full data
            self.svm_.fit(X, y)

            # Find the support vectors and their corresponding indexes
            support_index = self.svm_.support_[y[self.svm_.support_] ==
                                               self.min_c_]
            support_vector = X[support_index]

            # First, find the nn of all the samples to identify samples
            # in danger and noisy ones
            if self.verbose:
                print("Finding the {} nearest neighbours...".format(self.m))

            # As usual, fit a nearest neighbour model to the data
            self.nearest_neighbour_.fit(X)

            if self.verbose:
                print("done!")

            # Now, get rid of noisy support vectors

            noise_bool = self._in_danger_noise(support_vector, y, kind='noise')

            # Remove noisy support vectors
            support_vector = support_vector[np.logical_not(noise_bool)]
            danger_bool = self._in_danger_noise(support_vector, y,
                                               kind='danger')
            safety_bool = np.logical_not(danger_bool)

            if self.verbose:
                print("Out of {0} support vectors, {1} are noisy, "
                      "{2} are in danger "
                      "and {3} are safe.".format(support_vector.shape[0],
                                                 noise_bool.sum().astype(int),
                                                 danger_bool.sum().astype(int),
                                                 safety_bool.sum().astype(int)
                                                 ))

                # Proceed to find support vectors NNs among the minority class
                print("Finding the {} nearest neighbours...".format(self.k))

            self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour_.fit(X_min)

            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # Split the number of synthetic samples between interpolation and
            # extrapolation

            # The fraction is sampled from a beta distribution with mean
            # 0.5 and variance ~0.01
            np.random.seed(self.rs_)
            fractions = betavariate(alpha=10, beta=10)

            # Interpolate samples in danger
            if np.count_nonzero(danger_bool) > 0:
                nns = self.nearest_neighbour_.kneighbors(
                    support_vector[danger_bool],
                    return_distance=False)[:, 1:]

                X_new_1, y_new_1 = self._make_samples(
                    support_vector[danger_bool],
                    self.min_c_,
                    X_min,
                    nns,
                    int(fractions * (num_samples + 1)),
                    step_size=1.)

            # Extrapolate safe samples
            if np.count_nonzero(safety_bool) > 0:
                nns = self.nearest_neighbour_.kneighbors(
                    support_vector[safety_bool],
                    return_distance=False)[:, 1:]

                X_new_2, y_new_2 = self._make_samples(
                    support_vector[safety_bool],
                    self.min_c_,
                    X_min,
                    nns,
                    int((1 - fractions) * num_samples),
                    step_size=-self.out_step)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            if (np.count_nonzero(danger_bool) > 0 and
                    np.count_nonzero(safety_bool) > 0):
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)
            # No support vectors in danger
            elif np.count_nonzero(danger_bool) == 0:
                X_resampled = np.concatenate((X, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_2), axis=0)
            # All the support vectors are in danger
            elif np.count_nonzero(safety_bool) == 0:
                X_resampled = np.concatenate((X, X_new_1), axis=0)
                y_resampled = np.concatenate((y, y_new_1), axis=0)

            # Reset the k-neighbours to m+1 neighbours
            self.nearest_neighbour_.set_params(**{'n_neighbors': self.m+1})

            return X_resampled, y_resampled
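
A hedged usage sketch for the SMOTE class above. The toy data and class counts are illustrative only, and it assumes the OverSampler base class (not shown here) wires `random_state` into the `rs_` attribute used during sampling.

import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0, 1, (100, 2)), rng.normal(3, 1, (10, 2))])
y = np.array([0] * 100 + [1] * 10)

smote = SMOTE(ratio='auto', kind='regular', nn_method='approximate',
              random_state=0, verbose=False)
X_res, y_res = smote.fit(X, y).transform(X, y)
print(np.bincount(y_res))   # the two classes should now be (roughly) balanced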