Example No. 1
def search_neighbors(request):
	designs = Design.objects.all()

	image_list = []
	for design in designs:
		image_list.append(str(design.uid) + ".png")

	d_geometry = settings.D_GEOMETRY
	designed_images = np.empty((len(image_list), d_geometry[0]*d_geometry[1]*3), dtype="float32")
	for i in range(len(image_list)):
		designed_images[i] = img2numpy_arr(settings.DESIGN_PATH + image_list[i]).reshape(d_geometry[0]*d_geometry[1]*3)
	designed_images /= 255
	
	lshf = LSHForest(random_state=42)
	lshf.fit(designed_images) 

	num = int(request.GET['num'])
	input_fname = str(request.GET['input'])
	input_image = img2numpy_arr(settings.DESIGN_PATH + input_fname)
	input_image = input_image.reshape(1, -1)/255
	_, indices = lshf.kneighbors(input_image, n_neighbors=num)

	similar_images = []
	for i in list(indices.reshape(-1)):
		similar_images.append({ 
			"image": str(designs[i].uid) + ".png", 
			"text": str(designs[i].history_text), 
			"like": int(designs[i].like),
			"filtered": str(designs[i].filtered)
		})

	return JsonResponse({
		"results": similar_images
	})
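The core indexing pattern used by the view above, stripped of the Django and image-loading details, is sketched below; this is illustrative only, assuming the deprecated sklearn.neighbors.LSHForest (available up to scikit-learn 0.20) and random data in place of the design images.

import numpy as np
from sklearn.neighbors import LSHForest  # deprecated, removed in scikit-learn 0.21

# Stand-in for the flattened, normalised design images (n_images x height*width*3).
designed_images = np.random.rand(100, 32 * 32 * 3).astype("float32")

lshf = LSHForest(random_state=42)
lshf.fit(designed_images)

# Query with a single flattened image; kneighbors expects a 2-D array.
query = designed_images[0].reshape(1, -1)
_, indices = lshf.kneighbors(query, n_neighbors=5)
print(indices)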
Example No. 2
    def get_nearest_neighbor_iterable(self, graphlist, start_graphs, start_is_subset=True):

        # vectorize all
        graphlist= list(graphlist)
        graphlist_ = copy.deepcopy(graphlist)
        X = self.vectorizer.transform_single(graphlist_)


        start_graphs= list(start_graphs)
        graphlist_= copy.deepcopy(start_graphs)
        Y = self.vectorizer.transform_single(graphlist_)
        
        
        forest = LSHForest()
        forest.fit(X)
        #http://scikit-learn.org/stable/modules/neighbors.html
        distances, indices = forest.kneighbors(Y, n_neighbors=2)

        # we just assume that this is short...
        index = 0
        if start_is_subset:
            index += 1
        
        #matches= ( X_index ,Y_index, distance  )
        matches = [(indices[i, index], i, distances[i, index]) for i in range(len(indices))]
        matches.sort()

        # this looks super confusing....
        # for index, graph in enumerate(selection_iterator(graphlist, [a[0] for a in matches])):
        #     yield ((graph, start_graphs[matches[index][1]], X[matches[index][0]]))
        # so I wrote this instead; you could even drop the matches variable and use indices directly
        for Xi, Yi, dist in matches:
            yield (start_graphs[Yi], graphlist[Xi], X[Xi])
Example No. 3
def test_neighbors_accuracy_with_n_estimators():
    # Checks whether accuracy increases as `n_estimators` increases.
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = LSHForest(n_candidates=500, n_estimators=t)
        ignore_warnings(lshf.fit)(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            neighbors = lshf.kneighbors(query, n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Sorted accuracies should be equal to original accuracies
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
Example No. 4
def test_distances():
    """Checks whether returned neighbors are from closest to farthest."""
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)
        # Returned neighbors should be from closest to farthest.
        assert_true(np.all(np.diff(distances[0]) >= 0))

        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_true(np.all(np.diff(distances[0]) >= 0))
Example No. 5
    def get_heap_and_forest(self, griter, k):
        '''
        so we create the heap and the forest...
        heap is (dist to hyperplane, count, graph)
        and the forest is just a nearest neighbor index from sklearn
        '''

        graphs = list(griter)
        graphs2 = copy.deepcopy(graphs)
        # transform does mess up the graph objects
        X = self.vectorizer.transform(graphs)

        forest = LSHForest()
        forest.fit(X)
        print 'got forest'

        heap = []
        for vector, graph in zip(X, graphs2):
            graph2 = nx.Graph(graph)
            heapq.heappush(heap, (
                self.sampler.estimator.predict_proba(self.sampler.vectorizer.transform_single(graph2))[0][1],
                # score ~ dist from hyperplane
                k + 1,  # making sure that the counter is high so we don't output the start graphs at the end
                graph))  # at last the actual graph

        print 'got heap'
        distances, unused = forest.kneighbors(X, n_neighbors=2)
        distances = [a[1] for a in distances]  # the second element should be the dist we want
        avg_dist = distances[len(distances) / 2]  # sum(distances)/len(distances)
        print 'got dist'

        return heap, forest, avg_dist
Example No. 6
class LHSForestEngine:

    def __init__(self):
        self.engine = LSHForest(random_state=42)
        self.name = "LHS"

    def fit(self, data):
        self.engine.fit(data)

    def dist(self, data):
        distances, indices = self.engine.kneighbors(data, n_neighbors=1)
        return distances.ravel()
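A brief usage sketch for the engine above (illustrative only: random vectors stand in for real data, and the deprecated sklearn.neighbors.LSHForest import is assumed to be in scope):

import numpy as np

# Hypothetical usage of LHSForestEngine defined above.
engine = LHSForestEngine()
engine.fit(np.random.rand(200, 16))

# Distance from each query row to its single nearest indexed neighbor.
queries = np.random.rand(5, 16)
print(engine.dist(queries))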
Example No. 7
def test_sparse_input():
    # note: Fixed random state in sp.rand is not supported in older scipy.
    #       The test should succeed regardless.
    X1 = sp.rand(50, 100)
    X2 = sp.rand(10, 100)
    forest_sparse = LSHForest(radius=1, random_state=0).fit(X1)
    forest_dense = LSHForest(radius=1, random_state=0).fit(X1.A)

    d_sparse, i_sparse = forest_sparse.kneighbors(X2, return_distance=True)
    d_dense, i_dense = forest_dense.kneighbors(X2.A, return_distance=True)
    assert_array_equal(d_sparse, d_dense)
    assert_array_equal(i_sparse, i_dense)

    d_sparse, i_sparse = forest_sparse.radius_neighbors(X2,
                                                        return_distance=True)
    d_dense, i_dense = forest_dense.radius_neighbors(X2.A,
                                                     return_distance=True)
    assert_equal(d_sparse.shape, d_dense.shape)
    for a, b in zip(d_sparse, d_dense):
        assert_array_equal(a, b)
    for a, b in zip(i_sparse, i_dense):
        assert_array_equal(a, b)
Example No. 8
    def calculate_duplication_number(self, text_list):
        print "length is ", len(text_list)
        tf_vectorizer = CountVectorizer(stop_words=None, analyzer='word', ngram_range=(5, 5))
        # print text_list
        tf = tf_vectorizer.fit_transform(text_list)
        # print tf_vectorizer.get_feature_names()
        print tf[0]
        # print tf[123]
        lshf = LSHForest()
        # print tf
        lshf.fit(tf)
        distance, index = lshf.kneighbors(tf, n_neighbors=1)
        print distance, index
Example No. 9
def startQuery():
    while True:

      try:
          ipt = raw_input('Directory of query:')
      except ImportError:
          print 'invalid type'
      else:
          query = ipt
      if query == 'exit()':
          break

      print 'loading query...'
      try:
          token = get_tokens_by_dir(query)
      except IOError:
          print 'invalid file name'
      else:
##########################################query preprocessing
           print 'query pre-processing...'
           stopped_tokens = [i for i in token if not i in en_stop]
           p_stemmer = PorterStemmer()
           stemed_tokens = []
           for i in stopped_tokens:
               try:
                   temp_token = str(p_stemmer.stem(i))
                   stemed_tokens.append(temp_token)
               except IndexError:
                   pass
           tokens = [stemed_tokens]
######################################################################################
           dictionary_new = corpora.Dictionary(tokens)
           corpus_new = [dictionary_new.doc2bow(text) for text in tokens]
           QUERY_TOPIC = np.zeros([1,num_topic]) ## topic vector for query

           new_topics = LDA[corpus_new]


           for i in new_topics[0]:
               print(i)
               QUERY_TOPIC[0,i[0]] = i[1] ##assign new topics to query doc-topic matrix

           print 'fetching results for you...'
           lshf = LSHForest(random_state=42)
           lshf.fit(DOC_TOPICS) ##fit the local sensitive hash forest with training data POINT_SET
           dist,indices=lshf.kneighbors(QUERY_TOPIC,n_neighbors=20)
           print indices
Example No. 10
def test_candidates():
    """Checks whether candidates are sufficient.

    This should handle the cases when number of candidates is 0.
    User should be warned when number of candidates is less than
    requested number of neighbors.
    """
    X_train = np.array([[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1],
                        [6, 10, 2]], dtype=np.float32)
    X_test = np.array([7, 10, 3], dtype=np.float32)

    # For zero candidates
    lshf = LSHForest(min_hash_match=32)
    lshf.fit(X_train)

    message = ("Number of candidates is not sufficient to retrieve"
               " %i neighbors with"
               " min_hash_match = %i. Candidates are filled up"
               " uniformly from unselected"
               " indices." % (3, 32))
    assert_warns_message(UserWarning, message, lshf.kneighbors,
                         X_test, n_neighbors=3)
    distances, neighbors = lshf.kneighbors(X_test, n_neighbors=3)
    assert_equal(distances.shape[1], 3)

    # For candidates less than n_neighbors
    lshf = LSHForest(min_hash_match=31)
    lshf.fit(X_train)

    message = ("Number of candidates is not sufficient to retrieve"
               " %i neighbors with"
               " min_hash_match = %i. Candidates are filled up"
               " uniformly from unselected"
               " indices." % (5, 31))
    assert_warns_message(UserWarning, message, lshf.kneighbors,
                         X_test, n_neighbors=5)
    distances, neighbors = lshf.kneighbors(X_test, n_neighbors=5)
    assert_equal(distances.shape[1], 5)
Example No. 11
def cal_acc(pack_file, stat_file, feature_dim):
    f = open(stat_file, 'w')
    f.write('train_pic_num'+'\t'+'person_name'+'\t'+'acc'+'\n')
    pic_num = range(1, max_person_num)
    for num in pic_num:
        all_train_data, all_train_label, all_valid_data, all_valid_label = split_train_valid(pack_file, train_pic_num=num, feature_dim=feature_dim)
        lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5)

        for index in range(len(all_train_data)):
            try:
                if all_train_data[index] == None:
                    continue
                lshf.partial_fit(all_train_data[index], all_train_label[index])
            except:
                traceback.print_exc()
                continue
        # compute the accuracy for each person separately
        person_acc_dic = {}     # number of correct matches
        person_all_dic = {}     # total count
        filter_num = 0
        all_num = 0
        for index in range(len(all_valid_data)):
            try:
                if all_valid_data[index] == None:
                    continue
                all_find_distance, all_find_index = lshf.kneighbors(all_valid_data[index], n_neighbors=5, return_distance=True)
                cos_sim = cosine_similarity(all_valid_data[index], all_train_data[all_find_index[0, 0]])
                label = all_train_label[all_find_index[0, 0]]
                # if cos_sim > sim_threshold:
                if True:
                    if label == all_valid_label[index]:
                        person_acc_dic[label] = person_acc_dic.get(label, 0) + 1
                        person_all_dic[label] = person_all_dic.get(label, 0) + 1
                    else:
                        person_all_dic[label] = person_all_dic.get(label, 0) + 1
                else:
                    filter_num += 1
                all_num += 1
            except:
                print all_valid_label[index]
                continue
        print 'train_num :', num, 'filter_rate: ', (filter_num * 1.0 / all_num)
        for person in person_all_dic:
            all_num = person_all_dic[person]
            right_num = person_acc_dic.get(person, 0)
            f.write('\t'.join(map(str, [num, person, (right_num * 1.0 /  all_num)]))+'\n')
Example No. 12
def lshf_scikit(data, n_neighbors=4,
               n_estimators=10,
               min_hash_match=4,
               n_candidates=10,
               random_state=None):
   n_neighbors += 1

   # initialize nearest neighbor model
   nbrs = LSHForest(n_neighbors=n_neighbors,
                    n_estimators=n_estimators,
                    min_hash_match=min_hash_match,
                    n_candidates=n_candidates,
                    random_state=random_state)

   # fit nearest neighbor model to the data
   nbrs.fit(data)

   # return the distances and indices
   return nbrs.kneighbors(data)
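An illustrative call to the helper above on random data (a sketch; the internal n_neighbors += 1 exists because each indexed point is returned as its own nearest neighbor):

import numpy as np

# Sketch: index 100 random 8-dimensional points and fetch 4 neighbors per point
# (plus the point itself, hence the extra neighbor requested internally).
data = np.random.rand(100, 8)
distances, indices = lshf_scikit(data, n_neighbors=4)
print(distances.shape)  # (100, 5)
print(indices.shape)    # (100, 5)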
Example No. 13
def cal_recall(pack_file, stat_file, feature_dim):
    # f_model = open('verf.txt', 'w')
    f = open(stat_file, 'w')
    f.write('train_pic_num'+'\t'+'person_name'+'\t'+'recall'+'\n')
    pic_num = range(1, max_person_num)
    for num in pic_num:
        all_train_data, all_train_label, all_valid_data, all_valid_label = split_train_valid(pack_file, train_pic_num=num, feature_dim=feature_dim)
        lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5)
        for index in range(len(all_train_data)):
            try:
                if all_train_data[index] == None:
                    continue
                lshf.partial_fit(all_train_data[index], all_train_label[index])
            except:
                continue
        # compute the accuracy for each person separately
        person_find_dic = {}     # number of correct matches
        person_all_dic = {}     # total count
        for index in range(len(all_valid_data)):
            try:
                if all_valid_data[index] == None:
                    continue
                all_find_distance, all_find_index = lshf.kneighbors(all_valid_data[index], n_neighbors=5, return_distance=True)
                cos_sim = cosine_similarity(all_valid_data[index], all_train_data[all_find_index[0, 0]])
                label = all_train_label[all_find_index[0, 0]]
                real_label = all_valid_label[index]
                # if cos_sim > sim_threshold:
                if True:
                    if label == real_label:
                        # f_model.write('0'+'\t'+str(cos_sim)+'\n')
                        person_find_dic[real_label] = person_find_dic.get(real_label, 0) + 1
                        person_all_dic[real_label] = person_all_dic.get(real_label, 0) + 1
                    else:
                        # f_model.write('1' + '\t' + str(cos_sim) + '\n')
                        person_all_dic[real_label] = person_all_dic.get(real_label, 0) + 1
            except:
                print all_valid_label[index]
                continue
        print 'train_num :', num
        for person in person_all_dic:
            all_num = person_all_dic[person]
            right_num = person_find_dic.get(person, 0)
            f.write('\t'.join(map(str, [num, person, (right_num * 1.0 /  all_num)]))+'\n')
Example No. 14
def test_kneighbors():
    """Checks whether desired number of neighbors are returned.

    It is guaranteed to return the requested number of neighbors
    if `min_hash_match` is set to 0. Returned distances should be
    in ascending order.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest(min_hash_match=0)
    # Test unfitted estimator
    assert_raises(ValueError, lshf.kneighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)]
        neighbors = lshf.kneighbors(query, n_neighbors=n_neighbors,
                                    return_distance=False)
        # Desired number of neighbors should be returned.
        assert_equal(neighbors.shape[1], n_neighbors)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.kneighbors(queries,
                                           n_neighbors=1,
                                           return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # Test only neighbors
    neighbors = lshf.kneighbors(queries, n_neighbors=1,
                                return_distance=False)
    assert_equal(neighbors.shape[0], n_queries)
    # Test random point(not in the data set)
    query = rng.randn(n_features)
    lshf.kneighbors(query, n_neighbors=1,
                    return_distance=False)
    # Test n_neighbors at initialization
    neighbors = lshf.kneighbors(query, return_distance=False)
    assert_equal(neighbors.shape[1], 5)
    # Test `neighbors` has an integer dtype
    assert_true(neighbors.dtype.kind == 'i',
                msg="neighbors are not in integer dtype.")
Example No. 15
def test_distances():
    # Checks whether returned neighbors are from closest to farthest.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)].reshape(1, -1)
        distances, neighbors = lshf.kneighbors(query,
                                               n_neighbors=n_neighbors,
                                               return_distance=True)

        # Returned neighbors should be from closest to farthest, that is
        # increasing distance values.
        assert_true(np.all(np.diff(distances[0]) >= 0))
Example No. 16
class LSHForestSearch:
    def __init__(self, docs):
        self.lshf = LSHForest(n_estimators=1, n_candidates=1,
                     n_neighbors=1)
        self.dv = DictVectorizer()
        dicts = []
        for d in docs:
            dicts.append(dict([(w, 1) for w in d]))
        self.dv.fit(dicts)
        features = self.dv.transform(dicts)
        # floats are faster
        # features = csr_matrix(features, dtype=int)
        self.lshf.fit(features)
        
    def search(self, docs):
        dicts = []
        for d in docs:
            dicts.append(dict([(w, 1) for w in d]))
        features = self.dv.transform(dicts)
        # floats are faster
        # features = csr_matrix(features, dtype=int)
        return self.lshf.kneighbors(features, return_distance=False)    
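A small usage sketch for the class above; the documents are plain token lists, and the toy corpus is purely illustrative:

# Hypothetical usage of LSHForestSearch defined above.
corpus = [["red", "apple", "pie"],
          ["green", "apple", "tart"],
          ["blue", "cheese", "plate"]]
index = LSHForestSearch(corpus)

# For each query document, returns the index of its nearest indexed document.
print(index.search([["apple", "pie"]]))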
Example No. 17
def filter_data(folder, same_dist_threshold):
    lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=10)
    data, pic_list = load_unknown_data(folder)
    for index in range(len(pic_list)):
        pic_list[index] = os.path.join(folder, pic_list[index])
    new_data = np.transpose(data, (0,3,1,2))
    # print 'load deepid model'
    # model, get_Conv_FeatureMap = load_deepid_model(deepid_model_file, deepid_weight_file)
    start = time()
    data_feature = get_Conv_FeatureMap([new_data,0])[0]
    # print 'data_feature.shape :', data_feature.shape
    # print 'pic_list :', pic_list

    lshf.fit(data_feature, pic_list)
    need_remove_list = set()
    no_same_feature_list = []
    for index_i in range(len(data_feature)):
        if len(no_same_feature_list) == 0:
            no_same_feature_list.append(data_feature[index_i:index_i+1])
        else:
            tmp = lshf.kneighbors(data_feature[index_i:index_i+1], n_neighbors=10, return_distance=True)
            tmp = zip(tmp[0][0], tmp[1][0])
            for index_j in range(len(tmp)):
                if tmp[index_j][1] == index_i:
                    continue
                if tmp[index_j][0] < same_dist_threshold:
                    need_remove_list.add(pic_list[index_i])
    for path in need_remove_list:
        try:
            os.remove(path)
        except:
            print 'error path :', path
            continue

    end = time()
    print 'filter time :', (end - start)
    return len(no_same_feature_list)
Example No. 18
def score(factors):
    verifyCount = 50
    neighbors = 5
    estimators = 10
    candidates = 10
    X, y = Sets.trainingSet
    test_set, databases = Sets.testSet
    X = Common.FactorizeVectors(X, factors)
    test_set = Common.FactorizeVectors(test_set, factors)
    vectorAndDatabaseList = zip(test_set, databases)
    best_neighbor, best_candidates, best_estimator, best_predictions = 0, 0, 0, 0
    correctionAverage = 0
    for i in range(verifyCount):
        best_predictions = 0
        clf = LSHForest(n_neighbors=5, n_estimators = 10, n_candidates = 10)
        clf.fit(X)

        correct = 0
        total = 0

        for vectorAndDb in vectorAndDatabaseList:
            total += 1
            actual = vectorAndDb[1]
            #predicted = clf.predict(vectorAndDb[0])[0]
            distances, indices = clf.kneighbors(vectorAndDb[0], n_neighbors=neighbors)
            predicted = GetPrediction(y, distances[0], indices[0])
            if (actual == predicted):
                correct += 1
            #print('Actual: ' + actual +', predicted: ' + predicted)

        if (correct > best_predictions):
            best_predictions = correct
            best_neighbor, best_candidates, best_estimator = neighbors, candidates, estimators
        correctionAverage += best_predictions
    correctionAverage = float(correctionAverage)/verifyCount
    return correctionAverage, best_neighbor, best_candidates, best_estimator
Example No. 19
stds_accuracies = np.zeros((len(n_estimators_for_candidate_value),
                            n_candidates_values.shape[0]), dtype=float)
accuracies_c = np.zeros((len(n_estimators_for_candidate_value),
                         n_candidates_values.shape[0]), dtype=float)

# LSH Forest is a stochastic index: perform several iteration to estimate
# expected accuracy and standard deviation displayed as error bars in
# the plots
for j, value in enumerate(n_estimators_for_candidate_value):
    for i, n_candidates in enumerate(n_candidates_values):
        accuracy_c = []
        for seed in range(n_iter):
            lshf = LSHForest(n_estimators=value,
                             n_candidates=n_candidates, n_neighbors=1,
                             random_state=seed)
            # Build the LSH Forest index
            lshf.fit(X_index)
            # Get neighbors
            neighbors_approx = lshf.kneighbors(X_query,
                                               return_distance=False)
            accuracy_c.append(np.sum(np.equal(neighbors_approx,
                                              neighbors_exact)) /
                              n_queries)

        stds_accuracies[j, i] = np.std(accuracy_c)
        accuracies_c[j, i] = np.mean(accuracy_c)

# Set `n_estimators` values
n_estimators_values = [1, 5, 10, 20, 30, 40, 50]
accuracies_trees = np.zeros(len(n_estimators_values), dtype=float)

# Calculate average accuracy for each value of `n_estimators`
for i, n_estimators in enumerate(n_estimators_values):
    lshf = LSHForest(n_estimators=n_estimators, n_neighbors=1)
    # Build the LSH Forest index
Example No. 20
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine',
                            n_neighbors=10).fit(X)
    time_approx = []
    time_exact = []
    accuracy = []

    for i in range(n_iter):
        # pick one query at random to study query time variability in LSHForest
        query = queries[rng.randint(0, n_queries)]

        t0 = time.time()
        exact_neighbors = nbrs.kneighbors(query, return_distance=False)
        time_exact.append(time.time() - t0)

        t0 = time.time()
        approx_neighbors = lshf.kneighbors(query, return_distance=False)
        time_approx.append(time.time() - t0)

        accuracy.append(np.in1d(approx_neighbors, exact_neighbors).mean())

    average_time_exact = np.mean(time_exact)
    average_time_approx = np.mean(time_approx)
    speedup = np.array(time_exact) / np.array(time_approx)
    average_speedup = np.mean(speedup)
    mean_accuracy = np.mean(accuracy)
    std_accuracy = np.std(accuracy)
    print("Index size: %d, exact: %0.3fs, LSHF: %0.3fs, speedup: %0.1f, "
          "accuracy: %0.2f +/-%0.2f" %
          (n_samples, average_time_exact, average_time_approx, average_speedup,
           mean_accuracy, std_accuracy))
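The benchmark fragment above refers to names (X, queries, lshf, rng, n_iter, n_queries, n_samples) whose setup was not captured in the snippet; below is a minimal sketch of what that setup might look like, with purely illustrative sizes:

import numpy as np
from sklearn.neighbors import LSHForest  # deprecated, removed in scikit-learn 0.21

# Assumed setup for the timing loop above (illustrative values only).
rng = np.random.RandomState(42)
n_samples, n_features, n_queries, n_iter = 10000, 100, 100, 30
X = rng.rand(n_samples, n_features)
queries = rng.rand(n_queries, n_features)

# Approximate index whose query time is compared against the brute-force baseline.
lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=10).fit(X)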
Example No. 21
class Search():
    def __init__(self, model_type, n_estimators=20, n_candidates=200, n_neighbors=10):
        self.lshf = LSHForest(n_estimators=n_estimators, n_candidates=n_candidates, n_neighbors=n_neighbors)

        if model_type == 'rgb_small':
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.small.rgb.deepid.model'
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.small.rgb.deepid.weight'
            self.part_func = None
            self.pic_shape = (50, 50, 3)
            self.feature_dim = 1024
        elif model_type == 'rgb_big':
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.big.rgb.deepid.weight'
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.big.rgb.deepid.model'
            self.part_func = None
            self.pic_shape = (128, 128, 3)
        elif model_type == 'rgb_small_right':
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.small.rgb.right_eye.deepid.weight'
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.small.rgb.right_eye.deepid.model'
            self.part_func = get_right_eye
            self.pic_shape = (50, 50, 3)
        elif model_type == 'rgb_small_left':
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.small.rgb.left_eye.deepid.weight'
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.small.rgb.left_eye.deepid.model'
            self.part_func = get_left_eye
            self.pic_shape = (50, 50, 3)
        elif model_type == 'rgb_small_nose':
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.all.rgb.nose.deepid.weight'
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.all.rgb.nose.deepid.model'
            self.part_func = get_nose
            self.pic_shape = (50, 50, 3)
        elif model_type == 'new_shape':
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.new_shape.rgb.deepid.model'
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.new_shape.rgb.deepid.weight'
            self.pic_shape = (156, 124, 3)
            self.feature_dim = 256
            self.part_func = None
        self.model, self.get_Conv_FeatureMap = load_deepid_model(self.deepid_model_file, self.deepid_weight_file)
        self.all_label = None
        self.all_feature_data = None


    def extract_pic_feature(self, pic_data, batch_size=128, feature_dim=1024):
        pic_feature = np.zeros(shape=(pic_data.shape[0], feature_dim))
        batch_num = pic_data.shape[0] / batch_size
        for index in range(batch_num):
            # pic_feature[index*batch_size:(index+1)*batch_size, :] = \
            #     self.get_Conv_FeatureMap([pic_data[index*batch_size:(index+1)*batch_size], 0])[0]
            pic_feature[index*batch_size:(index+1)*batch_size, :] = \
                self.get_Conv_FeatureMap([np.transpose(pic_data[index*batch_size:(index+1)*batch_size], (0, 3, 1, 2)), 0])[0]

        if batch_num*batch_size < pic_data.shape[0]:
            # pic_feature[batch_num*batch_size:, :] = \
            #     self.get_Conv_FeatureMap([pic_data[batch_num*batch_size:], 0])[0]
            pic_feature[batch_num*batch_size:, :] = \
                self.get_Conv_FeatureMap([np.transpose(pic_data[batch_num*batch_size:], (0, 3, 1, 2)), 0])[0]
        return pic_feature


    def train_all_data(self, vgg_folder, person_num=100, batch_person_num=20, pic_num=10):
        # add the first pic_num pictures per person to the LSH Forest; the remaining pictures are used to measure accuracy
        for index in range(0+train_person_start_index, person_num+train_person_start_index, batch_person_num):
            if index == 0+train_person_start_index:
                pic_data, all_label = load_batch_train_data(vgg_folder, shape=self.pic_shape, start_person_index=index,
                                 pic_num=pic_num, batch_num=batch_person_num, is_train=True, part_func=self.part_func)
                all_data_feature = self.extract_pic_feature(pic_data, feature_dim=self.feature_dim)
                self.lshf.fit(all_data_feature, all_label)

            else:
                pic_data, this_label = load_batch_train_data(vgg_folder, start_person_index=index, pic_num=pic_num,
                                shape=self.pic_shape, batch_num=batch_person_num,is_train=True, part_func=self.part_func)
                all_label = np.row_stack((np.reshape(all_label, (all_label.shape[0], 1)),
                                          np.reshape(this_label, (this_label.shape[0],1))))
                pic_data_feature = self.extract_pic_feature(pic_data, feature_dim=self.feature_dim)
                all_data_feature = np.row_stack((pic_data_feature, all_data_feature))
                self.lshf.partial_fit(pic_data_feature, this_label)
        self.all_label = all_label
        self.all_feature_data = all_data_feature
        logging.info(' '.join(map(str, ['self.all_label.shape :', self.all_label.shape])))


    def partical_fit(self, pic_data, this_label):
        '''
            Incremental training; the batch is small, so it is fitted directly.
        :param data:
        :param label:
        :return:
        '''
        pic_data_feature = self.extract_pic_feature(pic_data, feature_dim=self.feature_dim)
        self.lshf.partial_fit(pic_data_feature, this_label)
        self.all_label = np.row_stack((np.reshape(self.all_label, (self.all_label.shape[0], 1)),
                                          np.reshape(this_label, (this_label.shape[0],1))))


    def find_k_neighbors(self, pic_data):
        pic_data_feature = self.extract_pic_feature(pic_data, feature_dim=self.feature_dim)
        distances, indices = self.lshf.kneighbors(pic_data_feature, n_neighbors=1)
        predict_label = self.all_label[indices][:, 0, 0]
        return predict_label


    def valid_model(self, vgg_folder, person_num=100, batch_person_num=20, pic_num=10, topK_acc=1):
        # add the first 50 pictures to the LSH Forest; the last 50 are used to measure accuracy
        right_num = 0
        wrong_num = 0
        clf = cPickle.load(open(clf_model_file, 'rb'))

        for index in range(0+train_person_start_index, person_num+train_person_start_index, batch_person_num):
            pic_data, all_label = load_batch_train_data(vgg_folder, start_person_index=index, pic_num=pic_num,
                            shape=self.pic_shape,batch_num=batch_person_num, is_train=False, part_func=self.part_func)

            pic_data_feature = self.extract_pic_feature(pic_data, feature_dim=self.feature_dim)
            distances, indices = self.lshf.kneighbors(pic_data_feature, n_neighbors=10)
            train_data = self.all_feature_data[indices]
            predict_label = self.all_label[indices][:, 0, 0]
            for label_index in range(len(predict_label)):
                this_predict_data = np.abs(train_data[0] - pic_data_feature[0])
                this_result = clf.predict_proba(this_predict_data)
                print this_result
                # pdb.set_trace()
                if all_label[label_index] in self.all_label[indices][:, :, 0][label_index][:topK_acc]:
                    right_num += 1
                else:
                    wrong_num += 1
        acc = right_num * 1.0 / (right_num + wrong_num)
        logging.info(' '.join(map(str, ['model_type :', model_type, 'person_num :', person_num, 'pic_num :', pic_num, 'acc :', acc])))
Example No. 22
class Deduper_NN(object):
    '''
    DESIGN of this class
    I need to re-evaluate whether or not I want the state of the model/vector space
    being saved. In the event that I don't, I should just kill the self.model = model.fit() stuff
    and pass parameters from one function to another.

    *Figure out a rigorous statistical way to measure the quality of the NN tree.
    Would it be whatever's skewed to the left?... how can you guarantee that it's clustered well?

    methods
    ------
    train
        - model type
    predict
    preprocess
        - various stuff
    '''

    metrics = [
        'cosine', 
        'euclidean',
        'dice', 
        'jaccard', 
        'braycurtis',
        'canberra', 
    ]
    
    vector_space = None

    def read_in_the_file(self, file_name):
        
        #read in subject file
        with open(file_name) as f:
             self.orig_file = [line.strip() for line in f]
    
    def build_vectorizer(self, corpus, model_type='bag of words', ngrams=1, tokenizer='char'):
        '''
        *add word2vec
        '''
        
        #think of params
        params = {
            'analyzer': tokenizer,
            'ngram_range' : (1, ngrams)
        }
    
        if model_type == 'bag of words':
            vectorizer = CountVectorizer(**params)
        elif model_type ==  'tfidf':
            vectorizer = TfidfVectorizer(**params)
        
        self.vector_space = vectorizer.fit_transform(corpus) 
        self.vectorizer = vectorizer 
    
    def find_all_duplicates(self):
        
        #find all duplicates
        all_dups_dict = {idx : self.predict(line) for idx, line in enumerate(self.orig_file)}
        return all_dups_dict
    
    def fit_model(self, model_type='brute', params=None):
        '''
        fits the model, operating under the assumption that the vector space has already been built
        '''

        if model_type == 'brute':
            self.model = NearestNeighbors(algorithm='brute', **params)
        elif model_type == 'lsh':
            self.model = LSHForest( **params)
        # elif model_type == 'annoy':
        #     self.model = Annoy(**params)

        self.model.fit(self.vector_space)
        print self.model        

    def predict(self, new_data_pt, radius_threshold=.25):
        '''
        not sure how to find the optimal threshold here
        '''
        #careful to note that it takes a single string and converts to a list object of strings
        pt = self.vectorizer.transform([new_data_pt])
        
        #how to find optimal radius?
        distance_from_origin, indices = self.model.radius_neighbors(pt, radius=radius_threshold)
        
        #unpacking
        distance_from_origin = distance_from_origin[0]
        indices = indices[0]

        grabbing_the_lines_from_file = [self.orig_file[index] for index in indices]

        return grabbing_the_lines_from_file
    
    def grid_search(self):
        '''
        I: target string
        O: prints all combinations of comparisons
        
        * this goes in the master deduper class
        '''
        
        #preprocessing variables
            #spaces or no spaces
            #combinations there of.
        
        vector_space_params = {
            #fit the vector-space
            #char-grams, words
                #unigrams, bigrams, tri-grams

            #or some combination thereof; to do this we need to output and concat
            
            'model_type' : ['bag of words', 'tfidf'], #add lsi and word2vec
            'ngrams' : [1,2,3,4],
            'tokenizer' : ['char', 'word'],
        }
        
        
        
        #model selection
        model_params = {
            #add annoy later
            #build out a wrapper for the class to make it more like scikit

            #add lsh later
            #need to build a separate parameters dict for it.

        
            'model_type' : [ 'brute']
            #fill the rest in later
        }
        
         
        #distances
        metrics = [
            # work for sparse input
            'cosine', 
            'euclidean',
            'l1',
            'l2',
            'manhattan',

            # do not work for sparse input
            # 'dice', 
            # 'jaccard', 
            # 'braycurtis',
            # 'canberra', 
            # 'mahalanobis', # this is supposed to be the shit for outlier detection
        ]
        
        
        all_params = {
            'preprocessing': None,
            'vector_space': vector_space_params,
            'nn_algo': model_params,
        }
        
        

        for nn_algo in all_params['nn_algo']['model_type']:
            for vector_space_model in all_params['vector_space']['model_type']:
                for gram in  all_params['vector_space']['ngrams']:
                    for type_of_tokenizer in  all_params['vector_space']['tokenizer']:
                        for dist_metric in metrics:
                            
                            nn_model_params = {
                                # 'model_type' : nn_algo,
                                'metric' : dist_metric,
                            }

                            vectorizer_params = {
                                'model_type' : vector_space_model,
                                'tokenizer' : type_of_tokenizer,
                                'ngrams' : gram
                            }

                            self.build_vectorizer(self.orig_file, **vectorizer_params)
                            self.fit_model(nn_algo, nn_model_params)
                            hist_arr = self.make_hist()
                            print_prof_data()
                            clear_prof_data()
                            self.plot_histogram(hist_arr)

                            
                
        
        #how do you gauge the quality of matches?
        
        pass
    
    #since this isn't a nn search model it belongs in the biggest deduper
    def brute_force_deduper(self, list_of_strings, comparison_algo, threshold=None):
        '''
        I: self explanatory
        O: dictionary {string: sorted list of matches}
        '''
        big_bag = {}
        #to deep copy or not to deep copy

        for index, s1 in enumerate(list_of_strings):
            small_bag = get_all_comparisons(list_of_strings[index:], comparison_algo)
            big_bag[s1] = sorted(small_bag, key=lambda x: x[0], reverse=True)

        return big_bag
    
    @profile
    def make_hist(self):
        '''
        these queries take a while

        *add timer bit
        '''
        import sys

        #use a numpy array since the size is already pre-defined
        hist_bag = []
        
        print 'set size -- ', self.vector_space.shape[0]

        for l, observation in enumerate(self.vector_space):

            #just a way to keep track of where it's at
            if l % 30 == 0: 
                sys.stdout.write(str(l))
            

            dist, idx = self.model.kneighbors(observation, n_neighbors=2)
            dist, idx = dist[0], idx[0]

            #operating under the assumption that
            #the first neighbor returned is the point itself, so we keep the second distance

            #find out which position the current index is in
            # remove_this_arg = [k for k, i in enumerate(idx) if i == index]
            # dist = [i for k, i in enumerate(dist) if i != remove_this_arg[0]]
            


            hist_bag.append(dist[1])


        return pd.Series(hist_bag)

    def plot_histogram(self, histogram_arr, text_pos=None):
        
        figure = plt.figure(figsize=(10,5))
        plt.hist(histogram_arr, bins=50, alpha=0.75) 
        plt.title("not scaled") 
        if text_pos:
            self.distribution_stats_text_label(text_pos[0], text_pos[1], histogram_arr)
        plt.show()


    def distribution_stats_text_label(self, position_x, position_y, data):
        label_position_decrement = 0.08 * position_y
        plt.text(position_x, position_y, "Skewness: {0:.2f}".format(skew(data))) 
        plt.text(position_x, position_y - label_position_decrement, "Mean: {0:.2f}".format(data.mean())) 
        plt.text(position_x, position_y - 2 * label_position_decrement, "Std: {0:.2f}".format(data.std())) 
        return None

    def get_all_comparisons(self, main_str, strings, comparison_algo, threshold=None):
        '''
        I: string, list of strings, string comparison algo e.g. Levenshtein, threshold
        O: list of tuples (match score, string)
        
        Takes a target string and compares it to the rest of strings in the list
        USE --

        get_all_comparisons('check', ['check1'], fuzz.ratio) 
        >>> [(91, 'check1')]
        '''
        match_bag = []

        for str_ in strings:
            match_rating = comparison_algo(main_str, str_)

            if threshold:
                if match_rating > threshold:
                    match_bag.append((match_rating, str_))
            else:
                match_bag.append((match_rating, str_))

        return match_bag
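A hypothetical end-to-end sketch for Deduper_NN (the file name and record text are illustrative; the input is assumed to be one record per line):

# Hypothetical usage of Deduper_NN defined above.
dedupe = Deduper_NN()
dedupe.read_in_the_file('records.txt')   # illustrative file name
dedupe.build_vectorizer(dedupe.orig_file, model_type='tfidf',
                        ngrams=3, tokenizer='char')
dedupe.fit_model('brute', params={'metric': 'cosine'})

# Lines within the cosine radius of the query string are treated as duplicates.
print(dedupe.predict('some possibly duplicated record'))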
Example No. 23
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine',
                            n_neighbors=10).fit(X)
    time_approx = []
    time_exact = []
    accuracy = []

    for i in range(n_iter):
        # pick one query at random to study query time variability in LSHForest
        query = queries[[rng.randint(0, n_queries)]]

        t0 = time.time()
        exact_neighbors = nbrs.kneighbors(query, return_distance=False)
        time_exact.append(time.time() - t0)

        t0 = time.time()
        approx_neighbors = lshf.kneighbors(query, return_distance=False)
        time_approx.append(time.time() - t0)

        accuracy.append(np.in1d(approx_neighbors, exact_neighbors).mean())

    average_time_exact = np.mean(time_exact)
    average_time_approx = np.mean(time_approx)
    speedup = np.array(time_exact) / np.array(time_approx)
    average_speedup = np.mean(speedup)
    mean_accuracy = np.mean(accuracy)
    std_accuracy = np.std(accuracy)
    print("Index size: %d, exact: %0.3fs, LSHF: %0.3fs, speedup: %0.1f, "
          "accuracy: %0.2f +/-%0.2f" %
          (n_samples, average_time_exact, average_time_approx, average_speedup,
           mean_accuracy, std_accuracy))
Example No. 24
    # svm = SVC(probability=True)

    start = time.time()
    model = lshf.fit(Xvec_train)
    print("Model Training", (time.time() - start) / 60)

    resultset = []
    k = []
    f_score = []
    vrec = []
    vacc = []
    for i in range(1, 5):
        start = time.time()
        print("Current Iteration k", i)
        modelset = []
        distances, indices = lshf.kneighbors(Xvec_validation, n_neighbors=i)
        print("Current Iteration k time", (time.time() - start) / 60)
        # print(indices)
        for row in indices:
            # print(row)
            rowset = []
            for item in row:
                rowset = rowset + Y_train[item]
            modelset.append(list(set(rowset)))
        metrics = (calculateFScore(Y_validation, modelset), i)
        k.append(i)
        vrec.append(metrics[0][2])
        vacc.append(metrics[0][1])
        f_score.append(metrics[0][0])
        resultset.append(metrics)
Example No. 25
   components+=1

pca = PCA(n_components=components)
train_data_fit=pca.fit(train_input[:int((trainpoints*num_train_imgs)*.1)])
train_data_transform=pca.transform(train_input[:])
del train_input
train_data=train_data_transform#np.vstack((train_data_fit,train_data_transform))
test_data=pca.transform(test_input)

for val in param_values:
    
    candidates=val
    
    lshf = LSHForest(n_estimators=estimators,n_candidates=candidates,min_hash_match=minhash,n_neighbors=k,random_state=1).fit(train_data)

    approx_neighbors = lshf.kneighbors(test_data, return_distance=False)
    gray_prediction=[]
    for query in approx_neighbors:
            nbrs = []
            for j in query:
                nbrs.append(train_label[j][0])
            gray_val=np.mean(nbrs)
            gray_prediction.append(gray_val)
    
    precisions, recalls = metrics.get_precision_recall_values(gray_prediction, test_label.flatten(), 50, False)
    f_scores=2.0*(precisions*recalls)/(precisions+recalls)
    f_max = np.nanmax(f_scores)
    values.append(val)
    f_maxes.append(f_max)
    print f_max
Example No. 26
class DocSim:
    def __init__(self):
        self.model = None
        self.doc_corpus = Corpus()
        self.vectorized_docs = []
        self.vectorized_docs_path = ''
        self.lsh = None

    def set_model(self, model):
        self.model = model

    def set_doc(self, doc_corpus):
        self.doc_corpus = doc_corpus

    def vectorized(self, num_topics=DefaultSetting.NUMBER_TOPICS):
        self.lsh = LSHForest(n_estimators=DefaultSetting.HASH_SIZE, n_neighbors=10)
        docs_bow = [self.doc_corpus.dictionary.doc2bow(content.split(u' '))
                    for content in self.doc_corpus.documents]
        for doc_bow in docs_bow:
            vectorized_doc = [x[1] for x in self.model.get_document_topics(doc_bow, minimum_probability=0.0)]
            self.vectorized_docs.append(vectorized_doc)
        self.lsh.fit(self.vectorized_docs)

    def save(self, path=DefaultSetting.DIRECTORY, prefix_name=DefaultSetting.PREFIX_NAME):
        self.vectorized_docs_path = path + '/' + prefix_name + '.plk'
        with open(self.vectorized_docs_path, 'wb') as handle:
            pickle.dump(self.vectorized_docs, handle)

    def load(self, file):
        self.vectorized_docs_path = file
        with open(file, 'rb') as handle:
            self.vectorized_docs = pickle.load(handle)

    def query(self, documents, save=DefaultSetting.SAVE_RESULT_QUERY,
              path=DefaultSetting.DIRECTORY, prefix_name=DefaultSetting.PREFIX_NAME):
        vectorized_docs = []
        for document in documents:
            doc_bow = self.doc_corpus.dictionary.doc2bow(document.split(u' '))
            vectorized_doc = [x[1] for x in self.model.get_document_topics(doc_bow, minimum_probability=0.0)]
            vectorized_docs.append(vectorized_doc)
        distance, indices = self.lsh.kneighbors(vectorized_docs)
        if save:
            saved_file_path = path + '/' + prefix_name + '_res.txt'
            writer = codecs.open(saved_file_path, 'w', 'utf8')
            for i in range(len(documents)):
                writer.write('Input: \n\t' + documents[i] + '\n')
                writer.write('-'*100 + '\n')
                writer.write('Similar documents: \n')
                for idx in indices[i]:
                    writer.write('\tkey: ' + self.doc_corpus.key[idx] + '\n')
                    writer.write('\ttitle: ' + self.doc_corpus.titles[idx] + '\n')
                    writer.write('\tcontent: ' + self.doc_corpus.documents[idx] + '\n\n')
                writer.write('='*100 + '\n')
        else:
            for i in range(len(documents)):
                print 'Input: '
                print documents[i]
                print '-' * 100
                print 'Similar documents: '
                for idx in indices[i]:
                    print '\tkey: ', self.doc_corpus.key[idx]
                    print '\ttitle: ', self.doc_corpus.titles[idx]
                    print '\tcontent: ', self.doc_corpus.documents[idx]
                print '=' * 100
Example No. 27
    sum = 0
    components=0
    while sum<=.995:
       sum+=variance[components]
       components+=1
    
    pca = PCA(n_components=components)
    train_data_fit=pca.fit(train_input[:int((trainpoints*num_train_imgs)*.1)])
    train_data_transform=pca.transform(train_input[:])
    del train_input
    train_data=train_data_transform
    test_data=pca.transform(test_input)
    
    lshf = LSHForest(n_estimators=estimators,n_candidates=candidates,min_hash_match=minhash,n_neighbors=nbrs_max,random_state=1).fit(train_data)

    distances, approx_neighbors= lshf.kneighbors(test_data, return_distance=True)
    for n in range(len(k_vals)):
        k = k_vals[n]
        
        lowest_distances=[]
        for query in distances:
            lowest_indices=query.argsort()[:k]
            lowest_distances.append(lowest_indices)
        
        
        gray_prediction=[]
        point=0
        for query in approx_neighbors:
            nbrs = []
            n_nearest=[]
            for i in lowest_distances[point]:
Example No. 28
class FaceRecognition():
    def __init__(self):
        self.unknown = ''
        self.same_person_num = 1
        self.has_cal_dist = []
        self.NeighbourNum = 10
        # If the administrator loads new pictures, they go into the given person's directory under all_pic_data_folder (the image file and its feature file share the same name)
        self.all_pic_feature_data_folder = '/data/liubo/face/research_feature_self'     # the research institute's model stores features directly
        # Keeping the images makes it easy to review results later, display them on the front end, and let the administrator label them
        self.all_pic_data_folder = '/data/liubo/face/research_self'
        if not os.path.exists(self.all_pic_data_folder):
            os.makedirs(self.all_pic_data_folder)
        if not os.path.exists(self.all_pic_feature_data_folder):
            os.makedirs(self.all_pic_feature_data_folder)
        self.n_neighbors = 10
        self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors)
        self.all_labels = []
        self.all_pic_feature = []
        self.same_pic_id = 2
        self.must_be_same_id = 1
        self.must_be_not_same_id = 0
        self.maybe_same_id = 3
        self.new_person_str = 'new_person_'
        self.current_new_person_id = self.find_current_new_person_id()
        self.must_same_str = '_Must_Same'
        self.maybe_same_str = '_Maybe_same'
        self.load_time = time.time()
        self.user_count = {}
        self.upper_threshold = upper_verif_threshold
        self.lower_threshold = lower_verif_threshold
        self.same_pic_threshold = same_pic_threshold
        self.trans_dic = {self.same_pic_id: 'same_pic', self.must_be_same_id: 'must_same_id',
                          self.must_be_not_same_id: 'must_not_same_id', self.maybe_same_id: 'maybe_same_id'}
        self.nearest = deque(maxlen=nearest_num)
        self.verification_same_person = 0


    def cal_nearest_sim(self, current_feature):
        nearest_sim_list = []
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')

        try:
            length = len(self.nearest)
            for k in range(length):
                try:
                    person_name, pre_feature = self.nearest[k]
                    # time is no longer considered, only image similarity

                    this_sim = pw.cosine_similarity(np.reshape(np.asarray(pre_feature), (1, len(pre_feature))),
                                                        np.reshape(np.asarray(current_feature), (1, len(current_feature))))
                    nearest_sim_list.append((this_sim, verification_model.predict(this_sim), person_name))
                except:
                    log_file.write('cal_nearest_sim error'+'\n')
                    traceback.print_exc()
                    continue
            return nearest_sim_list
        except:
            traceback.print_exc()
            return nearest_sim_list


    def load_train_data(self, data_folder):
        # read the picture features directly and return all features and labels
        all_pic_feature = []
        all_label = []
        person_list = os.listdir(data_folder)
        for person in person_list:
            if person == self.unknown or self.must_same_str in person or self.maybe_same_str in person:
                continue
            person_path = os.path.join(data_folder, person)
            pic_feature_list = os.listdir(person_path)
            for pic_feature_path in pic_feature_list:
                pic_feature = msgpack_numpy.load(open(os.path.join(person_path, pic_feature_path), 'rb'))
                all_pic_feature.append(pic_feature)
                all_label.append(person)
        all_pic_feature = np.asarray(all_pic_feature)
        all_label = np.asarray(all_label)
        return all_pic_feature, all_label


    def find_current_new_person_id(self):
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')

        old_person_id = []
        # this folder stores the original pictures
        person_list = os.listdir(self.all_pic_data_folder)
        for person in person_list:
            if person.startswith(self.new_person_str):
                tmp = person[len(self.new_person_str):].split('_')
                if len(tmp) > 0:
                    this_id = int(tmp[0])
                    old_person_id.append(this_id)
        if len(old_person_id) == 0:
            current_new_person_id = 0
        else:
            current_new_person_id = max(old_person_id) + 1
        log_file.write('\t'.join(map(str, ['current_new_person_id :', current_new_person_id]))+'\n')
        log_file.close()
        return current_new_person_id


    def extract_pic_feature(self, pic_path):
        try:
            result = extract_feature_from_binary_data(open(pic_path, 'rb'))
            if result == None:
                return
            face_num, all_frames, all_feature = result
            biggest_face_index = find_big_face(all_frames)
            pic_frame = all_frames[biggest_face_index]
            pic_feature = all_feature[biggest_face_index]
            x, y, width, height = pic_frame
            face_pic = cv2.imread(pic_path)[y:y+width, x:x+height, :]
            return face_pic, pic_feature
        except:
            traceback.print_exc()
            return None


    def load_all_data(self):
        # load all previously labelled data (features are read in directly) and keep it in the LSH Forest to make distance computation convenient
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')
        try:
            all_pic_feature, all_label = self.load_train_data(self.all_pic_feature_data_folder)
            train_label = np.asarray(all_label)
            if len(all_pic_feature) == len(train_label) and len(train_label) > 0:
                start = time.time()
                self.lshf.fit(all_pic_feature, train_label)
                self.all_pic_feature = list(all_pic_feature)
                self.all_labels = list(train_label)
                end = time.time()
                self.load_time = end
                self.user_count = Counter(self.all_labels)
                log_file.write('\t'.join(map(str, [self.user_count,
                                           'fit all data time :', (end - start)]))+'\n')
                log_file.close()
        except:
            traceback.print_exc()
            log_file.close()
            return


    def save_pic_feature(self, pic_path, person_name):
        #  generate features for files that already exist and save them under the given folder; used when the administrator adds new pictures (after a new picture is added, extract its feature and save it to the folder)
        person_pic_folder_path = os.path.join(self.all_pic_data_folder, person_name)
        person_feature_folder_path = os.path.join(self.all_pic_feature_data_folder, person_name)
        if not os.path.exists(person_pic_folder_path):
            os.makedirs(person_pic_folder_path)
        if not os.path.exists(person_feature_folder_path):
            os.makedirs(person_feature_folder_path)
        pic_name = os.path.split(pic_path)[-1]
        # feature file
        person_feature_path = os.path.join(person_feature_folder_path, pic_name)
        # face image file
        person_pic_path = os.path.join(person_pic_folder_path, pic_name)
        result = extract_feature_from_binary_data(open(pic_path, 'rb'))
        if result == None:
            return
        face_num, all_frames, all_feature = result
        biggest_face_index = find_big_face(all_frames)
        pic_frame = all_frames[biggest_face_index]
        pic_feature = all_feature[biggest_face_index]
        x, y, width, height = pic_frame
        face_pic = cv2.imread(pic_path)[y:y+width, x:x+height, :]
        cv2.imwrite(person_pic_path, face_pic)
        msgpack_numpy.dump(pic_feature, open(person_feature_path, 'wb'))


    def add_all_new_pic(self):
        '''
            Load every file added since the last load into the LSH Forest (it may be a brand-new person, or new pictures of an existing person).
            Walk the folder (self.all_pic_feature_data_folder) and use the file timestamps to decide whether a picture's feature needs to be added.
            After the administrator labels a picture, the system moves the face image and the feature file together, so here only the feature and its label have to be added to the LSH Forest.
        '''
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')
        start = time.time()
        person_list = os.listdir(self.all_pic_data_folder)
        add_num = 0
        for person in person_list:
            if self.must_same_str in person or self.maybe_same_str in person or self.new_person_str in person:
                continue
            person_path = os.path.join(self.all_pic_data_folder, person)
            if not os.path.isdir(person_path):
                continue
            pic_list = os.listdir(person_path)
            for pic in pic_list:
                pic_path = os.path.join(person_path, pic)
                last_modify_time = os.stat(pic_path).st_mtime  # use the modification time, not the access time
                if last_modify_time > self.load_time:
                    request = {
                        "label": person,
                        "request_type": 'add',
                        "one_pic_feature": pic_path
                    }
                    url = "http://127.0.0.1:%d/"%port
                    result = image_request(request, url)
                    try:
                        add_flag = json.loads(result)["add"]
                        if not add_flag:    # the add request failed
                            log_file.write('\t'.join(map(str, ['no add file :', pic_path]))+'\n')
                        else:
                            add_num += 1
                    except:
                        log_file.write('\t'.join(map(str, ['no add file :', pic_path]))+'\n')
                        traceback.print_exc()
                        continue
        end = time.time()
        if add_num > 0:
            self.load_time = end
            log_file.write('\t'.join(map(str, ['self.load_time', self.load_time]))+'\n')
            log_file.write('\t'.join(map(str, ['add pic num :', add_num,
                                               'Dynamic increase time :', (end - start)]))+'\n')
            log_file.close()
        else:
            log_file.close()


    def add_one_new_pic(self, pic_path, label):
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')
        try:
            # the image has already been resized to the required shape when it is read
            result = self.extract_pic_feature(pic_path)
            if result is None:
                return False
            face_pic, pic_feature = result
            self.add_one_pic(pic_feature, label)
            pic_name = os.path.split(pic_path)[1]
            this_person_pic_folder = os.path.join(self.all_pic_data_folder, label)
            this_person_feature_folder = os.path.join(self.all_pic_feature_data_folder, label)
            if not os.path.exists(this_person_pic_folder):
                os.makedirs(this_person_pic_folder)
            if not os.path.exists(this_person_feature_folder):
                os.makedirs(this_person_feature_folder)
            # store the feature for this picture directly, and save the face image file as well
            this_pic_feature_name = os.path.join(this_person_feature_folder, pic_name + '.p')
            msgpack_numpy.dump(pic_feature, open(this_pic_feature_name, 'wb'))
            this_pic_face_name = os.path.join(this_person_pic_folder, pic_name + '.jpg')
            cv2.imwrite(this_pic_face_name, face_pic)
            log_file.write('\t'.join(map(str, [pic_path, this_pic_face_name]))+'\n')
            return True
        except:
            traceback.print_exc()
            return False


    def add_one_pic(self, one_pic_feature, pic_label):
        '''
            Add one image feature to the LSH Forest and append the corresponding label to self.all_labels
            (a standalone usage sketch of this pattern appears at the end of this snippet).
            :param one_pic_feature: array, shape (1, 1024)
            :param pic_label: (1,)
            :return:
        '''
        one_pic_feature = np.asarray(one_pic_feature)
        self.lshf.partial_fit(one_pic_feature.reshape(1, FEATURE_DIM), pic_label)
        self.all_labels.append(pic_label)
        self.all_pic_feature.append(np.reshape(one_pic_feature, newshape=(1, one_pic_feature.size)))


    def find_k_neighbors_with_lsh(self, one_pic_feature):
        '''
            :param one_pic_feature: image feature
            :return: the neighbours' features are needed as well, so that the pairwise similarity can be computed
        '''
        try:
            one_pic_feature = np.asarray(one_pic_feature)
            tmp = self.lshf.kneighbors(one_pic_feature.reshape(1, FEATURE_DIM), n_neighbors=self.n_neighbors, return_distance=True)
            neighbors_label = np.asarray(self.all_labels)[tmp[1][0]]
            neighbors_feature = np.asarray(self.all_pic_feature)[tmp[1][0]]
            pair_score_list = []
            cos_sim_list = []
            for index in range(len(neighbors_feature)):
                pair_score = pw.cosine_similarity(neighbors_feature[index].reshape(1, FEATURE_DIM),
                                     one_pic_feature.reshape(1, FEATURE_DIM))[0][0]
                cos_sim_list.append(pair_score)
                pair_score_list.append(verification_model.predict(pair_score))
            result = zip(cos_sim_list, pair_score_list, neighbors_label)
            # result = self.filter_result(result)
            # result.sort(key=lambda x:x[0], reverse=True)
            return result
        except:
            return None


    def filter_result(self, result):
        '''
            :param result: [(cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label)] sorted by cos_sim in descending order
            :return: the filtered list of (cos_sim, same_person_result, label) tuples
        '''
        # for entries with identical scores, drop the new_person ones
        tmp_dic = {}
        for element in result:
            try:
                this_score, this_same_person_result, this_label = element
                this_score = float(this_score)
                if this_score in tmp_dic:
                    if self.new_person_str in this_label:
                        continue
                    else:
                        tmp_dic[this_score] = element
                else:
                    tmp_dic[this_score] = element
            except:
                traceback.print_exc()
                continue
        result = tmp_dic.values()
        return result


    def evaluate_result(self, result):
        '''
            :param result: [(cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label)]
            :return: this_id (Must_same, Must_not_same, May_same), this_label (the person's name)
        '''
        for index, element in enumerate(result):
            this_score, this_same_person_result, this_label = element
            if this_same_person_result == self.verification_same_person and this_score > self.same_pic_threshold:
                return self.same_pic_id, this_label
            if this_same_person_result == self.verification_same_person and this_score > self.upper_threshold:
                return self.must_be_same_id, this_label
            if this_same_person_result == self.verification_same_person and this_score > self.lower_threshold:
                return self.maybe_same_id, this_label
        return self.must_be_not_same_id, ''


    def recognize_online_cluster(self, image, image_id):
        '''
            :param image: recognize the received image and add it to the LSH Forest; compute a probability from the distance
                            (different distances correspond to different accuracies, and the thresholds are derived from the existing distances);
                            compare against the preset thresholds to decide whether this is a new person, a known person, or an uncertain match.
            :return:
        '''
        start = time.time()
        need_add = False
        need_save = False
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')
        log_file.write('\t'.join(map(str, ["receive image", image_id, time.time()])) + '\n')
        feature_str = ''
        try:
            image = base64.decodestring(image)
            image = zlib.decompress(image)
            im = cv2.imdecode(np.fromstring(image, dtype=np.uint8), 1)
            log_file.write('\t'.join(map(str, ['shape :', im.shape[0], im.shape[1]])) + '\n')
            # filter out images that are too small
            if im.shape[0] < size_threshold or im.shape[1] < size_threshold:
                log_file.write('\t'.join(map(str, ['stat recognize_time :', (time.time() - start), 'small_size'])) + '\n')
                log_file.close()
                return self.unknown, 1.0, feature_str, need_save
            # filter out blurry images
            blur_sign, blur_var = is_blur(cv2.resize(im, (96, 96)))
            if blur_sign:
                log_file.write('\t'.join(map(str, ['stat recognize_time :', (time.time() - start), 'blur_filter', blur_var])) + '\n')
                log_file.close()
                return self.unknown, 1.0, feature_str, need_save
            # save the received image
            # img_file = '/tmp/research_face/%s.jpg' %image_id
            time_slot = get_time_slot(image_id)
            if time_slot is None:
                time_slot = 'error'
            time_slot_dir = os.path.join(tmp_face_dir, time_slot)
            if not os.path.exists(time_slot_dir):
                os.makedirs(time_slot_dir)
            img_file = os.path.join(time_slot_dir, image_id+'.jpg')
            cv2.imwrite(img_file, im)
        except:
            traceback.print_exc()
            log_file.close()
            return self.unknown, 1.0, feature_str, need_save
        try:
            # pipeline: find the nearest pictures; compute the probability; do online clustering; add to the LSH Forest
            result = self.extract_pic_feature(img_file)
            if result is None:
                log_file.write('\t'.join(map(str, ['stat not_find_face', 'time :', (time.time() - start)]))+'\n')
                log_file.close()
                return self.unknown, 1.0, feature_str, need_save
            face_pic, im_feature = result

            try:
                # nearest_sim_list has the same format as dist_label_list, so the two lists can be merged and evaluated together (no need to take time into account)
                # after a name has been recognized, the name and the feature are put into self.nearest
                nearest_sim_list = self.cal_nearest_sim(current_feature=im_feature)
            except:
                traceback.print_exc()
                nearest_sim_list = []
            log_file.write('\t'.join(map(str, ['nearest_sim_list :', map(str, nearest_sim_list)])) + '\n')
            feature_str = base64.b64encode(msgpack_numpy.dumps(im_feature))
            log_file.write('\t'.join(map(str, ['extract_feature_time :', (time.time() - start)]))+'\n')
            # find the nearest pictures --- use the LSH Forest to fetch the nearest 10 pictures, then compute the distance to each one

            tmp_list = self.find_k_neighbors_with_lsh(im_feature)
            nearest_sim_list.sort(key=lambda x: x[0], reverse=True)
            if tmp_list is not None:
                nearest_sim_list.extend(tmp_list)
            dist_label_list = nearest_sim_list[:]

            # evaluate the candidate list
            log_file.write('\t'.join(map(str, ['dist_label_list :', map(str, dist_label_list)])) + '\n')
            if not dist_label_list:
                this_id = self.must_be_not_same_id
                this_label = self.new_person_str + str(self.current_new_person_id)
            else:
                # compute the probability --- derived from the distance
                this_id, this_label = self.evaluate_result(dist_label_list)
            # regardless of the probability, always append the latest picture to self.nearest
            self.nearest.append((this_label, im_feature))
            log_file.write('\t'.join(map(str, ['self.nearest :', map(str, self.nearest)])) + '\n')
            # online clustering --- use the distance to decide whether to create a new person or attach the picture to an existing one
            if this_id == self.same_pic_id:
                need_add = False
            elif this_id == self.must_be_same_id:
                need_add = False
                need_save = True
                this_person_pic_folder = os.path.join(self.all_pic_data_folder, this_label+self.must_same_str)
                this_person_feature_folder = os.path.join(self.all_pic_feature_data_folder, this_label+self.must_same_str)
            elif this_id == self.must_be_not_same_id:
                this_label = self.new_person_str + str(self.current_new_person_id)
                self.current_new_person_id += 1
                this_person_pic_folder = os.path.join(self.all_pic_data_folder, this_label)
                this_person_feature_folder = os.path.join(self.all_pic_feature_data_folder, this_label)
                need_add = True
                need_save = True
            elif this_id == self.maybe_same_id:
                this_person_pic_folder = os.path.join(self.all_pic_data_folder, this_label + self.maybe_same_str)
                this_person_feature_folder = os.path.join(self.all_pic_feature_data_folder, this_label + self.maybe_same_str)
                need_add = False  # do not add when the probability falls in the gray area; add in all other cases
                need_save = True
            else:
                log_file.write('\t'.join(map(str, ['error para :', this_id]))+'\n')
            if need_save:
                try:
                    if not os.path.exists(this_person_pic_folder):
                        os.makedirs(this_person_pic_folder)
                    if not os.path.exists(this_person_feature_folder):
                        os.makedirs(this_person_feature_folder)
                    # store the feature for this picture directly, and save the image file as well
                    this_pic_feature_name = os.path.join(this_person_feature_folder, image_id+'.p')
                    msgpack_numpy.dump(im_feature, open(this_pic_feature_name, 'wb'))
                    this_pic_face_name = os.path.join(this_person_pic_folder, image_id+'.jpg')
                    cv2.imwrite(this_pic_face_name, face_pic)
                except:
                    traceback.print_exc()
                    return self.unknown, 1.0, feature_str, False
            # add to the LSH Forest --- partial_fit
            if need_add:
                self.add_one_pic(im_feature, this_label)
                # a file name can be built from the label and image_id, to decide whether the file should be stored [it can be stored on the server and locally at the same time]
            if this_id == self.same_pic_id or this_id == self.must_be_not_same_id or this_id == self.must_be_same_id:
                end = time.time()
                log_file.write('\t'.join(map(str, ['stat recognize_time :',(end - start), 'this_id :', self.trans_dic.get(this_id)]))+'\n')
                log_file.close()
                need_save = True
                return this_label.replace(self.must_same_str, ''), str(dist_label_list[0][0]), str(feature_str), str(need_save)
            else:
                # gray area: do not show the person's name
                end = time.time()
                log_file.write('\t'.join(map(str, ['stat gray_area :',(end - start)]))+'\n')
                log_file.close()
                return self.unknown, str(dist_label_list[0][0]), str(feature_str), str(False)
        except:
            traceback.print_exc()
            log_file.close()
            return self.unknown, str(100.0), str(feature_str), str(False)
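
# ---------------------------------------------------------------------------
# Rough usage sketch of the pattern the class above relies on (not part of the
# original snippet): an LSHForest index that is grown incrementally with
# partial_fit and queried with kneighbors, followed by an exact cosine check.
# FEATURE_DIM and the random data are assumptions for illustration only, and
# the sketch targets the older scikit-learn releases that still ship LSHForest.
import numpy as np
from sklearn.neighbors import LSHForest
from sklearn.metrics.pairwise import cosine_similarity

FEATURE_DIM = 1024                      # assumed feature size
rng = np.random.RandomState(0)

index = LSHForest(n_estimators=20, n_candidates=200, random_state=42)
features = rng.rand(10, FEATURE_DIM)    # stand-ins for stored face features
labels = ['person_%d' % i for i in range(10)]
index.fit(features)

# add one new picture without refitting the whole forest
new_feature = rng.rand(1, FEATURE_DIM)
index.partial_fit(new_feature)
labels.append('person_new')
features = np.vstack([features, new_feature])

# query: approximate neighbours first, exact cosine similarity afterwards
query = rng.rand(1, FEATURE_DIM)
_, idx = index.kneighbors(query, n_neighbors=5)
for i in idx[0]:
    sim = cosine_similarity(features[i].reshape(1, -1), query)[0][0]
    # a verification model / thresholds would be applied to `sim` here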
Exemplo n.º 29
0
pad=4
numpoints=50
input_img = np.float16(mahotas.imread('input/test/0.tif'))
input_bordered = border.createBorder(pad,input_img)
#input = border.outputMatrix(pad,input_img)
train_data=knn_trainer_helper.sortClasses(pad,numpoints,knn_trainer_helper.readImages(209,'input/train/','labels/train/'),seed=42)
t1 = time.time()
print 'init took '+str(t1-start)
lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=k,random_state=1).fit(train_data[0])
t2 = time.time()
print 'fit took '+str(t2-t1)
full_predicted=[]
for i in range(1024):
    predict_start=time.time()
    input = border.matrixRow(pad,input_bordered,i)
    approx_neighbors = lshf.kneighbors(input, return_distance=False)

    y_hat=[]
    for query in approx_neighbors:
        temp = []
        for j in range(len(query)):
            temp.append(train_data[1][query[j]])
        y_hat.append(stats.mode(temp)[0][0])
    full_predicted.append(y_hat)
    t3=time.time()
    print 'predict line ' + str(i) + ' took '+str(t3-predict_start)
    

output = np.array(full_predicted).reshape((side,side))
precisions, recalls = metrics.get_precision_recall_values(full_predicted, label, 100, True)
fp = 'k'+str(k)+'p'+str(pad)+'s'+str(side)+'n'+str(numpoints)
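
# ---------------------------------------------------------------------------
# Self-contained sketch of the approximate-kNN voting pattern used above
# (assumed toy data; `k`, the image loaders and the helper modules of the
# original script are not available here): fit an LSHForest on labelled
# feature rows, query approximate neighbours, and predict the class by
# majority vote with scipy.stats.mode.
import numpy as np
from scipy import stats
from sklearn.neighbors import LSHForest

k_assumed = 5                             # assumed neighbour count
rng = np.random.RandomState(1)
train_X = rng.rand(200, 27)               # e.g. flattened 3x3x3 patches
train_y = rng.randint(0, 2, size=200)     # binary labels

lshf_demo = LSHForest(n_estimators=20, n_candidates=200,
                      n_neighbors=k_assumed, random_state=1).fit(train_X)

test_X = rng.rand(10, 27)
approx_neighbors_demo = lshf_demo.kneighbors(test_X, return_distance=False)

y_hat_demo = []
for query in approx_neighbors_demo:
    votes = [train_y[j] for j in query]
    y_hat_demo.append(stats.mode(votes)[0][0])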
Exemplo n.º 30
0
class SMOTE(OverSampler):
    """Class to perform over-sampling using SMOTE.

    This object is an implementation of SMOTE - Synthetic Minority
    Over-sampling Technique, and the variations Borderline SMOTE 1, 2 and
    SVM-SMOTE.

    Parameters
    ----------
    ratio : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio will correspond to the number
        of samples in the minority class over the number of samples
        in the majority class.

    random_state : int or None, optional (default=None)
        Seed for random number generation.

    verbose : bool, optional (default=True)
        Whether or not to print information about the processing.

    k : int, optional (default=5)
        Number of nearest neighbours to use to construct synthetic samples.

    m : int, optional (default=10)
        Number of nearest neighbours to use to determine if a minority sample
        is in danger.

    out_step : float, optional (default=0.5)
        Step size when extrapolating.

    kind : str, optional (default='regular')
        The type of SMOTE algorithm to use one of the following options:
        'regular', 'borderline1', 'borderline2', 'svm'

    nn_method : str, optional (default='exact')
        The nearest neighbors method to use which can be either: 'approximate'
        or 'exact'. 'approximate' will use LSH Forest while 'exact' will be an
        exact search.

    Attributes
    ----------
    ratio_ : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio will correspond to the number
        of samples in the minority class over the number of samples
        in the majority class.

    rs_ : int or None, optional (default=None)
        Seed for random number generation.

    min_c_ : str or int
        The identifier of the minority class.

    max_c_ : str or int
        The identifier of the majority class.

    stats_c_ : dict of str/int : int
        A dictionary in which the number of occurrences of each class is
        reported.

    Notes
    -----
    See the original papers: [1]_, [2]_, [3]_ for more details.

    It does not support multiple classes automatically, but can be called
    multiple times.

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE:
       synthetic minority over-sampling technique," Journal of artificial
       intelligence research, 321-357, 2002.

    .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
       over-sampling method in imbalanced data sets learning," Advances in
       intelligent computing, 878-887, 2005.

    .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling for
       imbalanced data classification," International Journal of Knowledge
       Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2001.

    """

    def __init__(self, ratio='auto', random_state=None, verbose=True,
                 k=5, m=10, out_step=0.5, kind='regular', nn_method='exact',
                 n_jobs=-1, **kwargs):
        """Initialisation of SMOTE object.

        Parameters
        ----------
        ratio : str or float, optional (default='auto')
            If 'auto', the ratio will be defined automatically to balance
            the dataset. Otherwise, the ratio will correspond to the
            number of samples in the minority class over the number of
            samples in the majority class.

        random_state : int or None, optional (default=None)
            Seed for random number generation.

        verbose : bool, optional (default=True)
            Whether or not to print information about the
            processing.

        k : int, optional (default=5)
            Number of nearest neighbours to use to construct synthetic
            samples.

        m : int, optional (default=10)
            Number of nearest neighbours to use to determine if a minority
            sample is in danger.

        out_step : float, optional (default=0.5)
            Step size when extrapolating.

        kind : str, optional (default='regular')
            The type of SMOTE algorithm to use one of the following
            options: 'regular', 'borderline1', 'borderline2', 'svm'

        nn_method : str, optional (default='exact')
            The nearest neighbors method to use which can be either:
            'approximate' or 'exact'. 'approximate' will use LSH Forest while
            'exact' will be an exact search.

        n_jobs : int, optional (default=-1)
            Number of threads to run the algorithm when it is possible.

        """
        super(SMOTE, self).__init__(ratio=ratio,
                                    random_state=random_state,
                                    verbose=verbose)

        # Check the number of thread to use
        self.n_jobs = n_jobs

        # --- The type of smote
        # This object can perform regular smote over-sampling, borderline 1,
        # borderline 2 and svm smote. Since the algorithms are fairly simple
        # they share most methods.
        possible_kind = ('regular', 'borderline1', 'borderline2', 'svm')
        if kind in possible_kind:
            self.kind = kind
        else:
            raise ValueError('Unknown kind for SMOTE algorithm.')

        # --- Verbose
        # Control whether or not status and progress information should be printed
        self.verbose = verbose

        # --- Nearest Neighbours for synthetic samples
        # The smote algorithm uses the k-th nearest neighbours of a minority
        # sample to generate new synthetic samples.
        self.k = k

        # --- NN object
        # Import the NN object from scikit-learn library. Since in the smote
        # variations we must first find samples that are in danger, we
        # initialize the NN object differently depending on the method chosen
        if kind == 'regular':
            # Regular smote does not look for samples in danger, instead it
            # creates synthetic samples directly from the k-th nearest
            # neighbours with no filtering
            if nn_method == 'exact':
                self.nearest_neighbour_ = NearestNeighbors(n_neighbors=k + 1,
                                                           n_jobs=self.n_jobs)
            elif nn_method == 'approximate':
                self.nearest_neighbour_ = LSHForest(n_estimators=50,
                                                    n_candidates=500,
                                                    n_neighbors=k+1)
        else:
            # Borderline1, 2 and SVM variations of smote must first look for
            # samples that could be considered noise and samples that live
            # near the boundary between the classes. Therefore, before
            # creating synthetic samples from the k-th nns, it first looks
            # for m nearest neighbors to decide whether or not a sample is
            # noise or near the boundary.
            if nn_method == 'exact':
                self.nearest_neighbour_ = NearestNeighbors(n_neighbors=m + 1,
                                                           n_jobs=self.n_jobs)
            elif nn_method == 'approximate':
                self.nearest_neighbour_ = LSHForest(n_estimators=50,
                                                    n_candidates=500,
                                                    n_neighbors=m+1)

            # --- Nearest Neighbours for noise and boundary (in danger)
            # Before creating synthetic samples we must first decide if
            # a given entry is noise or in danger. We use m nns in this step
            self.m = m

        # --- SVM smote
        # Unlike the borderline variations, the SVM variation uses the support
        # vectors to decide which samples are in danger (near the boundary).
        # Additionally it also introduces extrapolation for samples that are
        # considered safe (far from boundary) and interpolation for samples
        # in danger (near the boundary). The level of extrapolation is
        # controlled by the out_step.
        if kind == 'svm':
            # Store extrapolation size
            self.out_step = out_step

            # Store SVM object with any parameters
            self.svm_ = SVC(**kwargs)

    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(SMOTE, self).fit(X, y)

        return self

    def _in_danger_noise(self, samples, y, kind='danger'):
        """Estimate if a set of sample are in danger or not.

        Parameters
        ----------
        samples : ndarray, shape (n_samples, n_features)
            The samples to check if either they are in danger or not.

        y : ndarray, shape (n_samples, )
            The true label in order to check the neighbour labels.

        kind : str, optional (default='danger')
            The type of classification to use. Can be either:

            - If 'danger', check if samples are in danger,
            - If 'noise', check if samples are noise.

        Returns
        -------
        output : ndarray, shape (n_samples, )
            A boolean array where True refer to samples in danger or noise.

        """

        # Find the NN for each samples
        # Exclude the sample itself
        x = self.nearest_neighbour_.kneighbors(samples,
                                               return_distance=False)[:, 1:]

        # Count how many NN belong to the minority class
        # Find the class corresponding to the label in x
        nn_label = (y[x] != self.min_c_).astype(int)
        # Compute the number of majority samples in the NN
        n_maj = np.sum(nn_label, axis=1)

        if kind == 'danger':
            # Samples are in danger for m/2 <= m' < m
            return np.bitwise_and(n_maj >= float(self.m) / 2.,
                                  n_maj < self.m)
        elif kind == 'noise':
            # Samples are noise for m = m'
            return n_maj == self.m
        else:
            raise ValueError('Unknown string for parameter kind.')

    def _make_samples(self, X, y_type, nn_data, nn_num, n_samples,
                     step_size=1.):
        """A support function that returns artificial samples constructed along
        the line connecting nearest neighbours.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Points from which the points will be created.

        y_type : str or int
            The minority target value, just so the function can return the
            target values for the synthetic variables with correct length in
            a clear format.

        nn_data : ndarray, shape (n_samples_all, n_features)
            Data set carrying all the neighbours to be used

        nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
            The nearest neighbours of each sample in nn_data.

        n_samples : int
            The number of samples to generate.

        step_size : float, optional (default=1.)
            The step size to create samples.

        Returns
        -------
        X_new : ndarray, shape (n_samples_new, n_features)
            Synthetically generated samples.

        y_new : ndarray, shape (n_samples_new, )
            Target values for synthetic samples.

        """

        # Check the consistency of X
        X = check_array(X)

        # A matrix to store the synthetic samples
        X_new = np.zeros((n_samples, X.shape[1]))

        # Set seeds
        np.random.seed(self.rs_)
        seeds = np.random.randint(low=0,
                                  high=100*len(nn_num.flatten()),
                                  size=n_samples)

        # Randomly pick samples to construct neighbours from
        np.random.seed(self.rs_)
        samples = np.random.randint(low=0,
                                    high=len(nn_num.flatten()),
                                    size=n_samples)

        # Loop over the NN matrix and create new samples
        for i, n in enumerate(samples):
            # NN lines relate to original sample, columns to its
            # nearest neighbours
            row, col = divmod(n, nn_num.shape[1])

            # Take a step of random size (0,1) in the direction of the
            # n nearest neighbours
            np.random.seed(seeds[i])
            step = step_size * np.random.uniform()

            # Construct synthetic sample
            X_new[i] = X[row] - step * (X[row] -
                                        nn_data[nn_num[row, col]])

        # The returned target vector is simply a repetition of the
        # minority label
        y_new = np.array([y_type] * len(X_new))

        if self.verbose:
            print("Generated {} new samples ...".format(len(X_new)))

        return X_new, y_new

    def transform(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(SMOTE, self).transform(X, y)

        # Define the number of sample to create
        # We handle only the two-class problem for the moment.
        if self.ratio_ == 'auto':
            num_samples = (self.stats_c_[self.maj_c_] -
                           self.stats_c_[self.min_c_])
        else:
            num_samples = ((self.ratio_ * self.stats_c_[self.maj_c_]) -
                           self.stats_c_[self.min_c_])

        # Start by separating minority class features and target values.
        X_min = X[y == self.min_c_]

        # If regular SMOTE is to be performed
        if self.kind == 'regular':

            # Print if verbose is true#
            if self.verbose:
                print('Finding the {} nearest neighbours...'.format(self.k))

            # Look for k-th nearest neighbours, excluding, of course, the
            # point itself.
            self.nearest_neighbour_.fit(X_min)

            # Matrix with k-th nearest neighbours indexes for each minority
            # element.
            nns = self.nearest_neighbour_.kneighbors(
                X_min,
                return_distance=False)[:, 1:]

            # Print status if verbose is true
            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # --- Generating synthetic samples
            # Use static method make_samples to generate minority samples
            X_new, y_new = self._make_samples(X_min,
                                             self.min_c_,
                                             X_min,
                                             nns,
                                             num_samples,
                                             1.0)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            X_resampled = np.concatenate((X, X_new), axis=0)
            y_resampled = np.concatenate((y, y_new), axis=0)

            return X_resampled, y_resampled

        if self.kind == 'borderline1' or self.kind == 'borderline2':

            if self.verbose:
                print("Finding the {} nearest neighbours...".format(self.m))

            # Find the NNs for all samples in the data set.
            self.nearest_neighbour_.fit(X)

            if self.verbose:
                print("done!")

            # Boolean array with True for minority samples in danger
            danger_index = self._in_danger_noise(X_min, y, kind='danger')

            # If all minority samples are safe, return the original data set.
            if not any(danger_index):
                if self.verbose:
                    print('There are no samples in danger. No borderline '
                          'synthetic samples created.')

                # All are safe, nothing to be done here.
                return X, y

            # If we got here, it is because some samples are in danger; we need to
            # find the NNs among the minority class to create the new synthetic
            # samples.
            #
            # We start by changing the number of NNs to consider from m + 1
            # to k + 1
            self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour_.fit(X_min)

            # nns...#
            nns = self.nearest_neighbour_.kneighbors(
                X_min[danger_index],
                return_distance=False)[:, 1:]

            # B1 and B2 types diverge here!!!
            if self.kind == 'borderline1':
                # Create synthetic samples for borderline points.
                X_new, y_new = self._make_samples(X_min[danger_index],
                                                 self.min_c_,
                                                 X_min,
                                                 nns,
                                                 num_samples)

                # Concatenate the newly generated samples to the original
                # dataset
                X_resampled = np.concatenate((X, X_new), axis=0)
                y_resampled = np.concatenate((y, y_new), axis=0)

                # Reset the k-neighbours to m+1 neighbours
                self.nearest_neighbour_.set_params(**{'n_neighbors': self.m+1})

                return X_resampled, y_resampled

            else:
                # Split the number of synthetic samples between only minority
                # (type 1), or minority and majority (with reduced step size)
                # (type 2).
                np.random.seed(self.rs_)

                # The fraction is sampled from a beta distribution centered
                # around 0.5 with variance ~0.01
                fractions = betavariate(alpha=10, beta=10)

                # Only minority
                X_new_1, y_new_1 = self._make_samples(X_min[danger_index],
                                                     self.min_c_,
                                                     X_min,
                                                     nns,
                                                     int(fractions *
                                                         (num_samples + 1)),
                                                     step_size=1.)

                # Only majority with smaller step size
                X_new_2, y_new_2 = self._make_samples(X_min[danger_index],
                                                     self.min_c_,
                                                     X[y != self.min_c_],
                                                     nns,
                                                     int((1 - fractions) *
                                                         num_samples),
                                                     step_size=0.5)

                # Concatenate the newly generated samples to the original
                # data set
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)

                # Reset the k-neighbours to m+1 neighbours
                self.nearest_neighbour_.set_params(**{'n_neighbors': self.m+1})

                return X_resampled, y_resampled

        if self.kind == 'svm':
            # The SVM smote model fits a support vector machine
            # classifier to the data and uses the support vector to
            # provide a notion of boundary. Unlike regular smote, where
            # such notion relies on proportion of nearest neighbours
            # belonging to each class.

            # Fit SVM to the full data#
            self.svm_.fit(X, y)

            # Find the support vectors and their corresponding indexes
            support_index = self.svm_.support_[y[self.svm_.support_] ==
                                               self.min_c_]
            support_vector = X[support_index]

            # First, find the nn of all the samples to identify samples
            # in danger and noisy ones
            if self.verbose:
                print("Finding the {} nearest neighbours...".format(self.m))

            # As usual, fit a nearest neighbour model to the data
            self.nearest_neighbour_.fit(X)

            if self.verbose:
                print("done!")

            # Now, get rid of noisy support vectors

            noise_bool = self._in_danger_noise(support_vector, y, kind='noise')

            # Remove noisy support vectors
            support_vector = support_vector[np.logical_not(noise_bool)]
            danger_bool = self._in_danger_noise(support_vector, y,
                                               kind='danger')
            safety_bool = np.logical_not(danger_bool)

            if self.verbose:
                print("Out of {0} support vectors, {1} are noisy, "
                      "{2} are in danger "
                      "and {3} are safe.".format(support_vector.shape[0],
                                                 noise_bool.sum().astype(int),
                                                 danger_bool.sum().astype(int),
                                                 safety_bool.sum().astype(int)
                                                 ))

                # Proceed to find support vectors NNs among the minority class
                print("Finding the {} nearest neighbours...".format(self.k))

            self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour_.fit(X_min)

            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # Split the number of synthetic samples between interpolation and
            # extrapolation

            # The fraction are sampled from a beta distribution with mean
            # 0.5 and variance 0.01#
            np.random.seed(self.rs_)
            fractions = betavariate(alpha=10, beta=10)

            # Interpolate samples in danger
            if np.count_nonzero(danger_bool) > 0:
                nns = self.nearest_neighbour_.kneighbors(
                    support_vector[danger_bool],
                    return_distance=False)[:, 1:]

                X_new_1, y_new_1 = self._make_samples(
                    support_vector[danger_bool],
                    self.min_c_,
                    X_min,
                    nns,
                    int(fractions * (num_samples + 1)),
                    step_size=1.)

            # Extrapolate safe samples
            if np.count_nonzero(safety_bool) > 0:
                nns = self.nearest_neighbour_.kneighbors(
                    support_vector[safety_bool],
                    return_distance=False)[:, 1:]

                X_new_2, y_new_2 = self._make_samples(
                    support_vector[safety_bool],
                    self.min_c_,
                    X_min,
                    nns,
                    int((1 - fractions) * num_samples),
                    step_size=-self.out_step)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            if (np.count_nonzero(danger_bool) > 0 and
                    np.count_nonzero(safety_bool) > 0):
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)
            # not any support vectors in danger
            elif np.count_nonzero(danger_bool) == 0:
                X_resampled = np.concatenate((X, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_2), axis=0)
            # All the support vector in danger
            elif np.count_nonzero(safety_bool) == 0:
                X_resampled = np.concatenate((X, X_new_1), axis=0)
                y_resampled = np.concatenate((y, y_new_1), axis=0)

            # Reset the k-neighbours to m+1 neighbours
            self.nearest_neighbour_.set_params(**{'n_neighbors': self.m+1})

            return X_resampled, y_resampled
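
# ---------------------------------------------------------------------------
# Usage sketch for the SMOTE class above (an assumption based on the fit() and
# transform() methods shown, not part of the original listing): oversample an
# imbalanced two-class toy problem using the approximate (LSH Forest)
# neighbour search.  It presumes that the OverSampler base class, which is not
# shown here, fills in the class statistics during fit().
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=500, n_features=10,
                                     n_informative=5, weights=[0.9, 0.1],
                                     random_state=42)

smote_demo = SMOTE(ratio='auto', kind='regular', nn_method='approximate',
                   random_state=42, verbose=False)
smote_demo.fit(X_demo, y_demo)
X_res, y_res = smote_demo.transform(X_demo, y_demo)
# The minority class should now be roughly as frequent as the majority class.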
Exemplo n.º 31
0
# print indices
# print distances
# print data[indices]
# import code
# code.interact(local=locals())
##how do i handle edges??
img_w = 100
img_h = 100
result =  np.zeros((img_w,img_h,3), np.uint8)
for i in range(img_w):
    for j in range(0,k/2):
        result[i][j] = img[np.random.randint(img.shape[0])][np.random.randint(img.shape[1])]
        result[j][i] = img[np.random.randint(img.shape[0])][np.random.randint(img.shape[1])]


#fill in
for i in range(k/2,img_w-k/2):
    print('row ' + str(i) + ': {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))
    for j in range(k/2, img_h-k/2):
        neighborhood = result[(i-k/2):(i+k/2+1), (j-k/2):(j+k/2+1)]
        neighborhood = neighborhood.reshape(1,-1)
        distances, indices = lshf.kneighbors(neighborhood, n_neighbors=1)
        result[i][j] = data[indices[0][0]][(k/2*k/2*3):((k/2*k/2+1)*3)]
        # print indices
        # print distances
        # print data[indices]

#show
plt.imshow(result)
plt.show()
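
# ---------------------------------------------------------------------------
# Self-contained sketch of the patch-lookup step used above (toy data only;
# the original `img`, `data`, `lshf` and `k` are built in code that is not
# shown here): index flattened k x k x 3 patches with an LSHForest and, for a
# query neighbourhood, copy the centre pixel of the closest training patch.
import numpy as np
from sklearn.neighbors import LSHForest

k_patch = 5                                     # assumed patch size
rng = np.random.RandomState(0)
train_img = rng.randint(0, 256, size=(40, 40, 3)).astype(np.uint8)

patches, centres = [], []
for i in range(train_img.shape[0] - k_patch + 1):
    for j in range(train_img.shape[1] - k_patch + 1):
        patch = train_img[i:i + k_patch, j:j + k_patch]
        patches.append(patch.reshape(-1))
        centres.append(patch[k_patch // 2, k_patch // 2])
patches = np.asarray(patches, dtype=np.float32)

lshf_patch = LSHForest(n_estimators=10, n_candidates=100, random_state=0)
lshf_patch.fit(patches)

query_patch = rng.randint(0, 256, size=(k_patch, k_patch, 3))
query_patch = query_patch.reshape(1, -1).astype(np.float32)
_, nearest = lshf_patch.kneighbors(query_patch, n_neighbors=1)
centre_pixel = centres[nearest[0][0]]           # the value that would be written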
Exemplo n.º 32
0
class SMOTE(OverSampler):
    """Class to perform over-sampling using SMOTE.

    This object is an implementation of SMOTE - Synthetic Minority
    Over-sampling Technique, and the variations Borderline SMOTE 1, 2 and
    SVM-SMOTE.

    Parameters
    ----------
    ratio : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio will correspond to the number
        of samples in the minority class over the number of samples
        in the majority class.

    random_state : int or None, optional (default=None)
        Seed for random number generation.

    verbose : bool, optional (default=True)
        Whether or not to print information about the processing.

    k : int, optional (default=5)
        Number of nearest neighbours to use to construct synthetic samples.

    m : int, optional (default=10)
        Number of nearest neighbours to use to determine if a minority sample
        is in danger.

    out_step : float, optional (default=0.5)
        Step size when extrapolating.

    kind : str, optional (default='regular')
        The type of SMOTE algorithm to use one of the following options:
        'regular', 'borderline1', 'borderline2', 'svm'

    nn_method : str, optional (default='exact')
        The nearest neighbors method to use which can be either: 'approximate'
        or 'exact'. 'approximate' will use LSH Forest while 'exact' will be an
        exact search.

    Attributes
    ----------
    ratio_ : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio will correspond to the number
        of samples in the minority class over the number of samples
        in the majority class.

    rs_ : int or None, optional (default=None)
        Seed for random number generation.

    min_c_ : str or int
        The identifier of the minority class.

    max_c_ : str or int
        The identifier of the majority class.

    stats_c_ : dict of str/int : int
        A dictionary in which the number of occurrences of each class is
        reported.

    Notes
    -----
    See the original papers: [1]_, [2]_, [3]_ for more details.

    It does not support multiple classes automatically, but can be called
    multiple times.

    References
    ----------
    .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE:
       synthetic minority over-sampling technique," Journal of artificial
       intelligence research, 321-357, 2002.

    .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
       over-sampling method in imbalanced data sets learning," Advances in
       intelligent computing, 878-887, 2005.

    .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling for
       imbalanced data classification," International Journal of Knowledge
       Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2001.

    """

    def __init__(self, ratio='auto', random_state=None, verbose=True,
                 k=5, m=10, out_step=0.5, kind='regular', nn_method='exact',
                 n_jobs=-1, **kwargs):
        """Initialisation of SMOTE object.

        Parameters
        ----------
        ratio : str or float, optional (default='auto')
            If 'auto', the ratio will be defined automatically to balance
            the dataset. Otherwise, the ratio will correspond to the
            number of samples in the minority class over the number of
            samples in the majority class.

        random_state : int or None, optional (default=None)
            Seed for random number generation.

        verbose : bool, optional (default=True)
            Whether or not to print information about the
            processing.

        k : int, optional (default=5)
            Number of nearest neighbours to use to construct synthetic
            samples.

        m : int, optional (default=10)
            Number of nearest neighbours to use to determine if a minority
            sample is in danger.

        out_step : float, optional (default=0.5)
            Step size when extrapolating.

        kind : str, optional (default='regular')
            The type of SMOTE algorithm to use one of the following
            options: 'regular', 'borderline1', 'borderline2', 'svm'

        nn_method : str, optional (default='exact')
            The nearest neighbors method to use which can be either:
            'approximate' or 'exact'. 'approximate' will use LSH Forest while
            'exact' will be an exact search.

        n_jobs : int, optional (default=-1)
            Number of threads to run the algorithm when it is possible.

        """
        super(SMOTE, self).__init__(ratio=ratio,
                                    random_state=random_state,
                                    verbose=verbose)

        # Check the number of thread to use
        self.n_jobs = n_jobs

        # --- The type of smote
        # This object can perform regular smote over-sampling, borderline 1,
        # borderline 2 and svm smote. Since the algorithms are fairly simple
        # they share most methods.
        possible_kind = ('regular', 'borderline1', 'borderline2', 'svm')
        if kind in possible_kind:
            self.kind = kind
        else:
            raise ValueError('Unknown kind for SMOTE algorithm.')

        # --- Verbose
        # Control whether or not status and progress information should be printed
        self.verbose = verbose

        # --- Nearest Neighbours for synthetic samples
        # The smote algorithm uses the k-th nearest neighbours of a minority
        # sample to generate new synthetic samples.
        self.k = k

        # --- NN object
        # Import the NN object from scikit-learn library. Since in the smote
        # variations we must first find samples that are in danger, we
        # initialize the NN object differently depending on the method chosen
        if kind == 'regular':
            # Regular smote does not look for samples in danger, instead it
            # creates synthetic samples directly from the k-th nearest
            # neighbours with no filtering
            if nn_method == 'exact':
                self.nearest_neighbour_ = NearestNeighbors(n_neighbors=k + 1,
                                                           n_jobs=self.n_jobs)
            elif nn_method == 'approximate':
                self.nearest_neighbour_ = LSHForest(n_estimators=50,
                                                    n_candidates=500,
                                                    n_neighbors=k+1)
        else:
            # Borderline1, 2 and SVM variations of smote must first look for
            # samples that could be considered noise and samples that live
            # near the boundary between the classes. Therefore, before
            # creating synthetic samples from the k-th nns, it first looks
            # for m nearest neighbors to decide whether or not a sample is
            # noise or near the boundary.
            if nn_method == 'exact':
                self.nearest_neighbour_ = NearestNeighbors(n_neighbors=m + 1,
                                                           n_jobs=self.n_jobs)
            elif nn_method == 'approximate':
                self.nearest_neighbour_ = LSHForest(n_estimators=50,
                                                    n_candidates=500,
                                                    n_neighbors=m+1)

            # --- Nearest Neighbours for noise and boundary (in danger)
            # Before creating synthetic samples we must first decide if
            # a given entry is noise or in danger. We use m nns in this step
            self.m = m

        # --- SVM smote
        # Unlike the borderline variations, the SVM variation uses the support
        # vectors to decide which samples are in danger (near the boundary).
        # Additionally it also introduces extrapolation for samples that are
        # considered safe (far from boundary) and interpolation for samples
        # in danger (near the boundary). The level of extrapolation is
        # controlled by the out_step.
        if kind == 'svm':
            # Store extrapolation size
            self.out_step = out_step

            # Store SVM object with any parameters
            self.svm_ = SVC(**kwargs)

    def fit(self, X, y):
        """Find the classes statistics before to perform sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(SMOTE, self).fit(X, y)

        return self

    def in_danger_noise(self, samples, y, kind='danger'):
        """Estimate if a set of sample are in danger or not.

        Parameters
        ----------
        samples : ndarray, shape (n_samples, n_features)
            The samples to check if either they are in danger or not.

        y : ndarray, shape (n_samples, )
            The true label in order to check the neighbour labels.

        kind : str, optional (default='danger')
            The type of classification to use. Can be either:

            - If 'danger', check if samples are in danger,
            - If 'noise', check if samples are noise.

        Returns
        -------
        output : ndarray, shape (n_samples, )
            A boolean array where True refer to samples in danger or noise.

        """

        # Find the NN for each samples
        # Exclude the sample itself
        x = self.nearest_neighbour_.kneighbors(samples,
                                               return_distance=False)[:, 1:]

        # Count how many NN belong to the minority class
        # Find the class corresponding to the label in x
        nn_label = (y[x] != self.min_c_).astype(int)
        # Compute the number of majority samples in the NN
        n_maj = np.sum(nn_label, axis=1)

        if kind == 'danger':
            # Samples are in danger for m/2 <= m' < m
            return np.bitwise_and(n_maj >= float(self.m) / 2.,
                                  n_maj < self.m)
        elif kind == 'noise':
            # Samples are noise for m = m'
            return n_maj == self.m
        else:
            raise ValueError('Unknown string for parameter kind.')

    def make_samples(self, X, y_type, nn_data, nn_num, n_samples,
                     step_size=1.):
        """A support function that returns artificial samples constructed along
        the line connecting nearest neighbours.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Points from which the points will be created.

        y_type : str or int
            The minority target value, just so the function can return the
            target values for the synthetic variables with correct length in
            a clear format.

        nn_data : ndarray, shape (n_samples_all, n_features)
            Data set carrying all the neighbours to be used

        nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
            The nearest neighbours of each sample in nn_data.

        n_samples : int
            The number of samples to generate.

        step_size : float, optional (default=1.)
            The step size to create samples.

        Returns
        -------
        X_new : ndarray, shape (n_samples_new, n_features)
            Synthetically generated samples.

        y_new : ndarray, shape (n_samples_new, )
            Target values for synthetic samples.

        """

        # Check the consistency of X
        X = check_array(X)

        # A matrix to store the synthetic samples
        X_new = np.zeros((n_samples, X.shape[1]))

        # Set seeds
        np.random.seed(self.rs_)
        seeds = np.random.randint(low=0,
                                  high=100*len(nn_num.flatten()),
                                  size=n_samples)

        # Randomly pick samples to construct neighbours from
        np.random.seed(self.rs_)
        samples = np.random.randint(low=0,
                                    high=len(nn_num.flatten()),
                                    size=n_samples)

        # Loop over the NN matrix and create new samples
        for i, n in enumerate(samples):
            # NN lines relate to original sample, columns to its
            # nearest neighbours
            row, col = divmod(n, nn_num.shape[1])

            # Take a step of random size (0,1) in the direction of the
            # n nearest neighbours
            np.random.seed(seeds[i])
            step = step_size * np.random.uniform()

            # Construct synthetic sample
            X_new[i] = X[row] - step * (X[row] -
                                        nn_data[nn_num[row, col]])

        # The returned target vector is simply a repetition of the
        # minority label
        y_new = np.array([y_type] * len(X_new))

        if self.verbose:
            print("Generated {} new samples ...".format(len(X_new)))

        return X_new, y_new

    def transform(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """
        # Check the consistency of X and y
        X, y = check_X_y(X, y)

        # Call the parent function
        super(SMOTE, self).transform(X, y)

        # Define the number of sample to create
        # We handle only the two-class problem for the moment.
        if self.ratio_ == 'auto':
            num_samples = (self.stats_c_[self.maj_c_] -
                           self.stats_c_[self.min_c_])
        else:
            num_samples = ((self.ratio_ * self.stats_c_[self.maj_c_]) -
                           self.stats_c_[self.min_c_])

        # Start by separating minority class features and target values.
        X_min = X[y == self.min_c_]

        # If regular SMOTE is to be performed
        if self.kind == 'regular':

            # Print if verbose is true
            if self.verbose:
                print('Finding the {} nearest neighbours...'.format(self.k))

            # Look for the k nearest neighbours, excluding, of course, the
            # point itself.
            self.nearest_neighbour_.fit(X_min)

            # Matrix with the indexes of the k nearest neighbours of each
            # minority sample.
            nns = self.nearest_neighbour_.kneighbors(
                X_min,
                return_distance=False)[:, 1:]

            # Print status if verbose is true
            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # --- Generating synthetic samples
            # Use static method make_samples to generate minority samples
            X_new, y_new = self.make_samples(X_min,
                                             self.min_c_,
                                             X_min,
                                             nns,
                                             num_samples,
                                             1.0)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            X_resampled = np.concatenate((X, X_new), axis=0)
            y_resampled = np.concatenate((y, y_new), axis=0)

            return X_resampled, y_resampled

        if self.kind == 'borderline1' or self.kind == 'borderline2':

            if self.verbose:
                print("Finding the {} nearest neighbours...".format(self.m))

            # Find the NNs for all samples in the data set.
            self.nearest_neighbour_.fit(X)

            if self.verbose:
                print("done!")

            # Boolean array with True for minority samples in danger
            danger_index = self.in_danger_noise(X_min, y, kind='danger')

            # If all minority samples are safe, return the original data set.
            if not any(danger_index):
                if self.verbose:
                    print('There are no samples in danger. No borderline '
                          'synthetic samples created.')

                # All are safe, nothing to be done here.
                return X, y

            # If we got here, it is because some samples are in danger; we
            # need to find the NNs among the minority class to create the new
            # synthetic samples.
            #
            # We start by changing the number of NNs to consider from m + 1
            # to k + 1
            self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour_.fit(X_min)

            # Find the k nearest minority neighbours of the samples in danger
            nns = self.nearest_neighbour_.kneighbors(
                X_min[danger_index],
                return_distance=False)[:, 1:]

            # B1 and B2 types diverge here!!!
            if self.kind == 'borderline1':
                # Create synthetic samples for borderline points.
                X_new, y_new = self.make_samples(X_min[danger_index],
                                                 self.min_c_,
                                                 X_min,
                                                 nns,
                                                 num_samples)

                # Concatenate the newly generated samples to the original
                # dataset
                X_resampled = np.concatenate((X, X_new), axis=0)
                y_resampled = np.concatenate((y, y_new), axis=0)

                # Reset the k-neighbours to m+1 neighbours
                self.nearest_neighbour_.set_params(**{'n_neighbors': self.m+1})

                return X_resampled, y_resampled

            else:
                # Split the number of synthetic samples between only minority
                # (type 1), or minority and majority (with reduced step size)
                # (type 2).
                np.random.seed(self.rs_)

                # The fraction is sampled from a beta distribution centered
                # around 0.5 with variance ~0.01
                fractions = betavariate(alpha=10, beta=10)

                # Only minority
                X_new_1, y_new_1 = self.make_samples(X_min[danger_index],
                                                     self.min_c_,
                                                     X_min,
                                                     nns,
                                                     int(fractions *
                                                         (num_samples + 1)),
                                                     step_size=1.)

                # Only majority with smaller step size
                X_new_2, y_new_2 = self.make_samples(X_min[danger_index],
                                                     self.min_c_,
                                                     X[y != self.min_c_],
                                                     nns,
                                                     int((1 - fractions) *
                                                         num_samples),
                                                     step_size=0.5)

                # Concatenate the newly generated samples to the original
                # data set
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)

                # Reset the k-neighbours to m+1 neighbours
                self.nearest_neighbour_.set_params(**{'n_neighbors': self.m+1})

                return X_resampled, y_resampled

        if self.kind == 'svm':
            # The SVM SMOTE model fits a support vector machine
            # classifier to the data and uses the support vectors to
            # provide a notion of boundary, unlike regular SMOTE, where
            # that notion relies on the proportion of nearest neighbours
            # belonging to each class.

            # Fit the SVM to the full data
            self.svm_.fit(X, y)

            # Find the support vectors and their corresponding indexes
            support_index = self.svm_.support_[y[self.svm_.support_] ==
                                               self.min_c_]
            support_vector = X[support_index]

            # First, find the nn of all the samples to identify samples
            # in danger and noisy ones
            if self.verbose:
                print("Finding the {} nearest neighbours...".format(self.m))

            # As usual, fit a nearest neighbour model to the data
            self.nearest_neighbour_.fit(X)

            if self.verbose:
                print("done!")

            # Now, get rid of noisy support vectors

            noise_bool = self.in_danger_noise(support_vector, y, kind='noise')

            # Remove noisy support vectors
            support_vector = support_vector[np.logical_not(noise_bool)]
            danger_bool = self.in_danger_noise(support_vector, y,
                                               kind='danger')
            safety_bool = np.logical_not(danger_bool)

            if self.verbose:
                print("Out of {0} support vectors, {1} are noisy, "
                      "{2} are in danger "
                      "and {3} are safe.".format(support_vector.shape[0],
                                                 noise_bool.sum().astype(int),
                                                 danger_bool.sum().astype(int),
                                                 safety_bool.sum().astype(int)
                                                 ))

                # Proceed to find support vectors NNs among the minority class
                print("Finding the {} nearest neighbours...".format(self.k))

            self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour_.fit(X_min)

            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # Split the number of synthetic samples between interpolation and
            # extrapolation

            # The fraction is sampled from a beta distribution with mean
            # 0.5 and variance ~0.01
            np.random.seed(self.rs_)
            fractions = betavariate(alpha=10, beta=10)

            # Interpolate samples in danger
            if np.count_nonzero(danger_bool) > 0:
                nns = self.nearest_neighbour_.kneighbors(
                    support_vector[danger_bool],
                    return_distance=False)[:, 1:]

                X_new_1, y_new_1 = self.make_samples(
                    support_vector[danger_bool],
                    self.min_c_,
                    X_min,
                    nns,
                    int(fractions * (num_samples + 1)),
                    step_size=1.)

            # Extrapolate safe samples
            if np.count_nonzero(safety_bool) > 0:
                nns = self.nearest_neighbour_.kneighbors(
                    support_vector[safety_bool],
                    return_distance=False)[:, 1:]

                X_new_2, y_new_2 = self.make_samples(
                    support_vector[safety_bool],
                    self.min_c_,
                    X_min,
                    nns,
                    int((1 - fractions) * num_samples),
                    step_size=-self.out_step)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            if (np.count_nonzero(danger_bool) > 0 and
                    np.count_nonzero(safety_bool) > 0):
                X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0)
            # No support vectors in danger
            elif np.count_nonzero(danger_bool) == 0:
                X_resampled = np.concatenate((X, X_new_2), axis=0)
                y_resampled = np.concatenate((y, y_new_2), axis=0)
            # All support vectors are in danger
            elif np.count_nonzero(safety_bool) == 0:
                X_resampled = np.concatenate((X, X_new_1), axis=0)
                y_resampled = np.concatenate((y, y_new_1), axis=0)

            # Reset the k-neighbours to m+1 neighbours
            self.nearest_neighbour_.set_params(**{'n_neighbors': self.m+1})

            return X_resampled, y_resampled
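
# Before the next example: as a quick illustration of the interpolation step that
# make_samples implements above, here is a minimal standalone sketch. The toy data,
# k, and n_new are illustrative assumptions and are not taken from the example.
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Toy minority class (assumed data, for illustration only)
rng = np.random.RandomState(42)
X_min = rng.rand(20, 2)
k = 5

# Indexes of the k nearest neighbours of each minority sample (drop the point itself)
nn = NearestNeighbors(n_neighbors=k + 1).fit(X_min)
nns = nn.kneighbors(X_min, return_distance=False)[:, 1:]

# Pick random (sample, neighbour) pairs and interpolate with a random step in [0, 1)
n_new = 10
rows = rng.randint(0, X_min.shape[0], size=n_new)
cols = rng.randint(0, k, size=n_new)
steps = rng.uniform(size=(n_new, 1))
X_new = X_min[rows] + steps * (X_min[nns[rows, cols]] - X_min[rows])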
Exemplo n.º 33
0
session = open(top_dn + "/" + user_dn + "/" + session_fn, "w")

#some test strings
#tests = ["this is the first test", "this is the second test"]

testX = []
with open('sdd_t2/Examination/BMX_H_Doc-SDD.csv', 'r') as myfile:
    f_test = myfile.read().split("\n")

tests = f_test
for test in tests:
    testX.append(stringToCoordinates(test, dimension, DimensionDict))

n_neighbors = 10
distances, indices = lshf.kneighbors(testX, n_neighbors=n_neighbors)

base = Tk()
root = Frame(base)
root.pack()

x_index = IntVar()
v = IntVar()
uri = StringVar()
lab = StringVar()
label_contents = StringVar()
radio_contents = [StringVar() for i in range(n_neighbors)]

x_index.set(0)
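
# The snippet above assumes that lshf and stringToCoordinates were built earlier in
# the same script. As a hedged sketch of how such an index could be prepared, here is
# a standalone variant that swaps in a TfidfVectorizer for the string encoding; the
# corpus and parameters are placeholders, not the example's own values.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import LSHForest

# Placeholder corpus standing in for the CSV rows read above
train_strings = ["example record one", "example record two", "example record three"]
vectorizer = TfidfVectorizer()
trainX = vectorizer.fit_transform(train_strings)

# Approximate index over the vectorized training strings
lshf = LSHForest(n_estimators=20, n_candidates=200, random_state=42)
lshf.fit(trainX)

# Query with vectorized test strings, as the example does
testX = vectorizer.transform(["example query record"])
distances, indices = lshf.kneighbors(testX, n_neighbors=2)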

Exemplo n.º 34
0
    print('Done')

    estimateList = [
        1, 2, 3, 4, 5, 7, 10, 13, 15, 18, 20, 23, 26, 30, 33, 36, 40, 43, 46,
        50
    ]

    test_num = 1
    for estimateNum in estimateList:
        time_sum = 0.0
        recallSum = 0.0
        lshf_nytimes = LSHForest(n_estimators=estimateNum,
                                 random_state=42,
                                 n_candidates=500)
        lshf_nytimes.fit(trainDataset)
        startMillis = int(round(time.time() * 1000))
        distances1, indices1 = lshf_nytimes.kneighbors(queries,
                                                       n_neighbors=topk)
        indices = []
        # for i in range(len(indices1)):
        #     indices.append(set(indices1[i]).union(set(indices2[i])).union(set(indices3[i])))
        endMillis = int(round(time.time() * 1000))
        time_sum += (endMillis - startMillis)
        print(time_sum / number_of_queries)
        # print(indices3)
        # print(type(indices[0]))

        for (i, OneResult) in enumerate(indices1):
            recallSum += len(set(OneResult).intersection(set(
                groundTruth[i]))) / number_of_queries
        print("recall={0}".format(recallSum / topk))
class FaceRecognition():
    def __init__(self):
        self.unknown = ''
        self.same_person_num = 1
        self.has_save_pic_feature = []
        self.has_cal_dist = []
        self.NeighbourNum = 10
        self.all_pic_data_folder = '/data/liubo/face/self'
        self.other_dataset_para_add = 1
        self.n_neighbors = 5
        self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors)
        self.all_labels = []
        self.all_pic_feature = []
        self.same_pic_id = 2
        self.must_be_same_id = 1
        self.must_be_not_same_id = 0
        self.maybe_same_id = 3
        self.new_person_str = 'new_person_'
        self.current_new_person_id = self.find_current_new_person_id()
        self.must_same_str = '_Must_Same'
        self.maybe_same_str = '_Maybe_same'
        self.load_time = time.time()
        self.user_count = {}
        # Thresholds differ between models
        self.upper_threshold = upper_verif_threshold
        self.lower_threshold = lower_verif_threshold
        self.same_pic_threshold = same_pic_threshold
        self.pitch_threshold = 20
        self.yaw_threshold = 20
        self.roll_threshold = 20
        #  [(time, feature), ..., (time, feature)] : compute the similarity between the current picture and the previous 5 pictures based on time (skip the computation if the time gap is too large)
        self.nearest = deque(maxlen=nearest_num)
        self.trans_dic = {self.same_pic_id: 'same_pic', self.must_be_same_id: 'must_same_id',
                          self.must_be_not_same_id: 'must_not_same_id', self.maybe_same_id: 'maybe_same_id'}
        self.verification_same_person = 0


    def cal_nearest_sim(self, current_feature):
        nearest_sim_list = []
        try:
            length = len(self.nearest)
            for k in range(length):
                try:
                    person_name, pre_feature = self.nearest[k]
                    # Time is no longer considered; only picture similarity matters
                    this_sim = pw.cosine_similarity(np.reshape(np.asarray(pre_feature), (1, len(pre_feature))),
                                                    np.reshape(np.asarray(current_feature), (1, len(current_feature))))

                    nearest_sim_list.append((this_sim, verification_model.predict(this_sim), person_name))
                except:
                    traceback.print_exc()
                    continue
            return nearest_sim_list
        except:
            traceback.print_exc()
            return nearest_sim_list


    def find_current_new_person_id(self):
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')

        old_person_id = []
        person_list = os.listdir(self.all_pic_data_folder)
        for person in person_list:
            if person.startswith(self.new_person_str):
                tmp = person[len(self.new_person_str):].split('_')
                if len(tmp) > 0:
                    this_id = int(tmp[0])
                    old_person_id.append(this_id)
        if len(old_person_id) == 0:
            current_new_person_id = 0
        else:
            current_new_person_id = max(old_person_id) + 1
        log_file.write('\t'.join(map(str, ['current_new_person_id :', current_new_person_id]))+'\n')
        log_file.close()
        return current_new_person_id


    def extract_pic_feature(self, pic_data, batch_size=1, feature_dim=FEATURE_DIM):
        '''
            Extract features for a batch of pictures (used when processing loaded data).
            :param pic_data: picture data
            :param batch_size:
            :param feature_dim: output dimension of the model (VGG outputs 4096)
            :return:
        '''
        pic_feature = np.zeros(shape=(pic_data.shape[0], feature_dim))
        batch_num = pic_data.shape[0] // batch_size
        for index in range(batch_num):
            pic_feature[index*batch_size:(index+1)*batch_size, :] = \
                extract_feature_from_numpy(pic_data[index*batch_size:(index+1)*batch_size])
        if batch_num*batch_size < pic_data.shape[0]:
            pic_feature[batch_num*batch_size:, :] = \
                extract_feature_from_numpy(pic_data[batch_num*batch_size:])
        return pic_feature


    def load_all_data(self):
        # Read in all previously labelled data and index it with the LSH Forest so distances are easy to compute
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')

        train_data, train_label = load_train_data(self.all_pic_data_folder)
        if len(train_label) == 0:
            return
        pic_feature = self.extract_pic_feature(train_data)
        start = time.time()
        self.lshf.fit(pic_feature, train_label)
        self.all_pic_feature = list(pic_feature)
        self.all_labels = list(train_label)
        end = time.time()
        self.load_time = end
        self.user_count = Counter(self.all_labels)
        log_file.write('\t'.join(map(str, [self.user_count,
                                           'fit all data time :', (end - start)]))+'\n')
        log_file.close()


    def add_all_new_pic(self):
        '''
            Load every file added since the last load into the LSH Forest (it may be a brand-new person, or new pictures for an existing person).
            Walk the folder (self.all_pic_data_folder) and decide from each file's timestamp whether the picture needs to be added.
            Newly added pictures go through face detection first; if a face is detected, the detection result is used, otherwise the user's original picture is used.
        '''
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')

        start = time.time()
        person_list = os.listdir(self.all_pic_data_folder)
        add_num = 0
        for person in person_list:
            if self.must_same_str in person or self.maybe_same_str in person or self.new_person_str in person:
                continue
            person_path = os.path.join(self.all_pic_data_folder, person)
            if not os.path.isdir(person_path):
                continue
            pic_list = os.listdir(person_path)
            for pic in pic_list:
                pic_path = os.path.join(person_path, pic)
                last_modify_time = os.stat(pic_path).st_atime
                if last_modify_time > self.load_time:
                    # Call the local service
                    request = {
                        "label": person,
                        "request_type": 'add',
                        "one_pic_feature": pic_path
                    }
                    url = "http://127.0.0.1:%d/"%port
                    result = image_request(request, url)
                    try:
                        add_flag = json.loads(result)["add"]
                        if not add_flag:  # add failed
                            log_file.write('\t'.join(map(str, ['no add file :', pic_path]))+'\n')
                        else:
                            add_num += 1
                    except:
                        log_file.write('\t'.join(map(str, ['no add file :', pic_path]))+'\n')
                        traceback.print_exc()
                        continue
        end = time.time()
        if add_num > 0:
            self.load_time = end
            log_file.write('\t'.join(map(str, ['self.load_time', self.load_time]))+'\n')
            log_file.write('\t'.join(map(str, ['add pic num :', add_num,
                                               'Dynamic increase time :', (end - start)]))+'\n')
            log_file.close()
        else:
            log_file.close()


    def add_one_new_pic(self, pic_path, label):
        try:
            # The picture is already converted to the required size when read in
            im_feature = extract_feature_from_file(pic_path)
            self.add_one_pic(im_feature, label)
            return True
        except:
            traceback.print_exc()
            return False


    def add_one_pic(self, one_pic_feature, pic_label):
        '''
            Add one image's feature to the LSH Forest and append the corresponding label to self.all_labels.
            :param pic_feature: array, shape (1, 1024)
            :param pic_label: (1,)
            :return:
        '''
        self.lshf.partial_fit(one_pic_feature.reshape(1, FEATURE_DIM), pic_label)
        self.all_labels.append(pic_label)
        self.all_pic_feature.append(np.reshape(one_pic_feature, newshape=(1, one_pic_feature.size)))


    def find_k_neighbors_with_lsh(self, one_pic_feature):
        '''
            :param one_pic_feature: image feature
            :return: the neighbours' features, needed to compute the pairwise scores
        '''
        try:
            tmp = self.lshf.kneighbors(one_pic_feature.reshape(1, FEATURE_DIM), n_neighbors=self.n_neighbors, return_distance=True)
            neighbors_label = np.asarray(self.all_labels)[tmp[1][0]]
            neighbors_feature = np.asarray(self.all_pic_feature)[tmp[1][0]]
            pair_score_list = []
            cos_sim_list = []
            for index in range(len(neighbors_feature)):
                pair_score = pw.cosine_similarity(neighbors_feature[index].reshape(1, FEATURE_DIM),
                                     one_pic_feature.reshape(1, FEATURE_DIM))[0][0]
                cos_sim_list.append(pair_score)
                pair_score_list.append(verification_model.predict(pair_score))
            result = zip(cos_sim_list, pair_score_list, neighbors_label)
            # result = self.filter_result(result)
            # result.sort(key=lambda x:x[0], reverse=True)
            return result
        except:
            return None


    def filter_result(self, result):
        '''
            :param result: [(cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label)], sorted by cos_sim in descending order
            :return: this_id (Must_same, Must_not_same, May_same), this_label (person name)
        '''
        # For entries with the same score, drop the new_person ones
        tmp_dic = {}
        for element in result:
            this_score, this_same_person_result, this_label = element
            if this_score in tmp_dic:
                if self.new_person_str in this_label:
                    continue
                else:
                    tmp_dic[this_score] = element
            else:
                tmp_dic[this_score] = element
        result = tmp_dic.values()
        return result


    def evaluate_result(self, result):
        '''
            :param result: [(cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label)]
            :return: this_id (Must_same, Must_not_same, May_same), this_label (person name)
        '''
        for index, element in enumerate(result):
            this_score, this_same_person_result, this_label = element
            if this_same_person_result == self.verification_same_person and this_score > self.same_pic_threshold:
                return self.same_pic_id, this_label
            if this_same_person_result == self.verification_same_person and this_score > self.upper_threshold:
                return self.must_be_same_id, this_label
            if this_same_person_result == self.verification_same_person and this_score > self.lower_threshold:
                return self.maybe_same_id, this_label
        return self.must_be_not_same_id, ''


    def check_face_img(self, face_img, image_id):
        # Compute the pose angles
        '''
        :param face_img: matrix of the face
        :param image_id: picture id
        :return: whether to run recognition (False: do not recognize)
        '''
        # Pose check

        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')

        face_img_str = base64.b64encode(msgpack_numpy.dumps(face_img))
        request = {
            "request_type": 'check_pose',
            "face_img_str": face_img_str,
            "image_id": image_id,
        }
        url = "http://%s:%d/" % (check_ip, check_port)
        result = image_request(request, url)
        try:
            pose_predict = json.loads(result)["pose_predict"]
            if not pose_predict:  # request failed
                log_file.write('\t'.join(map(str, [image_id, 'pose filter request'])) + '\n')
                log_file.close()
                return False
            else:
                pose_predict = msgpack_numpy.loads(base64.b64decode(pose_predict))
                if pose_predict is None:
                    log_file.write('\t'.join(map(str, [image_id, 'pose filter detect'])) + '\n')
                    log_file.close()
                    return False
                pitch, yaw, roll = pose_predict[0]
                if math.fabs(pitch) < self.pitch_threshold and \
                        math.fabs(yaw) < self.yaw_threshold and \
                        math.fabs(roll) < self.roll_threshold:
                    log_file.close()
                    return True
                else:
                    log_file.write('\t'.join(map(str, [image_id, 'pose filter threshold'])) + '\n')
                    log_file.close()
                    return False
        except:
            traceback.print_exc()
            log_file.close()
            return False


    def recognize_online_cluster(self, image, image_id):
        '''
            :param image: recognize the received picture and add it to the LSH Forest; compute proba from the distances (different distances map to different accuracies, with thresholds derived from the existing dists);
                            use the preset thresholds to decide whether this is a newly appearing person, a person already known, or an uncertain match to a known person
            # Also collect statistics, to make it easy later on to compute filtering reasons and ratios, as well as recognition ratios (same, not_same, maybe_same)
            :return:
        '''
        start = time.time()
        need_add = False
        has_save_num = 0
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')
        log_file.write('\t'.join(map(str, ["receive image", image_id, time.time()])) + '\n')
        try:
            image = base64.decodestring(image)
            image = zlib.decompress(image)
            im = cv2.imdecode(np.fromstring(image, dtype=np.uint8), 1)
            time_slot = get_time_slot(image_id)
            if time_slot is None:
                time_slot = 'error'
            time_slot_dir = os.path.join(tmp_face_dir, time_slot)
            if not os.path.exists(time_slot_dir):
                os.makedirs(time_slot_dir)
            tmp_pic_path = os.path.join(time_slot_dir, image_id+'.jpg')
            cv2.imwrite(tmp_pic_path, im)
            blur_result = is_blur(im)
            blur_sign, blur_var = blur_result
            if blur_sign:
                log_file.write('\t'.join(map(str, ['stat', 'blur_filter', blur_var, image_id]))+'\n')
                log_file.close()
                return self.unknown, 1.0, self.has_save_pic_feature, need_add
            align_face_img = align_face(tmp_pic_path)
            if align_face_img is None:
                log_file.write('\t'.join(map(str, ['stat', 'detect_filter', blur_var, image_id])) + '\n')
                log_file.close()
                return self.unknown, 1.0, self.has_save_pic_feature, need_add
            else:
                # Recognize using the re-detected and aligned face
                im = align_face_img
            # Run blur detection again on the detected face
            blur_result = is_blur(im)
            blur_sign, blur_var = blur_result
            if blur_sign:
                log_file.write('\t'.join(map(str, ['stat', 'blur_filter', blur_var, image_id]))+'\n')
                log_file.close()
                return self.unknown, 1.0, self.has_save_pic_feature, need_add
            need_process = self.check_face_img(im, image_id)
            if not need_process:
                log_file.write('\t'.join(map(str, ['stat', 'pose_filter', blur_var, image_id])) + '\n')
                log_file.close()
                return self.unknown, 1.0, self.has_save_pic_feature, need_add
            im = cv2.resize(im, (PIC_SHAPE[1], PIC_SHAPE[2]), interpolation=cv2.INTER_LINEAR)
            im = im[:, :, ::-1]*1.0
            im = im - avg
            im = im.transpose((2, 0, 1))
            im = im[None, :]
        except:
            traceback.print_exc()
            return self.unknown, 1.0, self.has_save_pic_feature, need_add
        try:
            # Pipeline: find the closest pictures; compute prob; online clustering; add to the LSH Forest
            im_feature = extract_feature_from_numpy(im)
            try:
                # nearest_sim_list has the same format as dist_label_list, so the two lists can be merged and processed together (the time factor no longer matters)
                # After a name is recognized, put the name and the feature into self.nearest
                nearest_sim_list = self.cal_nearest_sim(current_feature=im_feature)
            except:
                traceback.print_exc()
                nearest_sim_list = []
            log_file.write('\t'.join(map(str, ['nearest_sim_list :', map(str, nearest_sim_list)])) + '\n')

            # Find the closest pictures --- use the LSH Forest to get the 10 nearest pictures, then compute the distance to each
            dist_label_list = self.find_k_neighbors_with_lsh(im_feature)
            dist_label_list.extend(nearest_sim_list)
            dist_label_list = self.filter_result(dist_label_list)
            dist_label_list.sort(key=lambda x: x[0], reverse=True)
            # Decide the result
            if dist_label_list is None:
                this_id = self.must_be_not_same_id
                this_label = self.new_person_str + str(self.current_new_person_id)
            else:
                # Compute prob --- derive the probability from the distances
                this_id, this_label = self.evaluate_result(dist_label_list)
            # Online clustering --- decide from dist whether to create a new person or add to an existing one
            log_file.write('\t'.join(map(str, ['stat', 'recognize_id', blur_var, this_id])) + '\n')
            if dist_label_list is not None and len(dist_label_list) > 0:
                log_file.write('\t'.join(map(str, ['dist_label_list :', map(str, dist_label_list)])) + '\n')
            need_save = False
            if this_id == self.same_pic_id:
                need_add = False
            elif this_id == self.must_be_same_id:
                need_add = False
                need_save = True
                this_person_folder = os.path.join(self.all_pic_data_folder, this_label+self.must_same_str)
            elif this_id == self.must_be_not_same_id:
                this_label = self.new_person_str + str(self.current_new_person_id)
                self.current_new_person_id += 1
                this_person_folder = os.path.join(self.all_pic_data_folder, this_label)
                need_add = True
                need_save = True
            elif this_id == self.maybe_same_id:
                this_person_folder = os.path.join(self.all_pic_data_folder, this_label+self.maybe_same_str)
                need_add = False  # do not add samples whose prob falls in the gray area; add in all other cases
                need_save = True
            else:
                log_file.write('\t'.join(map(str, ['error para :', this_id])) + '\n')
            if need_save:
                try:
                    if not os.path.exists(this_person_folder):
                        os.makedirs(this_person_folder)
                        os.chmod(this_person_folder, stat.S_IRWXG + stat.S_IRWXO + stat.S_IRWXU)
                    this_pic_name = os.path.join(this_person_folder, image_id+'.png')
                    imsave(this_pic_name, np.transpose(im[0], (1, 2, 0)))
                except:
                    traceback.print_exc()
                    return self.unknown, 1.0, has_save_num, False

            # 加入LSH Forest --- partial_fit
            if need_add:
                self.add_one_pic(im_feature, this_label)
                has_save_num += 1
                # A file name can be generated from label and image_id to decide whether to store the file [it can be stored both on the server and locally]
            if this_id == self.same_pic_id or this_id == self.must_be_not_same_id or this_id == self.must_be_same_id:
                end = time.time()
                log_file.write('\t'.join(map(str, ['stat recognize_time :', (end - start), 'this_id :', self.trans_dic.get(this_id)])) + '\n')
                log_file.close()
                return this_label.replace(self.must_same_str, ''), \
                       str(dist_label_list[0][0]), str(has_save_num), str(need_add)
            else:
                # Gray area: do not show the person name
                end = time.time()
                log_file.write('\t'.join(map(str, ['gray area recog time :',(end - start)])) + '\n')
                log_file.close()
                # return this_label.replace(self.maybe_same_str, ''), \
                #        str(dist_label_list[0][0]), str(has_save_num), str(need_add)
                return self.unknown, str(dist_label_list[0][0]), str(has_save_num), str(need_add)
        except:
            traceback.print_exc()
            log_file.close()
            return self.unknown, str(100.0), str(has_save_num), str(False)
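
# To recap the indexing pattern the FaceRecognition class relies on (fit once on the
# loaded features, partial_fit for each newly added picture, kneighbors for candidate
# lookup), here is a small standalone sketch; FEATURE_DIM, the random features, and
# the labels are illustrative assumptions, not the class's real data.
import numpy as np
from sklearn.neighbors import LSHForest

FEATURE_DIM = 256                      # assumed feature size, for illustration
rng = np.random.RandomState(0)

lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5)
all_labels = []

# Initial bulk load, as in load_all_data
initial_features = rng.rand(50, FEATURE_DIM)
initial_labels = ['person_%d' % (i % 5) for i in range(50)]
lshf.fit(initial_features)
all_labels.extend(initial_labels)

# Incremental update for one new picture, as in add_one_pic
new_feature = rng.rand(1, FEATURE_DIM)
lshf.partial_fit(new_feature, ['person_0'])
all_labels.append('person_0')

# Candidate lookup, as in find_k_neighbors_with_lsh
query = rng.rand(1, FEATURE_DIM)
_, idx = lshf.kneighbors(query, n_neighbors=5)
neighbour_labels = [all_labels[i] for i in idx[0]]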