Example #1
def test_partial_fit():
    """Checks whether inserting array is consitent with fitted data.

    `partial_fit` method should set all attribute values correctly.
    """
    n_samples = 12
    n_samples_partial_fit = 3
    n_features = 2
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    X_partial_fit = rng.rand(n_samples_partial_fit, n_features)

    lshf = LSHForest()

    # Test unfitted estimator
    lshf.partial_fit(X)
    assert_array_equal(X, lshf._fit_X)

    lshf.fit(X)

    # Insert wrong dimension
    assert_raises(ValueError, lshf.partial_fit,
                  np.random.randn(n_samples_partial_fit, n_features - 1))

    lshf.partial_fit(X_partial_fit)

    # _fit_X should grow by n_samples_partial_fit after insertion
    assert_equal(lshf._fit_X.shape[0], n_samples + n_samples_partial_fit)
    # original_indices_[0] should cover all inserted samples
    assert_equal(len(lshf.original_indices_[0]),
                 n_samples + n_samples_partial_fit)
    # each tree, e.g. trees_[1], should index all inserted samples
    assert_equal(len(lshf.trees_[1]), n_samples + n_samples_partial_fit)
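The same contract as a minimal standalone sketch (assuming an old scikit-learn where sklearn.neighbors.LSHForest still exists; it was deprecated in 0.19 and removed in 0.21):

import numpy as np
from sklearn.neighbors import LSHForest

rng = np.random.RandomState(42)
lshf = LSHForest().fit(rng.rand(12, 2))
print(lshf._fit_X.shape)          # (12, 2)
lshf.partial_fit(rng.rand(3, 2))  # append 3 more samples
print(lshf._fit_X.shape)          # (15, 2): grows by the inserted rows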
Example #3
def cal_acc(pack_file, stat_file, feature_dim):
    f = open(stat_file, 'w')
    f.write('train_pic_num'+'\t'+'person_name'+'\t'+'acc'+'\n')
    pic_num = range(1, max_person_num)
    for num in pic_num:
        all_train_data, all_train_label, all_valid_data, all_valid_label = split_train_valid(pack_file, train_pic_num=num, feature_dim=feature_dim)
        lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5)

        for index in range(len(all_train_data)):
            try:
                if all_train_data[index] is None:
                    continue
                lshf.partial_fit(all_train_data[index], all_train_label[index])
            except:
                traceback.print_exc()
                continue
        # Compute the accuracy for each person separately
        person_acc_dic = {}     # number of correct predictions per person
        person_all_dic = {}     # total number of predictions per person
        filter_num = 0
        all_num = 0
        for index in range(len(all_valid_data)):
            try:
                if all_valid_data[index] is None:
                    continue
                all_find_distance, all_find_index = lshf.kneighbors(all_valid_data[index], n_neighbors=5, return_distance=True)
                cos_sim = cosine_similarity(all_valid_data[index], all_train_data[all_find_index[0, 0]])
                label = all_train_label[all_find_index[0, 0]]
                # if cos_sim > sim_threshold:
                if True:  # similarity-threshold filter disabled
                    if label == all_valid_label[index]:
                        person_acc_dic[label] = person_acc_dic.get(label, 0) + 1
                        person_all_dic[label] = person_all_dic.get(label, 0) + 1
                    else:
                        person_all_dic[label] = person_all_dic.get(label, 0) + 1
                else:
                    filter_num += 1
                all_num += 1
            except:
                print all_valid_label[index]
                continue
        print 'train_num :', num, 'filter_rate: ', (filter_num * 1.0 / all_num)
        for person in person_all_dic:
            all_num = person_all_dic[person]
            right_num = person_acc_dic.get(person, 0)
            f.write('\t'.join(map(str, [num, person, (right_num * 1.0 /  all_num)]))+'\n')
Example #4
def cal_recall(pack_file, stat_file, feature_dim):
    # f_model = open('verf.txt', 'w')
    f = open(stat_file, 'w')
    f.write('train_pic_num'+'\t'+'person_name'+'\t'+'recall'+'\n')
    pic_num = range(1, max_person_num)
    for num in pic_num:
        all_train_data, all_train_label, all_valid_data, all_valid_label = split_train_valid(pack_file, train_pic_num=num, feature_dim=feature_dim)
        lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5)
        for index in range(len(all_train_data)):
            try:
                if all_train_data[index] is None:
                    continue
                lshf.partial_fit(all_train_data[index], all_train_label[index])
            except:
                continue
        # Compute the recall for each person separately
        person_find_dic = {}     # number of correctly recognized pictures per person
        person_all_dic = {}      # total number of pictures per person
        for index in range(len(all_valid_data)):
            try:
                if all_valid_data[index] is None:
                    continue
                all_find_distance, all_find_index = lshf.kneighbors(all_valid_data[index], n_neighbors=5, return_distance=True)
                cos_sim = cosine_similarity(all_valid_data[index], all_train_data[all_find_index[0, 0]])
                label = all_train_label[all_find_index[0, 0]]
                real_label = all_valid_label[index]
                # if cos_sim > sim_threshold:
                if True:  # similarity-threshold filter disabled
                    if label == real_label:
                        # f_model.write('0'+'\t'+str(cos_sim)+'\n')
                        person_find_dic[real_label] = person_find_dic.get(real_label, 0) + 1
                        person_all_dic[real_label] = person_all_dic.get(real_label, 0) + 1
                    else:
                        # f_model.write('1' + '\t' + str(cos_sim) + '\n')
                        person_all_dic[real_label] = person_all_dic.get(real_label, 0) + 1
            except:
                print all_valid_label[index]
                continue
        print 'train_num :', num
        for person in person_all_dic:
            all_num = person_all_dic[person]
            right_num = person_find_dic.get(person, 0)
            f.write('\t'.join(map(str, [num, person, (right_num * 1.0 /  all_num)]))+'\n')
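The two functions above differ mainly in which label keys the per-person counters: cal_acc buckets every prediction under the predicted label (per-person precision), while cal_recall buckets it under the true label (per-person recall). A minimal sketch of the distinction, using hypothetical (true, predicted) pairs:

pairs = [('alice', 'alice'), ('bob', 'alice'), ('bob', 'bob')]

precision, recall = {}, {}
for real, pred in pairs:
    hit = (real == pred)
    # precision: grouped by the predicted name (as in cal_acc)
    num, den = precision.get(pred, (0, 0))
    precision[pred] = (num + hit, den + 1)
    # recall: grouped by the true name (as in cal_recall)
    num, den = recall.get(real, (0, 0))
    recall[real] = (num + hit, den + 1)

print(precision)  # {'alice': (1, 2), 'bob': (1, 1)}
print(recall)     # {'alice': (1, 1), 'bob': (1, 2)}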
Example #5
def train_LSHForest(model, batch_size=1000, n_candidates=50, n_estimators=10):
    ''' Given a large word2vec or GloVe model, we need an efficient way
        to get a word back from its vector. Current methods rely on
        inefficient search algorithms.

        Args
        ----
        model : gensim.model
                pretrained word2vec model
        batch_size : int
        n_candidates : int
                       number of candidates for LSH to generate
        n_estimators : int
                       number of LSH trees in the forest

        Returns
        -------
        lshf : LSHForest
    '''
    lshf = LSHForest(n_candidates=n_candidates, n_estimators=n_estimators)
    for batch in grouper(model.index2word, batch_size):
        array = np.array([model[word] for word in batch])
        lshf.partial_fit(array)
    return lshf
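`grouper` is assumed by this example but not shown; the standard itertools grouper recipe fits this usage, with the caveat that it pads the final batch with a fill value the loop must skip. A sketch of that helper, plus the reverse lookup the docstring describes (both are assumptions, not part of the original):

import numpy as np
from itertools import zip_longest  # izip_longest on Python 2

def grouper(iterable, n, fillvalue=None):
    # itertools recipe: collect data into fixed-length chunks;
    # callers should skip fillvalue entries in the final batch,
    # e.g. np.array([model[w] for w in batch if w is not None])
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

def nearest_word(lshf, model, vector):
    # rows were inserted in model.index2word order, so a global row
    # index returned by kneighbors maps straight back to a word
    _, indices = lshf.kneighbors(np.asarray(vector).reshape(1, -1),
                                 n_neighbors=1)
    return model.index2word[indices[0][0]]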
Example #6
class FaceRecognition():
    def __init__(self):
        self.unknown = ''
        self.same_person_num = 1
        self.has_save_pic_feature = []
        self.has_cal_dist = []
        self.NeighbourNum = 10
        self.all_pic_data_folder = '/data/liubo/face/self'
        self.other_dataset_para_add = 1
        self.n_neighbors = 5
        self.lshf = LSHForest(n_estimators=20,
                              n_candidates=200,
                              n_neighbors=self.n_neighbors)
        self.all_labels = []
        self.all_pic_feature = []
        self.same_pic_id = 2
        self.must_be_same_id = 1
        self.must_be_not_same_id = 0
        self.maybe_same_id = 3
        self.new_person_str = 'new_person_'
        self.current_new_person_id = self.find_current_new_person_id()
        self.must_same_str = '_Must_Same'
        self.maybe_same_str = '_Maybe_same'
        self.load_time = time.time()
        self.user_count = {}
        # Thresholds differ from model to model
        self.upper_threshold = upper_verif_threshold
        self.lower_threshold = lower_verif_threshold
        self.same_pic_threshold = same_pic_threshold
        self.pitch_threshold = 20
        self.yaw_threshold = 20
        self.roll_threshold = 20
        #  [(time, feature), ..., (time, feature)]: compare the current picture's similarity with the previous 5 pictures, based on time (skip the comparison if the times differ too much)
        self.nearest = deque(maxlen=nearest_num)
        self.trans_dic = {
            self.same_pic_id: 'same_pic',
            self.must_be_same_id: 'must_same_id',
            self.must_be_not_same_id: 'must_not_same_id',
            self.maybe_same_id: 'maybe_same_id'
        }
        self.verification_same_person = 0

    def cal_nearest_sim(self, current_feature):
        nearest_sim_list = []
        try:
            length = len(self.nearest)
            for k in range(length):
                try:
                    person_name, pre_feature = self.nearest[k]
                    # Time is no longer considered; only picture similarity matters
                    this_sim = pw.cosine_similarity(
                        np.reshape(np.asarray(pre_feature),
                                   (1, len(pre_feature))),
                        np.reshape(np.asarray(current_feature),
                                   (1, len(current_feature))))

                    nearest_sim_list.append(
                        (this_sim, verification_model.predict(this_sim),
                         person_name))
                except:
                    traceback.print_exc()
                    continue
            return nearest_sim_list
        except:
            traceback.print_exc()
            return nearest_sim_list

    def find_current_new_person_id(self):
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a')

        old_person_id = []
        person_list = os.listdir(self.all_pic_data_folder)
        for person in person_list:
            if person.startswith(self.new_person_str):
                tmp = person[len(self.new_person_str):].split('_')
                if len(tmp) > 0:
                    this_id = int(tmp[0])
                    old_person_id.append(this_id)
        if len(old_person_id) == 0:
            current_new_person_id = 0
        else:
            current_new_person_id = max(old_person_id) + 1
        log_file.write('\t'.join(
            map(str, ['current_new_person_id :', current_new_person_id])) +
                       '\n')
        log_file.close()
        return current_new_person_id

    def extract_pic_feature(self,
                            pic_data,
                            batch_size=1,
                            feature_dim=FEATURE_DIM):
        '''
            Extract features for multiple pictures (used when loading the data)
            :param pic_data: picture data
            :param batch_size:
            :param feature_dim: model output dimension (VGG outputs 4096)
            :return:
        '''
        pic_feature = np.zeros(shape=(pic_data.shape[0], feature_dim))
        batch_num = pic_data.shape[0] // batch_size  # floor division: number of full batches
        for index in range(batch_num):
            pic_feature[index*batch_size:(index+1)*batch_size, :] = \
                extract_feature_from_numpy(pic_data[index*batch_size:(index+1)*batch_size])
        if batch_num * batch_size < pic_data.shape[0]:
            pic_feature[batch_num*batch_size:, :] = \
                extract_feature_from_numpy(pic_data[batch_num*batch_size:])
        return pic_feature

    def load_all_data(self):
        # Load all previously labeled data into the LSH Forest so distances are easy to compute
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a')

        train_data, train_label = load_train_data(self.all_pic_data_folder)
        if len(train_label) == 0:
            return
        pic_feature = self.extract_pic_feature(train_data)
        start = time.time()
        self.lshf.fit(pic_feature, train_label)
        self.all_pic_feature = list(pic_feature)
        self.all_labels = list(train_label)
        end = time.time()
        self.load_time = end
        self.user_count = Counter(self.all_labels)
        log_file.write('\t'.join(
            map(str, [self.user_count, 'fit all data time :',
                      (end - start)])) + '\n')
        log_file.close()

    def add_all_new_pic(self):
        '''
            Load every file added since the last load into the LSH Forest (it may be a brand-new person, or new pictures of an existing person).
            Walk the folder (self.all_pic_data_folder) and use each file's timestamp to decide whether the picture should be added.
            Newly added user pictures go through face detection first; if a face is detected the detection result is used, otherwise the user's original picture is used.
        '''
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a')

        start = time.time()
        person_list = os.listdir(self.all_pic_data_folder)
        add_num = 0
        for person in person_list:
            if self.must_same_str in person or self.maybe_same_str in person or self.new_person_str in person:
                continue
            person_path = os.path.join(self.all_pic_data_folder, person)
            if not os.path.isdir(person_path):
                continue
            pic_list = os.listdir(person_path)
            for pic in pic_list:
                pic_path = os.path.join(person_path, pic)
                last_modify_time = os.stat(pic_path).st_atime  # note: st_atime is the access time (st_mtime is the modification time)
                if last_modify_time > self.load_time:
                    # Request the local service
                    request = {
                        "label": person,
                        "request_type": 'add',
                        "one_pic_feature": pic_path
                    }
                    url = "http://127.0.0.1:%d/" % port
                    result = image_request(request, url)
                    try:
                        add_flag = json.loads(result)["add"]
                        if not add_flag:  # loading failed
                            log_file.write('\t'.join(
                                map(str, ['no add file :', pic_path])) + '\n')
                        else:
                            add_num += 1
                    except:
                        log_file.write(
                            '\t'.join(map(str, ['no add file :', pic_path])) +
                            '\n')
                        traceback.print_exc()
                        continue
                    add_num += 1
        end = time.time()
        if add_num > 0:
            self.load_time = end
            log_file.write(
                '\t'.join(map(str, ['self.load_time', self.load_time])) + '\n')
            log_file.write('\t'.join(
                map(str, [
                    'add pic num :', add_num, 'Dynamic increase time :',
                    (end - start)
                ])) + '\n')
            log_file.close()
        else:
            log_file.close()

    def add_one_new_pic(self, pic_path, label):
        try:
            # The data was already resized to the required shape when read in
            im_feature = extract_feature_from_file(pic_path)
            self.add_one_pic(im_feature, label)
            return True
        except:
            traceback.print_exc()
            return False

    def add_one_pic(self, one_pic_feature, pic_label):
        '''
            Add one picture's feature to the LSH Forest, and append the matching label to self.all_labels
            :param one_pic_feature: array, shape (1, 1024)
            :param pic_label: (1,)
            :return:
        '''
        self.lshf.partial_fit(one_pic_feature.reshape(1, FEATURE_DIM),
                              pic_label)
        self.all_labels.append(pic_label)
        self.all_pic_feature.append(
            np.reshape(one_pic_feature, newshape=(1, one_pic_feature.size)))

    def find_k_neighbors_with_lsh(self, one_pic_feature):
        '''
            :param one_pic_feature: the picture's feature
            :return: the neighbors' features, used for the pairwise computation
        '''
        try:
            tmp = self.lshf.kneighbors(one_pic_feature.reshape(1, FEATURE_DIM),
                                       n_neighbors=self.n_neighbors,
                                       return_distance=True)
            neighbors_label = np.asarray(self.all_labels)[tmp[1][0]]
            neighbors_feature = np.asarray(self.all_pic_feature)[tmp[1][0]]
            pair_score_list = []
            cos_sim_list = []
            for index in range(len(neighbors_feature)):
                pair_score = pw.cosine_similarity(
                    neighbors_feature[index].reshape(1, FEATURE_DIM),
                    one_pic_feature.reshape(1, FEATURE_DIM))[0][0]
                cos_sim_list.append(pair_score)
                pair_score_list.append(verification_model.predict(pair_score))
            result = zip(cos_sim_list, pair_score_list, neighbors_label)
            # result = self.filter_result(result)
            # result.sort(key=lambda x:x[0], reverse=True)
            return result
        except:
            return None

    def filter_result(self, result):
        '''
            :param result: [(cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label)] sorted by cos_sim in descending order
            :return: this_id (Must_same, Must_not_same, May_same), this_label (person name)
        '''
        # For tied scores, drop the new_person entries
        tmp_dic = {}
        for element in result:
            this_score, this_same_person_result, this_label = element
            if this_score in tmp_dic:
                if self.new_person_str in this_label:
                    continue
                else:
                    tmp_dic[this_score] = element
            else:
                tmp_dic[this_score] = element
        result = tmp_dic.values()
        return result

    def evaluate_result(self, result):
        '''
            :param result: [(cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label)]
            :return: this_id (Must_same, Must_not_same, May_same), this_label (person name)
        '''
        for index, element in enumerate(result):
            this_score, this_same_person_result, this_label = element
            if this_same_person_result == self.verification_same_person and this_score > self.same_pic_threshold:
                return self.same_pic_id, this_label
            if this_same_person_result == self.verification_same_person and this_score > self.upper_threshold:
                return self.must_be_same_id, this_label
            if this_same_person_result == self.verification_same_person and this_score > self.lower_threshold:
                return self.maybe_same_id, this_label
        return self.must_be_not_same_id, ''
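
    # A worked sketch of the tiering above, with hypothetical threshold values
    # (the real same_pic/upper/lower thresholds come from the module config):
    #   thresholds: same_pic=0.95, upper=0.85, lower=0.70
    #   cos_sim 0.97, verified same person -> same_pic_id         (same picture)
    #   cos_sim 0.88, verified same person -> must_be_same_id     (auto-accept)
    #   cos_sim 0.75, verified same person -> maybe_same_id       (gray area)
    #   anything else                      -> must_be_not_same_id (new person)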

    def check_face_img(self, face_img, image_id):
        # Compute the pose angles
        '''
        :param face_img: the face image matrix
        :param image_id: the picture id
        :return: whether to run recognition (False: do not recognize)
        '''
        # Pose check

        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a')

        face_img_str = base64.b64encode(msgpack_numpy.dumps(face_img))
        request = {
            "request_type": 'check_pose',
            "face_img_str": face_img_str,
            "image_id": image_id,
        }
        url = "http://%s:%d/" % (check_ip, check_port)
        result = image_request(request, url)
        try:
            pose_predict = json.loads(result)["pose_predict"]
            if not pose_predict:  # loading failed
                log_file.write(
                    '\t'.join(map(str, [image_id, 'pose filter request'])) +
                    '\n')
                log_file.close()
                return False
            else:
                pose_predict = msgpack_numpy.loads(
                    base64.b64decode(pose_predict))
                if pose_predict is None:
                    log_file.write(
                        '\t'.join(map(str, [image_id, 'pose filter detect'])) +
                        '\n')
                    log_file.close()
                    return False
                pitch, yaw, roll = pose_predict[0]
                if math.fabs(pitch) < self.pitch_threshold and \
                        math.fabs(yaw) < self.yaw_threshold and \
                        math.fabs(roll) < self.roll_threshold:
                    log_file.close()
                    return True
                else:
                    log_file.write('\t'.join(
                        map(str, [image_id, 'pose filter threshold'])) + '\n')
                    log_file.close()
                    return False
        except:
            traceback.print_exc()
            log_file.close()
            return False

    def recognize_online_cluster(self, image, image_id):
        '''
            :param image: recognize the received picture and add it to the LSH Forest; compute proba from the distance (different distances map to different accuracies; the threshold is derived from the known dists);
                            then use the preset thresholds to decide whether this is a newly seen person, definitely a known person, or only possibly a known person
            # also collects statistics, so filter reasons/ratios and recognition ratios (same, not_same, maybe_same) can be computed later
            :return:
        '''
        start = time.time()
        need_add = False
        has_save_num = 0
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a')
        log_file.write(
            '\t'.join(map(str, ["receive image", image_id,
                                time.time()])) + '\n')
        try:
            image = base64.decodestring(image)
            image = zlib.decompress(image)
            im = cv2.imdecode(np.fromstring(image, dtype=np.uint8), 1)
            time_slot = get_time_slot(image_id)
            if time_slot is None:
                time_slot = 'error'
            time_slot_dir = os.path.join(tmp_face_dir, time_slot)
            if not os.path.exists(time_slot_dir):
                os.makedirs(time_slot_dir)
            tmp_pic_path = os.path.join(time_slot_dir, image_id + '.jpg')
            cv2.imwrite(tmp_pic_path, im)
            blur_result = is_blur(im)
            blur_sign, blur_var = blur_result
            if blur_sign:
                log_file.write('\t'.join(
                    map(str, ['stat', 'blur_filter', blur_var, image_id])) +
                               '\n')
                log_file.close()
                return self.unknown, 1.0, self.has_save_pic_feature, need_add
            align_face_img = align_face(tmp_pic_path)
            if align_face_img is None:
                log_file.write('\t'.join(
                    map(str, ['stat', 'detect_filter', blur_var, image_id])) +
                               '\n')
                log_file.close()
                return self.unknown, 1.0, self.has_save_pic_feature, need_add
            else:
                # Recognize using the re-detected, aligned face
                im = align_face_img
            # Run the blur check again on the detected face
            blur_result = is_blur(im)
            blur_sign, blur_var = blur_result
            if blur_sign:
                log_file.write('\t'.join(
                    map(str, ['stat', 'blur_filter', blur_var, image_id])) +
                               '\n')
                log_file.close()
                return self.unknown, 1.0, self.has_save_pic_feature, need_add
            need_process = self.check_face_img(im, image_id)
            if not need_process:
                log_file.write('\t'.join(
                    map(str, ['stat', 'pose_filter', blur_var, image_id])) +
                               '\n')
                log_file.close()
                return self.unknown, 1.0, self.has_save_pic_feature, need_add
            im = cv2.resize(im, (PIC_SHAPE[1], PIC_SHAPE[2]),
                            interpolation=cv2.INTER_LINEAR)
            im = im[:, :, ::-1] * 1.0
            im = im - avg
            im = im.transpose((2, 0, 1))
            im = im[None, :]
        except:
            traceback.print_exc()
            return self.unknown, 1.0, self.has_save_pic_feature, need_add
        try:
            # Pipeline: find the nearest pictures; compute prob; online clustering; add to the LSH Forest
            im_feature = extract_feature_from_numpy(im)
            try:
                # nearest_sim_list has the same format as dist_label_list, so the two lists can be merged and evaluated together (this removes time from the computation)
                # after a person is recognized, (name, feature) is pushed into self.nearest
                nearest_sim_list = self.cal_nearest_sim(
                    current_feature=im_feature)
            except:
                traceback.print_exc()
                nearest_sim_list = []
            log_file.write('\t'.join(
                map(str, ['nearest_sim_list :',
                          map(str, nearest_sim_list)])) + '\n')

            # Find the nearest pictures --- use the LSH Forest to get the 10 nearest, then compute each distance
            dist_label_list = self.find_k_neighbors_with_lsh(im_feature)
            if dist_label_list is not None:
                # guard: find_k_neighbors_with_lsh returns None on failure
                dist_label_list.extend(nearest_sim_list)
                dist_label_list = self.filter_result(dist_label_list)
                dist_label_list.sort(key=lambda x: x[0], reverse=True)
            # Evaluate the candidates
            if dist_label_list is None:
                this_id = self.must_be_not_same_id
                this_label = self.new_person_str + str(
                    self.current_new_person_id)
            else:
                # Compute prob --- derive prob from the distance
                this_id, this_label = self.evaluate_result(dist_label_list)
            # Online clustering --- use dist to decide whether to create a new person or add to an existing one
            log_file.write('\t'.join(
                map(str, ['stat', 'recognize_id', blur_var, this_id])) + '\n')
            if dist_label_list is not None and len(dist_label_list) > 0:
                log_file.write('\t'.join(
                    map(str, ['dist_label_list :',
                              map(str, dist_label_list)])) + '\n')
            need_save = False
            if this_id == self.same_pic_id:
                need_add = False
            elif this_id == self.must_be_same_id:
                need_add = False
                need_save = True
                this_person_folder = os.path.join(
                    self.all_pic_data_folder, this_label + self.must_same_str)
            elif this_id == self.must_be_not_same_id:
                this_label = self.new_person_str + str(
                    self.current_new_person_id)
                self.current_new_person_id += 1
                this_person_folder = os.path.join(self.all_pic_data_folder,
                                                  this_label)
                need_add = True
                need_save = True
            elif this_id == self.maybe_same_id:
                this_person_folder = os.path.join(
                    self.all_pic_data_folder, this_label + self.maybe_same_str)
                need_add = False  # do not add when prob falls in the gray area; add in the other cases
                need_save = True
            else:
                log_file.write('\t'.join(map(str, ['error para :', this_id])) +
                               '\n')
            if need_save:
                try:
                    if not os.path.exists(this_person_folder):
                        os.makedirs(this_person_folder)
                        os.chmod(this_person_folder,
                                 stat.S_IRWXG + stat.S_IRWXO + stat.S_IRWXU)
                    this_pic_name = os.path.join(this_person_folder,
                                                 image_id + '.png')
                    imsave(this_pic_name, np.transpose(im[0], (1, 2, 0)))
                except:
                    traceback.print_exc()
                    return self.unknown, 1.0, has_save_num, False

            # Add to the LSH Forest --- partial_fit
            if need_add:
                self.add_one_pic(im_feature, this_label)
                has_save_num += 1
                # label and image_id determine the generated file name; decide whether to store the file [it can be stored on both the server and locally]
            if this_id == self.same_pic_id or this_id == self.must_be_not_same_id or this_id == self.must_be_same_id:
                end = time.time()
                log_file.write('\t'.join(
                    map(str, [
                        'stat recognize_time :', (end - start), 'this_id :',
                        self.trans_dic.get(this_id)
                    ])) + '\n')
                log_file.close()
                return this_label.replace(self.must_same_str, ''), \
                       str(dist_label_list[0][0]), str(has_save_num), str(need_add)
            else:
                # Gray area: do not show the person's name
                end = time.time()
                log_file.write('\t'.join(
                    map(str, ['gray area recog time :', (end - start)])) +
                               '\n')
                log_file.close()
                # return this_label.replace(self.maybe_same_str, ''), \
                #        str(dist_label_list[0][0]), str(has_save_num), str(need_add)
                return self.unknown, str(
                    dist_label_list[0][0]), str(has_save_num), str(need_add)
        except:
            traceback.print_exc()
            log_file.close()
            return self.unknown, str(100.0), str(has_save_num), str(False)
Example #7
class Search():
    def __init__(self,
                 model_type,
                 n_estimators=20,
                 n_candidates=200,
                 n_neighbors=10):
        self.lshf = LSHForest(n_estimators=n_estimators,
                              n_candidates=n_candidates,
                              n_neighbors=n_neighbors)

        if model_type == 'rgb_small':
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.small.rgb.deepid.model'
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.small.rgb.deepid.weight'
            self.part_func = None
            self.pic_shape = (50, 50, 3)
            self.feature_dim = 1024
        elif model_type == 'rgb_big':
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.big.rgb.deepid.weight'
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.big.rgb.deepid.model'
            self.part_func = None
            self.pic_shape = (128, 128, 3)
        elif model_type == 'rgb_small_right':
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.small.rgb.right_eye.deepid.weight'
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.small.rgb.right_eye.deepid.model'
            self.part_func = get_right_eye
            self.pic_shape = (50, 50, 3)
        elif model_type == 'rgb_small_left':
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.small.rgb.left_eye.deepid.weight'
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.small.rgb.left_eye.deepid.model'
            self.part_func = get_left_eye
            self.pic_shape = (50, 50, 3)
        elif model_type == 'rgb_small_nose':
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.all.rgb.nose.deepid.weight'
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.all.rgb.nose.deepid.model'
            self.part_func = get_nose
            self.pic_shape = (50, 50, 3)
        elif model_type == 'new_shape':
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.new_shape.rgb.deepid.model'
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.new_shape.rgb.deepid.weight'
            self.pic_shape = (156, 124, 3)
            self.feature_dim = 256
            self.part_func = None
        self.model, self.get_Conv_FeatureMap = load_deepid_model(
            self.deepid_model_file, self.deepid_weight_file)
        self.all_label = None
        self.all_feature_data = None

    def extract_pic_feature(self, pic_data, batch_size=128, feature_dim=1024):
        pic_feature = np.zeros(shape=(pic_data.shape[0], feature_dim))
        batch_num = pic_data.shape[0] // batch_size  # floor division: number of full batches
        for index in range(batch_num):
            # pic_feature[index*batch_size:(index+1)*batch_size, :] = \
            #     self.get_Conv_FeatureMap([pic_data[index*batch_size:(index+1)*batch_size], 0])[0]
            pic_feature[index*batch_size:(index+1)*batch_size, :] = \
                self.get_Conv_FeatureMap([np.transpose(pic_data[index*batch_size:(index+1)*batch_size], (0, 3, 1, 2)), 0])[0]

        if batch_num * batch_size < pic_data.shape[0]:
            # pic_feature[batch_num*batch_size:, :] = \
            #     self.get_Conv_FeatureMap([pic_data[batch_num*batch_size:], 0])[0]
            pic_feature[batch_num*batch_size:, :] = \
                self.get_Conv_FeatureMap([np.transpose(pic_data[batch_num*batch_size:], (0, 3, 1, 2)), 0])[0]
        return pic_feature

    def train_all_data(self,
                       vgg_folder,
                       person_num=100,
                       batch_person_num=20,
                       pic_num=10):
        # Add each person's first pic_num pictures to the LSH Forest; the remaining pictures are used to measure accuracy
        for index in range(0 + train_person_start_index,
                           person_num + train_person_start_index,
                           batch_person_num):
            if index == 0 + train_person_start_index:
                pic_data, all_label = load_batch_train_data(
                    vgg_folder,
                    shape=self.pic_shape,
                    start_person_index=index,
                    pic_num=pic_num,
                    batch_num=batch_person_num,
                    is_train=True,
                    part_func=self.part_func)
                all_data_feature = self.extract_pic_feature(
                    pic_data, feature_dim=self.feature_dim)
                self.lshf.fit(all_data_feature, all_label)

            else:
                pic_data, this_label = load_batch_train_data(
                    vgg_folder,
                    start_person_index=index,
                    pic_num=pic_num,
                    shape=self.pic_shape,
                    batch_num=batch_person_num,
                    is_train=True,
                    part_func=self.part_func)
                all_label = np.row_stack(
                    (np.reshape(all_label, (all_label.shape[0], 1)),
                     np.reshape(this_label, (this_label.shape[0], 1))))
                pic_data_feature = self.extract_pic_feature(
                    pic_data, feature_dim=self.feature_dim)
                all_data_feature = np.row_stack(
                    (all_data_feature, pic_data_feature))  # keep feature rows aligned with all_label
                self.lshf.partial_fit(pic_data_feature, this_label)
        self.all_label = all_label
        self.all_feature_data = all_data_feature
        logging.info(' '.join(
            map(str, ['self.all_label.shape :', self.all_label.shape])))

    def partical_fit(self, pic_data, this_label):
        '''
            Incremental training; the batch is small, so it is fed to the forest directly
        :param data:
        :param label:
        :return:
        '''
        pic_data_feature = self.extract_pic_feature(
            pic_data, feature_dim=self.feature_dim)
        self.lshf.partial_fit(pic_data_feature, this_label)
        self.all_label = np.row_stack(
            (np.reshape(self.all_label, (self.all_label.shape[0], 1)),
             np.reshape(this_label, (this_label.shape[0], 1))))

    def find_k_neighbors(self, pic_data):
        pic_data_feature = self.extract_pic_feature(
            pic_data, feature_dim=self.feature_dim)
        distances, indices = self.lshf.kneighbors(pic_data_feature,
                                                  n_neighbors=1)
        predict_label = self.all_label[indices][:, 0, 0]
        return predict_label

    def valid_model(self,
                    vgg_folder,
                    person_num=100,
                    batch_person_num=20,
                    pic_num=10,
                    topK_acc=1):
        # The first 50 pictures were added to the LSH Forest; the last 50 are used to measure accuracy
        right_num = 0
        wrong_num = 0
        clf = cPickle.load(open(clf_model_file, 'rb'))

        for index in range(0 + train_person_start_index,
                           person_num + train_person_start_index,
                           batch_person_num):
            pic_data, all_label = load_batch_train_data(
                vgg_folder,
                start_person_index=index,
                pic_num=pic_num,
                shape=self.pic_shape,
                batch_num=batch_person_num,
                is_train=False,
                part_func=self.part_func)

            pic_data_feature = self.extract_pic_feature(
                pic_data, feature_dim=self.feature_dim)
            distances, indices = self.lshf.kneighbors(pic_data_feature,
                                                      n_neighbors=10)
            train_data = self.all_feature_data[indices]
            predict_label = self.all_label[indices][:, 0, 0]
            for label_index in range(len(predict_label)):
                this_predict_data = np.abs(train_data[label_index] -
                                           pic_data_feature[label_index])  # |query - its 10 neighbors|
                this_result = clf.predict_proba(this_predict_data)
                print this_result
                # pdb.set_trace()
                if all_label[label_index] in self.all_label[indices][:, :, 0][
                        label_index][:topK_acc]:
                    right_num += 1
                else:
                    wrong_num += 1
        acc = right_num * 1.0 / (right_num + wrong_num)
        logging.info(' '.join(
            map(str, [
                'model_type :', model_type, 'person_num :', person_num,
                'pic_num :', pic_num, 'acc :', acc
            ])))
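A hedged usage sketch of this class; the variable names and call pattern are assumptions pieced together from the methods above, and Search still needs the model files configured in __init__:

# searcher = Search(model_type='rgb_small')
# searcher.train_all_data(vgg_folder, person_num=100, pic_num=10)  # initial fit
# searcher.partical_fit(new_pic_data, new_labels)                  # incremental insert
# predicted = searcher.find_k_neighbors(query_pic_data)            # 1-NN labels from the forest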
Example #9
'''
    LSHash (Locality-Sensitive Hashing)
https://blog.csdn.net/sinat_26917383/article/details/70243066
'''

from sklearn.neighbors import LSHForest


# X_train = [[5, 5, 99], [21, 5, 5], [1, 1, 1]]
# X_train1 = [ [8, 9, 1], [6, 10, 2]]
# X_test = [[9, 1, 6], [3, 1, 10], [7, 10, 3]]
X_train = [[1,1], [2,3], [3,2]]
X_test = [[3,3]]
lshf = LSHForest(random_state=42)
lshf.fit(X_train)
# lshf.partial_fit(X_train1)
distances, indices = lshf.kneighbors(X_test, n_neighbors=2)

print('distances',distances)
print('indices',indices)

X_train1 = [[1,1], [3,2], [3,3]]
lshf.partial_fit(X_train1)
distances1, indices1 = lshf.kneighbors(X_test, n_neighbors=2)
# lshf.partial_fit(X_test)
print('distances1',distances1)
print('indices1',indices1)
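Note on the indices: partial_fit appends the new rows after the rows already fitted, so the indices returned by kneighbors refer to the concatenated training set. A small continuation of the demo that maps them back to points:

all_points = X_train + X_train1   # rows in the order they were inserted
for dist, idx in zip(distances1[0], indices1[0]):
    print('neighbor index', idx, '->', all_points[idx], 'distance', dist)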
Example #10
class FaceRecognition():
    def __init__(self):
        self.unknown = ''
        self.same_person_num = 1
        self.has_cal_dist = []
        self.NeighbourNum = 10
        # If the admin loads pictures, put them in the given person's directory under all_pic_data_folder (the picture file and the feature file share the same name)
        self.all_pic_feature_data_folder = '/data/liubo/face/research_feature_self'  # the research institute's model stores features directly
        # Keeping the pictures makes it easy to review results later, display them in the frontend, and let the admin label them
        self.all_pic_data_folder = '/data/liubo/face/research_self'
        if not os.path.exists(self.all_pic_data_folder):
            os.makedirs(self.all_pic_data_folder)
        if not os.path.exists(self.all_pic_feature_data_folder):
            os.makedirs(self.all_pic_feature_data_folder)
        self.n_neighbors = 10
        self.lshf = LSHForest(n_estimators=20,
                              n_candidates=200,
                              n_neighbors=self.n_neighbors)
        self.all_labels = []
        self.all_pic_feature = []
        self.same_pic_id = 2
        self.must_be_same_id = 1
        self.must_be_not_same_id = 0
        self.maybe_same_id = 3
        self.new_person_str = 'new_person_'
        self.current_new_person_id = self.find_current_new_person_id()
        self.must_same_str = '_Must_Same'
        self.maybe_same_str = '_Maybe_same'
        self.load_time = time.time()
        self.user_count = {}
        self.upper_threshold = upper_verif_threshold
        self.lower_threshold = lower_verif_threshold
        self.same_pic_threshold = same_pic_threshold
        self.trans_dic = {
            self.same_pic_id: 'same_pic',
            self.must_be_same_id: 'must_same_id',
            self.must_be_not_same_id: 'must_not_same_id',
            self.maybe_same_id: 'maybe_same_id'
        }
        self.nearest = deque(maxlen=nearest_num)
        self.verification_same_person = 0

    def cal_nearest_sim(self, current_feature):
        nearest_sim_list = []
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a')

        try:
            length = len(self.nearest)
            for k in range(length):
                try:
                    person_name, pre_feature = self.nearest[k]
                    # Time is no longer considered; only picture similarity matters

                    this_sim = pw.cosine_similarity(
                        np.reshape(np.asarray(pre_feature),
                                   (1, len(pre_feature))),
                        np.reshape(np.asarray(current_feature),
                                   (1, len(current_feature))))
                    nearest_sim_list.append(
                        (this_sim, verification_model.predict(this_sim),
                         person_name))
                except:
                    log_file.write('cal_nearest_sim error' + '\n')
                    traceback.print_exc()
                    continue
            return nearest_sim_list
        except:
            traceback.print_exc()
            return nearest_sim_list

    def load_train_data(self, data_folder):
        # Read the picture features directly; return all features and labels
        all_pic_feature = []
        all_label = []
        person_list = os.listdir(data_folder)
        for person in person_list:
            if person == self.unknown or self.must_same_str in person or self.maybe_same_str in person:
                continue
            person_path = os.path.join(data_folder, person)
            pic_feature_list = os.listdir(person_path)
            for pic_feature_path in pic_feature_list:
                pic_feature = msgpack_numpy.load(
                    open(os.path.join(person_path, pic_feature_path), 'rb'))
                all_pic_feature.append(pic_feature)
                all_label.append(person)
        all_pic_feature = np.asarray(all_pic_feature)
        all_label = np.asarray(all_label)
        return all_pic_feature, all_label

    def find_current_new_person_id(self):
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a')

        old_person_id = []
        # This folder stores the original pictures
        person_list = os.listdir(self.all_pic_data_folder)
        for person in person_list:
            if person.startswith(self.new_person_str):
                tmp = person[len(self.new_person_str):].split('_')
                if len(tmp) > 0:
                    this_id = int(tmp[0])
                    old_person_id.append(this_id)
        if len(old_person_id) == 0:
            current_new_person_id = 0
        else:
            current_new_person_id = max(old_person_id) + 1
        log_file.write('\t'.join(
            map(str, ['current_new_person_id :', current_new_person_id])) +
                       '\n')
        log_file.close()
        return current_new_person_id

    def extract_pic_feature(self, pic_path):
        try:
            result = extract_feature_from_binary_data(open(pic_path, 'rb'))
            if result is None:
                return
            face_num, all_frames, all_feature = result
            biggest_face_index = find_big_face(all_frames)
            pic_frame = all_frames[biggest_face_index]
            pic_feature = all_feature[biggest_face_index]
            x, y, width, height = pic_frame
            face_pic = cv2.imread(pic_path)[y:y + height, x:x + width, :]  # rows span the height, columns the width
            return face_pic, pic_feature
        except:
            traceback.print_exc()
            return None

    def load_all_data(self):
        # Load all previously labeled data (the features are read directly) into the LSH Forest so distances are easy to compute
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a')
        try:
            all_pic_feature, all_label = self.load_train_data(
                self.all_pic_feature_data_folder)
            train_label = np.asarray(all_label)
            if len(all_pic_feature) == len(
                    train_label) and len(train_label) > 0:
                start = time.time()
                self.lshf.fit(all_pic_feature, train_label)
                self.all_pic_feature = list(all_pic_feature)
                self.all_labels = list(train_label)
                end = time.time()
                self.load_time = end
                self.user_count = Counter(self.all_labels)
                log_file.write('\t'.join(
                    map(str, [
                        self.user_count, 'fit all data time :', (end - start)
                    ])) + '\n')
                log_file.close()
        except:
            traceback.print_exc()
            log_file.close()
            return

    def save_pic_feature(self, pic_path, person_name):
        #  Generate features for files that already exist and save them to the given folder; used when the admin adds new pictures (after adding, extract the feature and save it there)
        person_pic_folder_path = os.path.join(self.all_pic_data_folder,
                                              person_name)
        person_feature_folder_path = os.path.join(
            self.all_pic_feature_data_folder, person_name)
        if not os.path.exists(person_pic_folder_path):
            os.makedirs(person_pic_folder_path)
        if not os.path.exists(person_feature_folder_path):
            os.makedirs(person_feature_folder_path)
        pic_name = os.path.split(pic_path)[-1]
        # feature file
        person_feature_path = os.path.join(person_feature_folder_path,
                                           pic_name)
        # face picture file
        person_pic_path = os.path.join(person_pic_folder_path, pic_name)
        result = extract_feature_from_binary_data(open(pic_path, 'rb'))
        if result is None:
            return
        face_num, all_frames, all_feature = result
        biggest_face_index = find_big_face(all_frames)
        pic_frame = all_frames[biggest_face_index]
        pic_feature = all_feature[biggest_face_index]
        x, y, width, height = pic_frame
        face_pic = cv2.imread(pic_path)[y:y + height, x:x + width, :]  # rows span the height, columns the width
        cv2.imwrite(person_pic_path, face_pic)
        msgpack_numpy.dump(pic_feature, open(person_feature_path, 'wb'))

    def add_all_new_pic(self):
        '''
            Load every file added since the last load into the LSH Forest (it may be a brand-new person, or new pictures of an existing person).
            Walk the folder (self.all_pic_feature_data_folder) and use each file's timestamp to decide whether its feature needs to be added.
            After the admin labels a picture, the system moves the face picture and the feature file together, so only the feature and its label need to be added to the LSH here.
        '''
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a')
        start = time.time()
        person_list = os.listdir(self.all_pic_data_folder)
        add_num = 0
        for person in person_list:
            if self.must_same_str in person or self.maybe_same_str in person or self.new_person_str in person:
                continue
            person_path = os.path.join(self.all_pic_data_folder, person)
            if not os.path.isdir(person_path):
                continue
            pic_list = os.listdir(person_path)
            for pic in pic_list:
                pic_path = os.path.join(person_path, pic)
                last_modify_time = os.stat(pic_path).st_atime  # note: st_atime is the access time (st_mtime is the modification time)
                if last_modify_time > self.load_time:
                    request = {
                        "label": person,
                        "request_type": 'add',
                        "one_pic_feature": pic_path
                    }
                    url = "http://127.0.0.1:%d/" % port
                    result = image_request(request, url)
                    try:
                        add_flag = json.loads(result)["add"]
                        if not add_flag:  # loading failed
                            log_file.write('\t'.join(
                                map(str, ['no add file :', pic_path])) + '\n')
                        else:
                            add_num += 1
                    except:
                        log_file.write(
                            '\t'.join(map(str, ['no add file :', pic_path])) +
                            '\n')
                        traceback.print_exc()
                        continue
                    add_num += 1
        end = time.time()
        if add_num > 0:
            self.load_time = end
            log_file.write(
                '\t'.join(map(str, ['self.load_time', self.load_time])) + '\n')
            log_file.write('\t'.join(
                map(str, [
                    'add pic num :', add_num, 'Dynamic increase time :',
                    (end - start)
                ])) + '\n')
            log_file.close()
        else:
            log_file.close()

    def add_one_new_pic(self, pic_path, label):
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a')
        try:
            # the input picture is already resized as required when it is read in
            result = self.extract_pic_feature(pic_path)
            if result is None:
                return False
            face_pic, pic_feature = result
            self.add_one_pic(pic_feature, label)
            pic_name = os.path.split(pic_path)[1]
            this_person_pic_folder = os.path.join(self.all_pic_data_folder,
                                                  label)
            this_person_feature_folder = os.path.join(
                self.all_pic_feature_data_folder, label)
            if not os.path.exists(this_person_pic_folder):
                os.makedirs(this_person_pic_folder)
            if not os.path.exists(this_person_feature_folder):
                os.makedirs(this_person_feature_folder)
            # store the picture's feature directly and save the image file alongside it
            this_pic_feature_name = os.path.join(this_person_feature_folder,
                                                 pic_name + '.p')
            msgpack_numpy.dump(pic_feature, open(this_pic_feature_name, 'wb'))
            this_pic_face_name = os.path.join(this_person_pic_folder,
                                              pic_name + '.jpg')
            cv2.imwrite(this_pic_face_name, face_pic)
            log_file.write(
                '\t'.join(map(str, [pic_path, this_pic_face_name])) + '\n')
            return True
        except:
            traceback.print_exc()
            return False

    def add_one_pic(self, one_pic_feature, pic_label):
        '''
            Add one picture's feature to the LSH Forest and append its label to
            self.all_labels.
            :param pic_feature: array, shape (1, 1024)
            :param pic_label: (1,)
            :return:
        '''
        one_pic_feature = np.asarray(one_pic_feature)
        self.lshf.partial_fit(one_pic_feature.reshape(1, FEATURE_DIM),
                              pic_label)
        self.all_labels.append(pic_label)
        self.all_pic_feature.append(
            np.reshape(one_pic_feature, newshape=(1, one_pic_feature.size)))

    def find_k_neighbors_with_lsh(self, one_pic_feature):
        '''
            :param one_pic_feature: image feature
            :return: the neighbours' features, needed for the pairwise computation
        '''
        try:
            one_pic_feature = np.asarray(one_pic_feature)
            tmp = self.lshf.kneighbors(one_pic_feature.reshape(1, FEATURE_DIM),
                                       n_neighbors=self.n_neighbors,
                                       return_distance=True)
            neighbors_label = np.asarray(self.all_labels)[tmp[1][0]]
            neighbors_feature = np.asarray(self.all_pic_feature)[tmp[1][0]]
            pair_score_list = []
            cos_sim_list = []
            for index in range(len(neighbors_feature)):
                pair_score = pw.cosine_similarity(
                    neighbors_feature[index].reshape(1, FEATURE_DIM),
                    one_pic_feature.reshape(1, FEATURE_DIM))[0][0]
                cos_sim_list.append(pair_score)
                pair_score_list.append(verification_model.predict(pair_score))
            result = zip(cos_sim_list, pair_score_list, neighbors_label)
            # result = self.filter_result(result)
            # result.sort(key=lambda x:x[0], reverse=True)
            return result
        except:
            return None
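    # Illustrative return value (names and numbers are assumptions, not from the
    # original code): [(0.93, 0, 'alice'), (0.61, 1, 'bob'), ...] -- each tuple is
    # the cosine similarity, verification_model's same/not-same prediction
    # (0 == same person here), and the neighbour's label.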

    def filter_result(self, result):
        '''
            :param result: [(cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label)] sorted by cos_sim, descending
            :return: this_id (Must_same, Must_not_same, May_same), this_label (person name)
        '''
        # when scores tie, drop the new_person entries
        tmp_dic = {}
        for element in result:
            try:
                this_score, this_same_person_result, this_label = element
                this_score = float(this_score)
                if this_score in tmp_dic:
                    if self.new_person_str in this_label:
                        continue
                    else:
                        tmp_dic[this_score] = element
                else:
                    tmp_dic[this_score] = element
            except:
                traceback.print_exc()
                continue
        result = tmp_dic.values()
        return result

    def evaluate_result(self, result):
        '''
            :param result: [(cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label)]
            :return: this_id (Must_same, Must_not_same, May_same), this_label (person name)
        '''
        for index, element in enumerate(result):
            this_score, this_same_person_result, this_label = element
            if this_same_person_result == self.verification_same_person and this_score > self.same_pic_threshold:
                return self.same_pic_id, this_label
            if this_same_person_result == self.verification_same_person and this_score > self.upper_threshold:
                return self.must_be_same_id, this_label
            if this_same_person_result == self.verification_same_person and this_score > self.lower_threshold:
                return self.maybe_same_id, this_label
        return self.must_be_not_same_id, ''
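    # Worked example (the numeric thresholds are illustrative assumptions): with
    # same_pic_threshold=0.95, upper_threshold=0.85, lower_threshold=0.70 and
    # verification_same_person=0, the element (0.88, 0, 'alice') fails the first
    # test but passes the second, so evaluate_result returns
    # (self.must_be_same_id, 'alice'); (0.60, 0, 'alice') falls through all three
    # checks and yields (self.must_be_not_same_id, '').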

    def recognize_online_cluster(self, image, image_id):
        '''
            :param image: recognize the received picture and add it to the LSH Forest;
                            compute proba from the distances (different distances map to different
                            accuracies; the threshold is derived from the observed dists), then use
                            the preset thresholds to decide whether this is a new person, a known
                            person, or an uncertain match
            :return:
        '''
        start = time.time()
        need_add = False
        need_save = False
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a')
        log_file.write(
            '\t'.join(map(str, ["receive image", image_id,
                                time.time()])) + '\n')
        feature_str = ''
        try:
            image = base64.decodestring(image)
            image = zlib.decompress(image)
            im = cv2.imdecode(np.fromstring(image, dtype=np.uint8), 1)
            log_file.write(
                '\t'.join(map(str, ['shape :', im.shape[0], im.shape[1]])) +
                '\n')
            # picture size filter
            if im.shape[0] < size_threshold or im.shape[1] < size_threshold:
                log_file.write('\t'.join(
                    map(str, [
                        'stat recognize_time :', (time.time() -
                                                  start), 'small_size'
                    ])) + '\n')
                log_file.close()
                return self.unknown, 1.0, feature_str, need_save
            # sharpness (blur) filter
            blur_sign, blur_var = is_blur(cv2.resize(im, (96, 96)))
            if blur_sign:
                log_file.write('\t'.join(
                    map(str, [
                        'stat recognize_time :',
                        (time.time() - start), 'blur_filter', blur_var
                    ])) + '\n')
                log_file.close()
                return self.unknown, 1.0, feature_str, need_save
            #  save the received picture
            # img_file = '/tmp/research_face/%s.jpg' %image_id
            time_slot = get_time_slot(image_id)
            if time_slot is None:
                time_slot = 'error'
            time_slot_dir = os.path.join(tmp_face_dir, time_slot)
            if not os.path.exists(time_slot_dir):
                os.makedirs(time_slot_dir)
            img_file = os.path.join(time_slot_dir, image_id + '.jpg')
            cv2.imwrite(img_file, im)
        except:
            traceback.print_exc()
            log_file.close()
            return self.unknown, 1.0, feature_str, need_save
        try:
            # pipeline: find the nearest pictures ; compute prob ; online clustering ; add to the LSH Forest
            result = self.extract_pic_feature(img_file)
            if result is None:
                log_file.write('\t'.join(
                    map(str, [
                        'stat not_find_face', 'time :', (time.time() - start)
                    ])) + '\n')
                log_file.close()
                return self.unknown, 1.0, feature_str, need_save
            face_pic, im_feature = result

            try:
                # nearest_sim_list has the same format as dist_label_list, so the two lists can be
                # merged and evaluated together (time no longer needs to be considered)
                # after a person is recognized, the name and feature are pushed into self.nearest
                nearest_sim_list = self.cal_nearest_sim(
                    current_feature=im_feature)
            except:
                traceback.print_exc()
                nearest_sim_list = []
            log_file.write('\t'.join(
                map(str, ['nearest_sim_list :',
                          map(str, nearest_sim_list)])) + '\n')
            feature_str = base64.b64encode(msgpack_numpy.dumps(im_feature))
            log_file.write('\t'.join(
                map(str, ['extract_feature_time :', (time.time() - start)])) +
                           '\n')
            # find the nearest pictures --- use the LSH Forest to fetch the closest 10 and
            # compute the distance to each

            tmp_list = self.find_k_neighbors_with_lsh(im_feature)
            nearest_sim_list.sort(key=lambda x: x[0], reverse=True)
            if tmp_list is not None:  # find_k_neighbors_with_lsh returns None on failure
                nearest_sim_list.extend(tmp_list)
            dist_label_list = nearest_sim_list[:]

            # evaluate the candidate list
            log_file.write('\t'.join(
                map(str, ['dist_label_list :',
                          map(str, dist_label_list)])) + '\n')
            if not dist_label_list:  # no neighbours at all
                this_id = self.must_be_not_same_id
                this_label = self.new_person_str + str(
                    self.current_new_person_id)
            else:
                # compute prob --- derive prob from the distance
                this_id, this_label = self.evaluate_result(dist_label_list)
            # regardless of the probability, push the newest picture into self.nearest
            self.nearest.append((this_label, im_feature))
            log_file.write(
                '\t'.join(map(str, ['self.nearest :',
                                    map(str, self.nearest)])) + '\n')
            # online clustering --- use the dist to decide between adding a new person and joining an existing one
            if this_id == self.same_pic_id:
                need_add = False
            elif this_id == self.must_be_same_id:
                need_add = False
                need_save = True
                this_person_pic_folder = os.path.join(
                    self.all_pic_data_folder, this_label + self.must_same_str)
                this_person_feature_folder = os.path.join(
                    self.all_pic_feature_data_folder,
                    this_label + self.must_same_str)
            elif this_id == self.must_be_not_same_id:
                this_label = self.new_person_str + str(
                    self.current_new_person_id)
                self.current_new_person_id += 1
                this_person_pic_folder = os.path.join(self.all_pic_data_folder,
                                                      this_label)
                this_person_feature_folder = os.path.join(
                    self.all_pic_feature_data_folder, this_label)
                need_add = True
                need_save = True
            elif this_id == self.maybe_same_id:
                this_person_pic_folder = os.path.join(
                    self.all_pic_data_folder, this_label + self.maybe_same_str)
                this_person_feature_folder = os.path.join(
                    self.all_pic_feature_data_folder,
                    this_label + self.maybe_same_str)
                need_add = False  # do not add when prob falls in the gray area; add in all other cases
                need_save = True
            else:
                log_file.write('\t'.join(map(str, ['error para :', this_id])) +
                               '\n')
            if need_save:
                try:
                    if not os.path.exists(this_person_pic_folder):
                        os.makedirs(this_person_pic_folder)
                    if not os.path.exists(this_person_feature_folder):
                        os.makedirs(this_person_feature_folder)
                    # store the picture's feature directly and save the image file alongside it
                    this_pic_feature_name = os.path.join(
                        this_person_feature_folder, image_id + '.p')
                    msgpack_numpy.dump(im_feature,
                                       open(this_pic_feature_name, 'wb'))
                    this_pic_face_name = os.path.join(this_person_pic_folder,
                                                      image_id + '.jpg')
                    cv2.imwrite(this_pic_face_name, face_pic)
                except:
                    traceback.print_exc()
                    return self.unknown, 1.0, feature_str, False
            # add to the LSH Forest --- partial_fit
            if need_add:
                self.add_one_pic(im_feature, this_label)
                # label and image_id determine the generated file name; decide whether to store the file [optionally both on the server and locally]
            if this_id == self.same_pic_id or this_id == self.must_be_not_same_id or this_id == self.must_be_same_id:
                end = time.time()
                log_file.write('\t'.join(
                    map(str, [
                        'stat recognize_time :', (end - start), 'this_id :',
                        self.trans_dic.get(this_id)
                    ])) + '\n')
                log_file.close()
                need_save = True
                return this_label.replace(self.must_same_str, ''), str(
                    dist_label_list[0][0]), str(feature_str), str(need_save)
            else:
                # gray area, do not show the person's name
                end = time.time()
                log_file.write(
                    '\t'.join(map(str, ['stat gray_area :', (end - start)])) +
                    '\n')
                log_file.close()
                return self.unknown, str(
                    dist_label_list[0][0]), str(feature_str), str(False)
        except:
            traceback.print_exc()
            log_file.close()
            return self.unknown, str(100.0), str(feature_str), str(False)
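
A minimal driver sketch for the class above (not part of the original example; the module name face_server, the class name, the image path and the image_id format are all assumptions):

import base64
import zlib

from face_server import FaceRecognition  # hypothetical module and class name

recognizer = FaceRecognition()
recognizer.load_all_data()  # index all previously labelled features (assumed entry point)
raw = open('query.jpg', 'rb').read()  # any small test image
payload = base64.encodestring(zlib.compress(raw))  # the method expects b64(zlib(jpeg))
label, score, feature_str, need_save = recognizer.recognize_online_cluster(
    payload, image_id='1468000000.0')
print label, score, need_save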
Exemplo n.º 11
import time
import numpy as np
from sklearn.datasets.samples_generator import make_blobs
from sklearn.neighbors import LSHForest
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import random

# these small literals are immediately overwritten by the random data below
X_train = [[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1], [6, 10, 2]]
X_test = [[9, 1, 6], [3, 1, 10], [7, 10, 3]]

dim = 3000

lshf = LSHForest(random_state=42, n_estimators=65, n_candidates=200, n_neighbors=10)

X_train = [[random.randint(0, 4) for k in range(dim)] for i in range(50)]

# the first partial_fit call on an unfitted forest behaves like fit;
# every later call appends the new row to the existing index
for j in range(1000):
    X_test = [[random.randint(0, 4) for k in range(dim)] for i in range(1)]
    lshf.partial_fit(X_test)
    if j % 50 == 0:
        print(j)

distances, indices = lshf.kneighbors(X_test, n_neighbors=33)
print(distances, indices)
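
A short cross-check sketch (not in the original): rebuild an index of the same kind of data and compare the approximate neighbours against brute force. all_rows, lshf2 and the sizes are illustrative assumptions.

all_rows = [[random.randint(0, 4) for k in range(dim)] for i in range(200)]
lshf2 = LSHForest(random_state=42, n_estimators=65, n_candidates=200, n_neighbors=10)
lshf2.partial_fit(all_rows)  # the first call on an unfitted forest acts like fit
nbrs = NearestNeighbors(n_neighbors=10, algorithm='brute').fit(all_rows)
query = [all_rows[0]]
_, exact_idx = nbrs.kneighbors(query)
_, approx_idx = lshf2.kneighbors(query, n_neighbors=10)
print('overlap@10:', len(set(exact_idx[0]) & set(approx_idx[0])))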
Exemplo n.º 12
class ClassifierLSHForest(ClassifierMixin, LSHForest):
    """ 
    KNeighborsClassifier with partial_fit method for online learning.
    Memory-based classifier. Wrapper around LSHForest.
    """
    def __init__(self,
                 n_estimators=10,
                 radius=1.0,
                 n_candidates=50,
                 n_neighbors=5,
                 min_hash_match=4,
                 radius_cutoff_ratio=0.9,
                 random_state=None,
                 class_weights=None):
        self.lshf_ = LSHForest(n_estimators=n_estimators,
                               radius=radius,
                               n_candidates=n_candidates,
                               n_neighbors=n_neighbors,
                               min_hash_match=min_hash_match,
                               radius_cutoff_ratio=radius_cutoff_ratio,
                               random_state=random_state)
        self.y_ = None
        self.classes_ = list()
        self.class_weights_ = class_weights

    def fit(self, X, y):
        self.y_ = y
        self.classes_ = np.unique(y).tolist()
        self.lshf_.fit(X)

        print 'fitted'
        return self

    def partial_fit(self, X, y, *args, **kwargs):
        if self.y_ is None:
            self.y_ = y
        else:
            self.y_ = np.concatenate((self.y_, y))
            print self.y_.shape

        for yi in y:
            if yi not in self.classes_:
                self.classes_.append(yi)

        self.lshf_.partial_fit(X)

        return self

    def _kernel(self, x):
        return np.exp(-x)

    def _get_class_weights(self):
        return compute_class_weight(self.class_weights_, self.classes_,
                                    self.y_)

    def _compute_weights(self, X):
        dists, neighbors = self.lshf_.kneighbors(X, return_distance=True)

        result = np.zeros((neighbors.shape[0], len(self.classes_)))
        for i in xrange(neighbors.shape[0]):
            for cl_index, cl in enumerate(self.classes_):
                result[i, cl_index] = self._kernel(
                    dists[i][self.y_[neighbors[i]] == cl]).sum()

        if self.class_weights_ is not None:
            result *= self._get_class_weights()

        return result

    def predict(self, X):
        weights = self._compute_weights(X)
        best = np.argmax(weights, axis=1)
        # map every argmax index back to its class label; the original loop
        # overwrote `result` with a single label on each iteration
        return np.asarray([self.classes_[i] for i in best])

    def predict_proba(self, X):
        weights = self._compute_weights(X)

        normalizer = weights.sum(axis=1)
        normalizer[normalizer == 0.0] = 1.0

        # normalize row-wise; the (n_samples,) normalizer needs an explicit
        # second axis to broadcast against the (n_samples, n_classes) weights
        weights /= normalizer[:, np.newaxis]

        return weights
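
A minimal usage sketch for the wrapper above (synthetic data; the shapes and parameter values are illustrative assumptions):

import numpy as np
from sklearn.datasets.samples_generator import make_blobs

X, y = make_blobs(n_samples=60, centers=3, n_features=16, random_state=0)
clf = ClassifierLSHForest(n_estimators=10, n_neighbors=5, random_state=0)
clf.fit(X[:40], y[:40])
clf.partial_fit(X[40:], y[40:])       # append new rows and labels online
print clf.predict(X[:3])              # three class labels
print clf.predict_proba(X[:3]).shape  # (3, n_classes)
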
Exemplo n.º 13
class FaceRecognition():
    def __init__(self):
        self.unknown = ''
        self.same_person_num = 1
        self.has_save_pic_feature = []
        self.has_cal_dist = []
        self.NeighbourNum = 10
        self.all_pic_data_folder = '/data/liubo/face/self'
        self.other_dataset_para_add = 1
        self.n_neighbors = 5
        self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors)
        self.all_labels = []
        self.all_pic_feature = []
        self.same_pic_id = 2
        self.must_be_same_id = 1
        self.must_be_not_same_id = 0
        self.maybe_same_id = 3
        self.new_person_str = 'new_person_'
        self.current_new_person_id = self.find_current_new_person_id()
        self.must_same_str = '_Must_Same'
        self.maybe_same_str = '_Maybe_same'
        self.load_time = time.time()
        self.user_count = {}
        # thresholds differ between models
        self.upper_threshold = upper_verif_threshold
        self.lower_threshold = lower_verif_threshold
        self.same_pic_threshold = same_pic_threshold
        self.pitch_threshold = 20
        self.yaw_threshold = 20
        self.roll_threshold = 20
        #  [(person_name, feature), ..., (person_name, feature)] : compare the current picture
        #  with the previous 5 pictures (comparisons are skipped when the time gap is too large)
        self.nearest = deque(maxlen=nearest_num)
        self.trans_dic = {self.same_pic_id: 'same_pic', self.must_be_same_id: 'must_same_id',
                          self.must_be_not_same_id: 'must_not_same_id', self.maybe_same_id: 'maybe_same_id'}
        self.verification_same_person = 0


    def cal_nearest_sim(self, current_feature):
        nearest_sim_list = []
        try:
            length = len(self.nearest)
            for k in range(length):
                try:
                    person_name, pre_feature = self.nearest[k]
                    # time is no longer considered, only picture similarity
                    this_sim = pw.cosine_similarity(np.reshape(np.asarray(pre_feature), (1, len(pre_feature))),
                                                    np.reshape(np.asarray(current_feature), (1, len(current_feature))))

                    nearest_sim_list.append((this_sim, verification_model.predict(this_sim), person_name))
                except:
                    traceback.print_exc()
                    continue
            return nearest_sim_list
        except:
            traceback.print_exc()
            return nearest_sim_list


    def find_current_new_person_id(self):
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')

        old_person_id = []
        person_list = os.listdir(self.all_pic_data_folder)
        for person in person_list:
            if person.startswith(self.new_person_str):
                tmp = person[len(self.new_person_str):].split('_')
                if len(tmp) > 0:
                    this_id = int(tmp[0])
                    old_person_id.append(this_id)
        if len(old_person_id) == 0:
            current_new_person_id = 0
        else:
            current_new_person_id = max(old_person_id) + 1
        log_file.write('\t'.join(map(str, ['current_new_person_id :', current_new_person_id]))+'\n')
        log_file.close()
        return current_new_person_id


    def extract_pic_feature(self, pic_data, batch_size=1, feature_dim=FEATURE_DIM):
        '''
            Extract features for a batch of pictures (used when loading data).
            :param pic_data: picture data
            :param batch_size:
            :param feature_dim: model output dimension (vgg outputs 4096)
            :return:
        '''
        pic_feature = np.zeros(shape=(pic_data.shape[0], feature_dim))
        batch_num = pic_data.shape[0] / batch_size
        for index in range(batch_num):
            pic_feature[index*batch_size:(index+1)*batch_size, :] = \
                extract_feature_from_numpy(pic_data[index*batch_size:(index+1)*batch_size])
        if batch_num*batch_size < pic_data.shape[0]:
            pic_feature[batch_num*batch_size:, :] = \
                extract_feature_from_numpy(pic_data[batch_num*batch_size:])
        return pic_feature
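    # Example of the split (numbers are illustrative): with pic_data.shape[0] == 7
    # and batch_size == 2, batch_num == 3 full batches cover rows 0..5 and the
    # final branch handles the leftover row 6.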


    def load_all_data(self):
        # read in all previously labelled data and keep it in the LSH Forest for distance queries
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')

        train_data, train_label = load_train_data(self.all_pic_data_folder)
        if len(train_label) == 0:
            log_file.close()
            return
        pic_feature = self.extract_pic_feature(train_data)
        start = time.time()
        self.lshf.fit(pic_feature, train_label)
        self.all_pic_feature = list(pic_feature)
        self.all_labels = list(train_label)
        end = time.time()
        self.load_time = end
        self.user_count = Counter(self.all_labels)
        log_file.write('\t'.join(map(str, [self.user_count,
                                           'fit all data time :', (end - start)]))+'\n')
        log_file.close()


    def add_all_new_pic(self):
        '''
            Load every file added since the last load into the LSH Forest (it may be a
            brand-new person, or new pictures for an existing person).
            Walk the folder (self.all_pic_data_folder) and use each file's timestamp to
            decide whether the picture needs to be added.
            Newly added pictures go through face detection first; if a face is found the
            detection result is used, otherwise the user's original picture is used.
        '''
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')

        start = time.time()
        person_list = os.listdir(self.all_pic_data_folder)
        add_num = 0
        for person in person_list:
            if self.must_same_str in person or self.maybe_same_str in person or self.new_person_str in person:
                continue
            person_path = os.path.join(self.all_pic_data_folder, person)
            if not os.path.isdir(person_path):
                continue
            pic_list = os.listdir(person_path)
            for pic in pic_list:
                pic_path = os.path.join(person_path, pic)
                last_modify_time = os.stat(pic_path).st_atime  # note: st_atime is the access time; st_mtime is the modification time
                if last_modify_time > self.load_time:
                    # call the local service
                    request = {
                        "label": person,
                        "request_type": 'add',
                        "one_pic_feature": pic_path
                    }
                    url = "http://127.0.0.1:%d/"%port
                    result = image_request(request, url)
                    try:
                        add_flag = json.loads(result)["add"]
                        if not add_flag:  # the add request failed
                            log_file.write('\t'.join(map(str, ['no add file :', pic_path]))+'\n')
                        else:
                            add_num += 1
                    except:
                        log_file.write('\t'.join(map(str, ['no add file :', pic_path]))+'\n')
                        traceback.print_exc()
                        continue
        end = time.time()
        if add_num > 0:
            self.load_time = end
            log_file.write('\t'.join(map(str, ['self.load_time', self.load_time]))+'\n')
            log_file.write('\t'.join(map(str, ['add pic num :', add_num,
                                               'Dynamic increase time :', (end - start)]))+'\n')
            log_file.close()
        else:
            log_file.close()


    def add_one_new_pic(self, pic_path, label):
        try:
            # the input picture is already resized as required when it is read in
            im_feature = extract_feature_from_file(pic_path)
            self.add_one_pic(im_feature, label)
            return True
        except:
            traceback.print_exc()
            return False


    def add_one_pic(self, one_pic_feature, pic_label):
        '''
            Add one picture's feature to the LSH Forest and append its label to
            self.all_labels.
            :param pic_feature: array, shape (1, 1024)
            :param pic_label: (1,)
            :return:
        '''
        self.lshf.partial_fit(one_pic_feature.reshape(1, FEATURE_DIM), pic_label)
        self.all_labels.append(pic_label)
        self.all_pic_feature.append(np.reshape(one_pic_feature, newshape=(1, one_pic_feature.size)))


    def find_k_neighbors_with_lsh(self, one_pic_feature):
        '''
            :param one_pic_feature: image feature
            :return: the neighbours' features, needed for the pairwise computation
        '''
        try:
            tmp = self.lshf.kneighbors(one_pic_feature.reshape(1, FEATURE_DIM), n_neighbors=self.n_neighbors, return_distance=True)
            neighbors_label = np.asarray(self.all_labels)[tmp[1][0]]
            neighbors_feature = np.asarray(self.all_pic_feature)[tmp[1][0]]
            pair_score_list = []
            cos_sim_list = []
            for index in range(len(neighbors_feature)):
                pair_score = pw.cosine_similarity(neighbors_feature[index].reshape(1, FEATURE_DIM),
                                     one_pic_feature.reshape(1, FEATURE_DIM))[0][0]
                cos_sim_list.append(pair_score)
                pair_score_list.append(verification_model.predict(pair_score))
            result = zip(cos_sim_list, pair_score_list, neighbors_label)
            # result = self.filter_result(result)
            # result.sort(key=lambda x:x[0], reverse=True)
            return result
        except:
            return None


    def filter_result(self, result):
        '''
            :param result: [(cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label)] sorted by cos_sim, descending
            :return: this_id (Must_same, Must_not_same, May_same), this_label (person name)
        '''
        # when scores tie, drop the new_person entries
        tmp_dic = {}
        for element in result:
            this_score, this_same_person_result, this_label = element
            if this_score in tmp_dic:
                if self.new_person_str in this_label:
                    continue
                else:
                    tmp_dic[this_score] = element
            else:
                tmp_dic[this_score] = element
        result = tmp_dic.values()
        return result


    def evaluate_result(self, result):
        '''
            :param result: [(cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label)]
            :return: this_id (Must_same, Must_not_same, May_same), this_label (person name)
        '''
        for index, element in enumerate(result):
            this_score, this_same_person_result, this_label = element
            if this_same_person_result == self.verification_same_person and this_score > self.same_pic_threshold:
                return self.same_pic_id, this_label
            if this_same_person_result == self.verification_same_person and this_score > self.upper_threshold:
                return self.must_be_same_id, this_label
            if this_same_person_result == self.verification_same_person and this_score > self.lower_threshold:
                return self.maybe_same_id, this_label
        return self.must_be_not_same_id, ''


    def check_face_img(self, face_img, image_id):
        # compute the pose angles
        '''
        :param face_img: the face image matrix
        :param image_id: picture id
        :return: whether to run recognition (False: skip recognition)
        '''
        # pose check

        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')

        face_img_str = base64.b64encode(msgpack_numpy.dumps(face_img))
        request = {
            "request_type": 'check_pose',
            "face_img_str": face_img_str,
            "image_id": image_id,
        }
        url = "http://%s:%d/" % (check_ip, check_port)
        result = image_request(request, url)
        try:
            pose_predict = json.loads(result)["pose_predict"]
            if not pose_predict:  # the request failed
                log_file.write('\t'.join(map(str, [image_id, 'pose filter request'])) + '\n')
                log_file.close()
                return False
            else:
                pose_predict = msgpack_numpy.loads(base64.b64decode(pose_predict))
                if pose_predict is None:
                    log_file.write('\t'.join(map(str, [image_id, 'pose filter detect'])) + '\n')
                    log_file.close()
                    return False
                pitch, yaw, roll = pose_predict[0]
                if math.fabs(pitch) < self.pitch_threshold and \
                        math.fabs(yaw) < self.yaw_threshold and \
                        math.fabs(roll) < self.roll_threshold:
                    log_file.close()
                    return True
                else:
                    log_file.write('\t'.join(map(str, [image_id, 'pose filter threshold'])) + '\n')
                    log_file.close()
                    return False
        except:
            traceback.print_exc()
            log_file.close()
            return False
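    # Assumed response from the pose-check service (inferred from the parsing
    # above, not documented in the original): a JSON body of the form
    # {"pose_predict": <base64-encoded msgpack of [(pitch, yaw, roll)]>},
    # where an empty value means the check failed.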


    def recognize_online_cluster(self, image, image_id):
        '''
            :param image: recognize the received picture and add it to the LSH Forest;
                            compute proba from the distances (different distances map to different
                            accuracies; the threshold is derived from the observed dists), then use
                            the preset thresholds to decide whether this is a new person, a known
                            person, or an uncertain match
            # also collects statistics, so filter reasons/ratios and recognition ratios
            # (same, not_same, maybe_same) can be computed later
            :return:
        '''
        start = time.time()
        need_add = False
        has_save_num = 0
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')
        log_file.write('\t'.join(map(str, ["receive image", image_id, time.time()])) + '\n')
        try:
            image = base64.decodestring(image)
            image = zlib.decompress(image)
            im = cv2.imdecode(np.fromstring(image, dtype=np.uint8), 1)
            time_slot = get_time_slot(image_id)
            if time_slot is None:
                time_slot = 'error'
            time_slot_dir = os.path.join(tmp_face_dir, time_slot)
            if not os.path.exists(time_slot_dir):
                os.makedirs(time_slot_dir)
            tmp_pic_path = os.path.join(time_slot_dir, image_id+'.jpg')
            cv2.imwrite(tmp_pic_path, im)
            blur_result = is_blur(im)
            blur_sign, blur_var = blur_result
            if blur_sign:
                log_file.write('\t'.join(map(str, ['stat', 'blur_filter', blur_var, image_id]))+'\n')
                log_file.close()
                return self.unknown, 1.0, self.has_save_pic_feature, need_add
            align_face_img = align_face(tmp_pic_path)
            if align_face_img is None:  # 'is None' avoids an elementwise comparison when an array is returned
                log_file.write('\t'.join(map(str, ['stat', 'detect_filter', blur_var, image_id])) + '\n')
                log_file.close()
                return self.unknown, 1.0, self.has_save_pic_feature, need_add
            else:
                # recognize on the re-detected, aligned face
                im = align_face_img
            # re-run the blur check on the detected face
            blur_result = is_blur(im)
            blur_sign, blur_var = blur_result
            if blur_sign:
                log_file.write('\t'.join(map(str, ['stat', 'blur_filter', blur_var, image_id]))+'\n')
                log_file.close()
                return self.unknown, 1.0, self.has_save_pic_feature, need_add
            need_process = self.check_face_img(im, image_id)
            if not need_process:
                log_file.write('\t'.join(map(str, ['stat', 'pose_filter', blur_var, image_id])) + '\n')
                log_file.close()
                return self.unknown, 1.0, self.has_save_pic_feature, need_add
            im = cv2.resize(im, (PIC_SHAPE[1], PIC_SHAPE[2]), interpolation=cv2.INTER_LINEAR)
            im = im[:, :, ::-1]*1.0
            im = im - avg
            im = im.transpose((2, 0, 1))
            im = im[None, :]
        except:
            traceback.print_exc()
            return self.unknown, 1.0, self.has_save_pic_feature, need_add
        try:
            # pipeline: find the nearest pictures ; compute prob ; online clustering ; add to the LSH Forest
            im_feature = extract_feature_from_numpy(im)
            try:
                # nearest_sim_list has the same format as dist_label_list, so the two lists can be
                # merged and evaluated together (time no longer needs to be considered)
                # after a person is recognized, the name and feature are pushed into self.nearest
                nearest_sim_list = self.cal_nearest_sim(current_feature=im_feature)
            except:
                traceback.print_exc()
                nearest_sim_list = []
            log_file.write('\t'.join(map(str, ['nearest_sim_list :', map(str, nearest_sim_list)])) + '\n')

            # find the nearest pictures --- use the LSH Forest to fetch the closest 10 and compute the distance to each
            dist_label_list = self.find_k_neighbors_with_lsh(im_feature)
            if dist_label_list is None:  # find_k_neighbors_with_lsh returns None on failure
                dist_label_list = []
            dist_label_list.extend(nearest_sim_list)
            dist_label_list = self.filter_result(dist_label_list)
            dist_label_list.sort(key=lambda x: x[0], reverse=True)
            # evaluate the candidate list
            if not dist_label_list:
                this_id = self.must_be_not_same_id
                this_label = self.new_person_str + str(self.current_new_person_id)
            else:
                # compute prob --- derive prob from the distance
                this_id, this_label = self.evaluate_result(dist_label_list)
            # online clustering --- use the dist to decide between adding a new person and joining an existing one
            log_file.write('\t'.join(map(str, ['stat', 'recognize_id', blur_var, this_id])) + '\n')
            if dist_label_list:
                log_file.write('\t'.join(map(str, ['dist_label_list :', map(str, dist_label_list)])) + '\n')
            need_save = False
            if this_id == self.same_pic_id:
                need_add = False
            elif this_id == self.must_be_same_id:
                need_add = False
                need_save = True
                this_person_folder = os.path.join(self.all_pic_data_folder, this_label+self.must_same_str)
            elif this_id == self.must_be_not_same_id:
                this_label = self.new_person_str + str(self.current_new_person_id)
                self.current_new_person_id += 1
                this_person_folder = os.path.join(self.all_pic_data_folder, this_label)
                need_add = True
                need_save = True
            elif this_id == self.maybe_same_id:
                this_person_folder = os.path.join(self.all_pic_data_folder, this_label+self.maybe_same_str)
                need_add = False  # do not add when prob falls in the gray area; add in all other cases
                need_save = True
            else:
                log_file.write('\t'.join(map(str, ['error para :', this_id])) + '\n')
            if need_save:
                try:
                    if not os.path.exists(this_person_folder):
                        os.makedirs(this_person_folder)
                        os.chmod(this_person_folder, stat.S_IRWXG + stat.S_IRWXO + stat.S_IRWXU)
                    this_pic_name = os.path.join(this_person_folder, image_id+'.png')
                    imsave(this_pic_name, np.transpose(im[0], (1, 2, 0)))
                except:
                    traceback.print_exc()
                    return self.unknown, 1.0, has_save_num, False

            # add to the LSH Forest --- partial_fit
            if need_add:
                self.add_one_pic(im_feature, this_label)
                has_save_num += 1
                # label and image_id determine the generated file name; decide whether to store the file [optionally both on the server and locally]
            if this_id == self.same_pic_id or this_id == self.must_be_not_same_id or this_id == self.must_be_same_id:
                end = time.time()
                log_file.write('\t'.join(map(str, ['stat recognize_time :', (end - start), 'this_id :', self.trans_dic.get(this_id)])) + '\n')
                log_file.close()
                return this_label.replace(self.must_same_str, ''), \
                       str(dist_label_list[0][0]), str(has_save_num), str(need_add)
            else:
                # gray area, do not show the person's name
                end = time.time()
                log_file.write('\t'.join(map(str, ['gray area recog time :', (end - start)])) + '\n')
                log_file.close()
                # return this_label.replace(self.maybe_same_str, ''), \
                #        str(dist_label_list[0][0]), str(has_save_num), str(need_add)
                return self.unknown, str(dist_label_list[0][0]), str(has_save_num), str(need_add)
        except:
            traceback.print_exc()
            log_file.close()
            return self.unknown, str(100.0), str(has_save_num), str(False)
Exemplo n.º 14
class Analysis():
    def __init__(self, conf):
        self.unknown = ''
        self.n_neighbors = 5
        self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors)
        self.all_labels = []
        self.all_pic_feature = []
        self.same_pic_id = 2
        self.must_be_same_id = 1
        self.must_be_not_same_id = 0
        self.maybe_same_id = 3
        self.new_person_str = 'new_person_'
        self.current_new_person_id = self.find_current_new_person_id()
        self.must_same_str = '_Must_Same'
        self.maybe_same_str = '_Maybe_same'
        self.user_count = {}
        self.nearest_num = 5
        # keep only pictures from the last 15 seconds
        self.nearest_time_threshold = 15
        self.feature_url = conf.feature_url
        # the last conv layer's dimension is no longer changed when the model is tuned later
        self.feature_dim = conf.feature_dim
        # these thresholds must be updated whenever the model is replaced
        self.same_pic_threshold = conf.same_pic_threshold
        self.upper_threshold = conf.upper_threshold
        self.lower_threshold = conf.lower_threshold
        self.pitch_threshold = 20
        self.yaw_threshold = 20
        self.roll_threshold = 20
        self.max_dist_threshold = 100
        #  [(label, feature, time), ..., (label, feature, time)]
        self.nearest = deque(maxlen=self.nearest_num)
        self.trans_dic = {self.must_be_same_id: 'must_same_id', self.same_pic_id: 'same_pic',
                self.must_be_not_same_id: 'not_same_id', self.maybe_same_id: 'maybe_same_id'}
        self.all_feature_label_file = conf.all_feature_label_file
        self.log_dir = conf.log_dir
        if not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir)
        self.tmp_jpg_file = 'tmp.jpg'
        self.model_label = conf.model_label


    def find_current_new_person_id(self):
        all_id_name = get_all_name()
        current_new_person_id = -1
        for id_name in all_id_name:
            name = id_name[1]
            if name.startswith(self.new_person_str):
                current_new_person_id = max(current_new_person_id, int(name.replace(self.new_person_str, '')))
        current_new_person_id = current_new_person_id + 1
        return current_new_person_id


    def cal_nearest_sim(self, current_time, current_feature):
        nearest_sim_list = []
        try:
            length = len(self.nearest)
            for k in range(length):
                this_label, pre_feature, pre_time = self.nearest[k]
                if current_time - pre_time > self.nearest_time_threshold:
                    continue
                this_sim = pw.cosine_similarity(pre_feature, current_feature)
                nearest_sim_list.append((this_sim, this_label))
            return nearest_sim_list
        except:
            traceback.print_exc()
        return nearest_sim_list


    def extract_pic_feature(self, face_array):
        '''
            # takes a half-body picture and returns the face picture (detection must be redone,
            # because the detection model may have been updated, which would hurt recognition)
            # used when pictures are added manually to the LSHForest
            # still calls the face-recognition interface and parses the returned feature
            :param face_array: face picture (numpy format)
            :return: face_frame, feature (numpy format)
        '''
        try:
            cv2.imwrite(self.tmp_jpg_file, face_array)
            result = requests.post(self.feature_url, open(self.tmp_jpg_file, 'rb').read())
            if result.status_code == 200:
                try:
                    content = result.content
                    tmp = content.split('\n')
                    if len(tmp) < 3:
                        return None, None
                    face_num = int(tmp[0].split(':')[1])
                    if face_num == 1:
                        frame = map(float, tmp[1].split(','))
                        feature = map(float, tmp[2].split(',')[:-1])
                        if np.sum(feature) == 0:
                            print 'filter'
                            return None, None
                        return frame, feature
                except:
                    traceback.print_exc()
                    return None, None
            else:
                return None, None
        except:
            traceback.print_exc()
            return None, None
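    # Assumed response body from self.feature_url (inferred from the parsing
    # above, not documented in the original), e.g. for one face:
    #   face_num:1
    #   12.0,34.0,96.0,96.0
    #   0.01,0.02,...,0.13,
    # i.e. a count line, then one frame line and one feature line (with a
    # trailing comma) per face.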



    def load_all_data(self):
        # read in all previously labelled data and keep it in the LSH Forest for distance queries
        # detection and recognition run on the half-body picture (post the picture, get content, parse content to obtain the feature)
        current_day = get_current_day()
        log_file = open(os.path.join(self.log_dir, current_day + '.txt'), 'a')
        if not os.path.exists(self.all_feature_label_file):
            return
        start = time.time()
        # fetch half-body pictures and person names from the database
        half_pic_name_list = get_all_annotate_half()
        for element in half_pic_name_list:
            image, name = element
            im = cv2.imdecode(np.fromstring(base64.decodestring(image), dtype=np.uint8), 1)
            tmp_1 = self.extract_pic_feature(im)
            if tmp_1 is None:
                continue
            face_frame, im_feature = tmp_1
            if im_feature is None or face_frame is None:
                continue
            if np.sum(im_feature) == 0:
                print im.shape, name, 'blur'
                continue
            print im.shape, name
            im_feature = list(im_feature)
            # type(im_feature): <type 'list'>; len(im_feature): 256
            this_label = name
            self.all_pic_feature.append(im_feature)
            self.all_labels.append(this_label)
            # reshape to (1, feature_dim); partial_fit expects a 2-D array
            self.lshf.partial_fit(np.reshape(im_feature, (1, self.feature_dim)), this_label)
        end = time.time()
        self.user_count = Counter(self.all_labels)
        current_time = get_current_time()
        log_file.write('\t'.join(map(str, [current_time, self.user_count, 'fit all data time :', (end - start)])) + '\n')
        log_file.close()


    def add_one_pic(self, one_pic_feature, pic_label):
        '''
            Add one picture's feature to the LSH Forest and append its label to
            self.all_labels.
            :param pic_feature: array, shape (1, 512)
            :param pic_label: (1,)
            :return:
        '''
        self.lshf.partial_fit(one_pic_feature.reshape(1, self.feature_dim), pic_label)
        self.all_labels.append(pic_label)
        self.all_pic_feature.append(np.reshape(one_pic_feature, newshape=(1, one_pic_feature.size)))


    def add_all_new_pic(self):
        '''
            Walk the database (adding every modified record to the LSHForest).
            Runs once per minute (avoids hammering the database without adding much latency).
            With the research institute's model only the features can be saved first, so the
            features are moved directly (an extra column in the database).
        '''
        current_day = get_current_day()
        log_file = open(os.path.join(self.log_dir, current_day + '.txt'), 'a')
        start = time.time()
        add_num = 0
        all_new_pic_name = get_all_new_face()
        for feature_str, person_name in all_new_pic_name:
            face_feature = np.reshape(msgpack_numpy.loads(base64.b64decode(feature_str)), (1, self.feature_dim))
            self.add_one_pic(face_feature, person_name)
            add_num += 1
        if add_num > 0:
            end = time.time()
            current_time = get_current_time()
            log_file.write('\t'.join(map(str, [current_time, 'add_pic_num :', add_num, 'Dynamic_increase_time :', (end - start)])) + '\n')
            log_file.close()
        else:
            log_file.close()


    def find_k_neighbors_with_lsh(self, one_pic_feature):
        '''
            :param one_pic_feature: image feature
            :return: the neighbours' features, needed for the pairwise computation
        '''
        try:
            one_pic_feature = one_pic_feature.reshape(1, self.feature_dim)
            tmp = self.lshf.kneighbors(one_pic_feature, n_neighbors=self.n_neighbors, return_distance=True)
            neighbors_label = np.asarray(self.all_labels)[tmp[1][0]]
            neighbors_feature = np.asarray(self.all_pic_feature)[tmp[1][0]]
            cos_sim_list = []
            for index in range(len(neighbors_feature)):
                pair_score = pw.cosine_similarity(neighbors_feature[index], one_pic_feature)[0][0]
                cos_sim_list.append(pair_score)
            result = zip(cos_sim_list, neighbors_label)
            result = self.filter_result(result)
            result.sort(key=lambda x: x[0], reverse=True)
            return result
        except:
            traceback.print_exc()
            return None


    def filter_result(self, result):
        '''
            :param result: [(cos_sim, label), (cos_sim, label), (cos_sim, label)] sorted by cos_sim, descending
            :return: this_id (Must_same, Must_not_same, May_same), this_label (person name)
        '''
        # when scores tie, drop the new_person entries
        tmp_dic = {}
        for element in result:
            this_score, this_label = element
            if this_score in tmp_dic:
                if self.new_person_str in this_label:
                    continue
                else:
                    tmp_dic[this_score] = element
            else:
                tmp_dic[this_score] = element
        result = tmp_dic.values()
        return result


    def evaluate_result(self, result):
        '''
            :param result: [(cos_sim, label), (cos_sim, label), (cos_sim, label)]
                    (only cos_sim drives the decision; same_person_result is no longer used,
                    which is why the elements here are two-tuples)
            :return: this_id (Must_same, Must_not_same, May_same), this_label (person name)
        '''
        for index, element in enumerate(result):
            this_score, this_label = element
            if this_score > self.same_pic_threshold:
                return self.same_pic_id, this_label
            if this_score > self.upper_threshold:
                return self.must_be_same_id, this_label
            if this_score > self.lower_threshold:
                return self.maybe_same_id, this_label
        return self.must_be_not_same_id, ''


    def recognize_one_feature(self, im_feature, image_id):
        '''
            Determine the label from a feature.
            :param image_id : the big picture's file name + face_id (which face) --- makes locating it easy
        '''
        start = time.time()
        feature_str = base64.b64encode(msgpack_numpy.dumps(im_feature))
        # im_feature = msgpack_numpy.loads(base64.b64decode(feature_str))
        current_day = get_current_day()
        log_file = open(os.path.join(self.log_dir, current_day + '.txt'), 'a')
        current_time = get_current_time()
        log_file.write('\t'.join(map(str, [current_time, "receive image", image_id])) + '\n')
        try:
            # pipeline: find the nearest pictures ; compute prob ; online clustering ; add to the LSH Forest
            try:
                current_time = float(image_id)
                nearest_sim_list = self.cal_nearest_sim(current_time=current_time, current_feature=im_feature)
                # print 'current_time :', current_time, 'nearest_sim_list :', nearest_sim_list
            except:
                traceback.print_exc()
                nearest_sim_list = []
            # find the nearest pictures --- use the LSH Forest to fetch the closest 10 and compute the distance to each
            dist_label_list = self.find_k_neighbors_with_lsh(im_feature)
            if dist_label_list is None:  # find_k_neighbors_with_lsh returns None on failure
                dist_label_list = []
            dist_label_list.extend(nearest_sim_list)
            dist_label_list = self.filter_result(dist_label_list)
            dist_label_list.sort(key=lambda x: x[0], reverse=True)

            # evaluate the candidate list
            if not dist_label_list:
                # new_person is not treated specially; everything below threshold is judged new_person
                this_id = self.must_be_not_same_id
                this_label = 'new_person'
                # this_id = self.must_be_not_same_id
                # this_label = self.new_person_str + str(self.current_new_person_id)
            else:
                # compute prob --- derive prob from the distance
                this_id, this_label = self.evaluate_result(dist_label_list)
            # online clustering --- use the dist to decide between adding a new person and joining an existing one
            if dist_label_list:
                current_time = get_current_time()
                log_file.write('\t'.join(map(str, [current_time, 'dist_label_list :', map(str, dist_label_list)])) + '\n')
            # need_add decides whether to add to the LSHForest; need_save decides whether to store in the database
            if this_id == self.same_pic_id:
                need_add = False
                need_save = True
            elif this_id == self.must_be_same_id:
                need_add = False
                need_save = True
            elif this_id == self.must_be_not_same_id:
                # this version does not add new people; everything unrecognized returns new_person
                this_label = 'new_person'
                need_save = True
                need_add = False
                # this_label = self.new_person_str + str(self.current_new_person_id)
                # self.current_new_person_id += 1
                # need_add = True
                # need_save = True
            elif this_id == self.maybe_same_id:
                need_add = False
                need_save = False
            else:
                current_time = get_current_time()
                log_file.write('\t'.join(map(str, [current_time, 'error para :', this_id])) + '\n')
                return self.unknown, str(self.max_dist_threshold), feature_str, str(False)
            self.nearest.append((this_label, im_feature, image_id))
            # new_person entries are no longer added
            # # add to the LSH Forest --- partial_fit
            # if need_add:
            #     # only a new person's picture is added to the LSHForest and appended to the file
            #     self.add_one_pic(im_feature, this_label)
            #     write_start = time.time()
            #     tmp_file = open(self.all_feature_label_file, 'a')
            #     tmp_file.write(base64.b64encode(msgpack_numpy.dumps((im_feature, this_label)))+'\n')
            #     tmp_file.close()
            #     print 'write time :', (time.time() - write_start)
            #     # label and image_id determine the generated file name; decide whether to store the file [optionally both on the server and locally]
            # count how many pictures fall in the gray area
            log_file.write('\t'.join(map(str, ['stat', 'recognize_id', self.trans_dic[this_id], 'recog time :', (time.time() - start)])) + '\n')
            log_file.close()
            if this_id == self.same_pic_id or this_id == self.must_be_not_same_id or this_id == self.must_be_same_id:
                if this_label is None or dist_label_list is None:
                    # the database may contain no one at all, in which case this_label is None
                    return self.unknown, str(self.max_dist_threshold), feature_str, str(False)
                else:
                    return this_label.replace(self.must_same_str, ''), str(dist_label_list[0][0]), feature_str, str(need_save)
            else:
                # gray area, do not show the person's name
                # return this_label.replace(self.maybe_same_str, ''), str(dist_label_list[0][0]), str(has_save_num), str(need_add)
                return self.unknown, str(dist_label_list[0][0]), feature_str, str(need_save)
        except:
            traceback.print_exc()
            log_file.close()
            return self.unknown, str(self.max_dist_threshold), feature_str, str(False)


    def recognize_online_cluster(self, content, image_id):
        '''
            This routine does not need to store the picture; returning the flags is enough.
            Adds filtering.
            :param content: the result returned by the detection/recognition service
            :return:
        '''
        tmp = content.split('\n')
        print 'len(tmp) :', len(tmp)
        if len(tmp) < 3:
            return None
        face_num = int(tmp[0].split(':')[1])
        all_frames = []
        all_recognize_result = []
        for k in range(face_num):
            frame = map(float, tmp[2 * k + 1].split(','))
            feature = np.reshape(np.asarray(map(float, tmp[2 * k + 2].split(',')[:-1])), (1, self.feature_dim))
            person_name, score, has_save_pic_feature, need_save = self.recognize_one_feature(feature, image_id)
            all_recognize_result.append((person_name, score, has_save_pic_feature, need_save))
            all_frames.append(frame)
        return zip(all_frames, all_recognize_result)
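    # Assumed layout of `content` (inferred from the indexing above): line 0 is
    # 'face_num:N'; for face k (0-based), line 2*k+1 holds the comma-separated
    # frame and line 2*k+2 the comma-separated feature with a trailing comma.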


    def offline_add(self, folder):
        # offline import of a folder's data (each picture is named after its label)
        pic_list = os.listdir(folder)
        pic_info = []
        for pic in pic_list[:]:
            print 'pic :', pic
            label = pic.split('.')[0]
            label = label.decode('gbk').encode('utf-8')
            pic_path = os.path.join(folder, pic)
            img_array = cv2.imread(pic_path)
            try:
                tmp = self.extract_pic_feature(img_array)
                if tmp is None:
                    continue
                face_frame, im_feature = tmp
                if face_frame is None or im_feature is None:
                    continue
            except:
                traceback.print_exc()
                continue
            x, y, w, h = face_frame
            face = img_array[int(y):int(y + h), int(x):int(x + w), :]
            algorithm = self.model_label
            face_str = base64.encodestring(cv2.imencode('.jpg', face)[1].tostring())
            img_str = base64.encodestring(cv2.imencode('.jpg', img_array)[1].tostring())
            # tmp_array  = cv2.imdecode(np.fromstring(base64.decodestring(img_str), dtype=np.uint8), 1)
            # cv2.imwrite(str(time.time())+'.jpg', tmp_array)

            pic_info.append((label, algorithm, face_str, img_str))
        insert_pic_list(pic_info)
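
A minimal driver sketch for the Analysis class above (not part of the original): Conf is a stand-in assumption for the real configuration object, and the database helpers the class relies on (get_all_name, get_all_annotate_half, insert_pic_list) must be reachable.

class Conf(object):
    feature_url = 'http://127.0.0.1:8000/feature'   # hypothetical endpoint
    feature_dim = 256
    same_pic_threshold = 0.95
    upper_threshold = 0.85
    lower_threshold = 0.70
    all_feature_label_file = '/tmp/all_feature_label.txt'
    log_dir = '/tmp/analysis_log'
    model_label = 'research_v1'

analysis = Analysis(Conf())
analysis.load_all_data()                 # index all annotated half-body pictures
analysis.offline_add('/data/labelled')   # one <label>.jpg per picture (hypothetical path)
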
class FaceRecognition():
    def __init__(self):
        self.unknown = ''
        self.same_person_num = 1
        self.has_cal_dist = []
        self.NeighbourNum = 10
        # when an administrator loads pictures, they go into that person's directory under
        # all_pic_data_folder (the image file and the feature file share the same name)
        self.all_pic_feature_data_folder = '/data/liubo/face/research_feature_self'     # the research institute's model stores features directly
        # keeping the images makes later inspection, front-end display and labelling easier
        self.all_pic_data_folder = '/data/liubo/face/research_self'
        if not os.path.exists(self.all_pic_data_folder):
            os.makedirs(self.all_pic_data_folder)
        if not os.path.exists(self.all_pic_feature_data_folder):
            os.makedirs(self.all_pic_feature_data_folder)
        self.n_neighbors = 10
        self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors)
        self.all_labels = []
        self.all_pic_feature = []
        self.same_pic_id = 2
        self.must_be_same_id = 1
        self.must_be_not_same_id = 0
        self.maybe_same_id = 3
        self.new_person_str = 'new_person_'
        self.current_new_person_id = self.find_current_new_person_id()
        self.must_same_str = '_Must_Same'
        self.maybe_same_str = '_Maybe_same'
        self.load_time = time.time()
        self.user_count = {}
        self.upper_threshold = upper_verif_threshold
        self.lower_threshold = lower_verif_threshold
        self.same_pic_threshold = same_pic_threshold
        self.trans_dic = {self.same_pic_id: 'same_pic', self.must_be_same_id: 'must_same_id',
                          self.must_be_not_same_id: 'must_not_same_id', self.maybe_same_id: 'maybe_same_id'}
        self.nearest = deque(maxlen=nearest_num)
        self.verification_same_person = 0


    def cal_nearest_sim(self, current_feature):
        nearest_sim_list = []
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')

        try:
            length = len(self.nearest)
            for k in range(length):
                try:
                    person_name, pre_feature = self.nearest[k]
                    # time is no longer considered, only picture similarity

                    this_sim = pw.cosine_similarity(np.reshape(np.asarray(pre_feature), (1, len(pre_feature))),
                                                        np.reshape(np.asarray(current_feature), (1, len(current_feature))))
                    nearest_sim_list.append((this_sim, verification_model.predict(this_sim), person_name))
                except:
                    log_file.write('cal_nearest_sim error'+'\n')
                    traceback.print_exc()
                    continue
            return nearest_sim_list
        except:
            traceback.print_exc()
            return nearest_sim_list
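
    # Sketch (an illustration, not in the original code): each deque entry is a
    # (person_name, feature) pair, so the loop above scores one row against
    # another with sklearn's pairwise helper:
    #   >>> a = np.asarray(pre_feature).reshape(1, -1)
    #   >>> b = np.asarray(current_feature).reshape(1, -1)
    #   >>> pw.cosine_similarity(a, b)[0][0]    # a scalar in [-1, 1]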


    def load_train_data(self, data_folder):
        # read the image features directly and return all features and labels
        all_pic_feature = []
        all_label = []
        person_list = os.listdir(data_folder)
        for person in person_list:
            if person == self.unknown or self.must_same_str in person or self.maybe_same_str in person:
                continue
            person_path = os.path.join(data_folder, person)
            pic_feature_list = os.listdir(person_path)
            for pic_feature_path in pic_feature_list:
                pic_feature = msgpack_numpy.load(open(os.path.join(person_path, pic_feature_path), 'rb'))
                all_pic_feature.append(pic_feature)
                all_label.append(person)
        all_pic_feature = np.asarray(all_pic_feature)
        all_label = np.asarray(all_label)
        return all_pic_feature, all_label
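
    # Expected on-disk layout, inferred from the loop above (names are
    # illustrative): one sub-folder per person, one msgpack feature file per
    # image, e.g.
    #   /data/liubo/face/research_feature_self/
    #       alice/pic_001.jpg.p
    #       bob/pic_007.jpg.p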


    def find_current_new_person_id(self):
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')

        old_person_id = []
        # this folder stores the original images
        person_list = os.listdir(self.all_pic_data_folder)
        for person in person_list:
            if person.startswith(self.new_person_str):
                tmp = person[len(self.new_person_str):].split('_')
                if len(tmp) > 0:
                    this_id = int(tmp[0])
                    old_person_id.append(this_id)
        if len(old_person_id) == 0:
            current_new_person_id = 0
        else:
            current_new_person_id = max(old_person_id) + 1
        log_file.write('\t'.join(map(str, ['current_new_person_id :', current_new_person_id]))+'\n')
        log_file.close()
        return current_new_person_id
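
    # Worked example (illustrative folder names): a listing of
    # ['new_person_0_a', 'new_person_3_b', 'alice'] yields
    # old_person_id == [0, 3], so the next id handed out is max + 1 == 4;
    # with no new_person_* folders at all, the counter starts at 0.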


    def extract_pic_feature(self, pic_path):
        try:
            result = extract_feature_from_binary_data(open(pic_path, 'rb'))
            if result is None:
                return
            face_num, all_frames, all_feature = result
            biggest_face_index = find_big_face(all_frames)
            pic_frame = all_frames[biggest_face_index]
            pic_feature = all_feature[biggest_face_index]
            x, y, width, height = pic_frame
            face_pic = cv2.imread(pic_path)[y:y+height, x:x+width, :]
            return face_pic, pic_feature
        except:
            traceback.print_exc()
            return None
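
    # Sketch of the frame convention assumed above: find_big_face returns the
    # index of the largest detected box, each frame unpacks as
    # (x, y, width, height), and the crop takes rows y:y+height and columns
    # x:x+width of the BGR image:
    #   >>> x, y, width, height = 10, 20, 64, 80    # illustrative values
    #   >>> face_pic = cv2.imread(pic_path)[20:100, 10:74, :]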


    def load_all_data(self):
        # read in all previously labeled data (features are read directly) and store it in the LSH Forest for fast distance computation
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')
        try:
            all_pic_feature, all_label = self.load_train_data(self.all_pic_feature_data_folder)
            train_label = np.asarray(all_label)
            if len(all_pic_feature) == len(train_label) and len(train_label) > 0:
                start = time.time()
                self.lshf.fit(all_pic_feature, train_label)
                self.all_pic_feature = list(all_pic_feature)
                self.all_labels = list(train_label)
                end = time.time()
                self.load_time = end
                self.user_count = Counter(self.all_labels)
                log_file.write('\t'.join(map(str, [self.user_count,
                                           'fit all data time :', (end - start)]))+'\n')
                log_file.close()
        except:
            traceback.print_exc()
            log_file.close()
            return


    def save_pic_feature(self, pic_path, person_name):
        #  extract features from an existing file and save them to the target folder; used when the administrator adds new images (after adding, extract the feature and save it to the target folder)
        person_pic_folder_path = os.path.join(self.all_pic_data_folder, person_name)
        person_feature_folder_path = os.path.join(self.all_pic_feature_data_folder, person_name)
        if not os.path.exists(person_pic_folder_path):
            os.makedirs(person_pic_folder_path)
        if not os.path.exists(person_feature_folder_path):
            os.makedirs(person_feature_folder_path)
        pic_name = os.path.split(pic_path)[-1]
        # feature file
        person_feature_path = os.path.join(person_feature_folder_path, pic_name)
        # face image file
        person_pic_path = os.path.join(person_pic_folder_path, pic_name)
        result = extract_feature_from_binary_data(open(pic_path, 'rb'))
        if result is None:
            return
        face_num, all_frames, all_feature = result
        biggest_face_index = find_big_face(all_frames)
        pic_frame = all_frames[biggest_face_index]
        pic_feature = all_feature[biggest_face_index]
        x, y, width, height = pic_frame
        face_pic = cv2.imread(pic_path)[y:y+height, x:x+width, :]
        cv2.imwrite(person_pic_path, face_pic)
        msgpack_numpy.dump(pic_feature, open(person_feature_path, 'wb'))
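
    # msgpack_numpy round-trip sketch (illustrative path): features are
    # persisted as msgpack blobs here and read back in load_train_data:
    #   >>> msgpack_numpy.dump(pic_feature, open('/tmp/feature.p', 'wb'))
    #   >>> restored = msgpack_numpy.load(open('/tmp/feature.p', 'rb'))
    #   >>> np.allclose(pic_feature, restored)    # -> True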


    def add_all_new_pic(self):
        '''
            Load every file added since the last data load into the LSH Forest (it may be a brand-new person, or new images for an existing person).
            Walk the folder (self.all_pic_feature_data_folder) and decide from each file's timestamp whether its feature needs to be added.
            After the administrator labels an image, the system moves the face image and the feature file together, so here we only need to add the feature and its label to the LSH Forest.
        '''
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')
        start = time.time()
        person_list = os.listdir(self.all_pic_data_folder)
        add_num = 0
        for person in person_list:
            if self.must_same_str in person or self.maybe_same_str in person or self.new_person_str in person:
                continue
            person_path = os.path.join(self.all_pic_data_folder, person)
            if not os.path.isdir(person_path):
                continue
            pic_list = os.listdir(person_path)
            for pic in pic_list:
                pic_path = os.path.join(person_path, pic)
                last_modify_time = os.stat(pic_path).st_mtime
                if last_modify_time > self.load_time:
                    request = {
                        "label": person,
                        "request_type": 'add',
                        "one_pic_feature": pic_path
                    }
                    url = "http://127.0.0.1:%d/"%port
                    result = image_request(request, url)
                    try:
                        add_flag = json.loads(result)["add"]
                        if not add_flag:    # the service failed to add it
                            log_file.write('\t'.join(map(str, ['no add file :', pic_path]))+'\n')
                        else:
                            add_num += 1
                    except:
                        log_file.write('\t'.join(map(str, ['no add file :', pic_path]))+'\n')
                        traceback.print_exc()
                        continue
        end = time.time()
        if add_num > 0:
            self.load_time = end
            log_file.write('\t'.join(map(str, ['self.load_time', self.load_time]))+'\n')
            log_file.write('\t'.join(map(str, ['add pic num :', add_num,
                                               'Dynamic increase time :', (end - start)]))+'\n')
            log_file.close()
        else:
            log_file.close()
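
    # Hedged sketch of the HTTP exchange above: image_request posts the
    # request dict to the local service on `port`, and the JSON reply is
    # assumed to carry an "add" flag, e.g. (illustrative body):
    #   request = {"label": person, "request_type": 'add', "one_pic_feature": pic_path}
    #   result = '{"add": true}'
    #   json.loads(result)["add"]    # -> True, counted in add_num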


    def add_one_new_pic(self, pic_path, label):
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')
        try:
            # the data has already been resized as needed when read in
            result = self.extract_pic_feature(pic_path)
            if result is None:
                return False
            face_pic, pic_feature = result
            self.add_one_pic(pic_feature, label)
            pic_name = os.path.split(pic_path)[1]
            this_person_pic_folder = os.path.join(self.all_pic_data_folder, label)
            this_person_feature_folder = os.path.join(self.all_pic_feature_data_folder, label)
            if not os.path.exists(this_person_pic_folder):
                os.makedirs(this_person_pic_folder)
            if not os.path.exists(this_person_feature_folder):
                os.makedirs(this_person_feature_folder)
            # store the feature for the image directly, and save the image file as well
            this_pic_feature_name = os.path.join(this_person_feature_folder, pic_name + '.p')
            msgpack_numpy.dump(pic_feature, open(this_pic_feature_name, 'wb'))
            this_pic_face_name = os.path.join(this_person_pic_folder, pic_name + '.jpg')
            cv2.imwrite(this_pic_face_name, face_pic)
            log_file.write('\t'.join(map(str, [pic_path, this_pic_face_name]))+'\n')
            return True
        except:
            traceback.print_exc()
            return False


    def add_one_pic(self, one_pic_feature, pic_label):
        '''
            Add one image's feature to the LSH Forest and append its label to self.all_labels
            :param one_pic_feature: array shape :(1,1024)
            :param pic_label: (1,)
            :return:
        '''
        one_pic_feature = np.asarray(one_pic_feature)
        self.lshf.partial_fit(one_pic_feature.reshape(1, FEATURE_DIM), pic_label)
        self.all_labels.append(pic_label)
        self.all_pic_feature.append(np.reshape(one_pic_feature, newshape=(1, one_pic_feature.size)))
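
    # Usage sketch (illustrative values): incremental insertion keeps the
    # forest, the label list, and the feature list aligned by index:
    #   >>> feature = np.random.rand(FEATURE_DIM)
    #   >>> self.add_one_pic(feature, 'alice')
    #   >>> self.all_labels[-1]    # -> 'alice'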


    def find_k_neighbors_with_lsh(self, one_pic_feature):
        '''
            :param one_pic_feature: image feature
            :return: the neighbors' features, used for the pairwise computation
        '''
        try:
            one_pic_feature = np.asarray(one_pic_feature)
            tmp = self.lshf.kneighbors(one_pic_feature.reshape(1, FEATURE_DIM), n_neighbors=self.n_neighbors, return_distance=True)
            neighbors_label = np.asarray(self.all_labels)[tmp[1][0]]
            neighbors_feature = np.asarray(self.all_pic_feature)[tmp[1][0]]
            pair_score_list = []
            cos_sim_list = []
            for index in range(len(neighbors_feature)):
                pair_score = pw.cosine_similarity(neighbors_feature[index].reshape(1, FEATURE_DIM),
                                     one_pic_feature.reshape(1, FEATURE_DIM))[0][0]
                cos_sim_list.append(pair_score)
                pair_score_list.append(verification_model.predict(pair_score))
            result = zip(cos_sim_list, pair_score_list, neighbors_label)
            # result = self.filter_result(result)
            # result.sort(key=lambda x:x[0], reverse=True)
            return result
        except:
            return None
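
    # Result format sketch: lshf.kneighbors returns (distances, indices); the
    # indices select the labels/features stored by add_one_pic, and every
    # output tuple is (cosine_similarity, verification_prediction, label),
    # e.g. an illustrative row: (0.83, 1, 'alice').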


    def filter_result(self, result):
        '''
            :param result: [(cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label)] sorted by cos_sim in descending order
            :return: this_id(Must_same, Must_not_same, May_same), this_label(person name)
        '''
        # for entries with identical scores, drop the new_person one
        tmp_dic = {}
        for element in result:
            try:
                this_score, this_same_person_result, this_label = element
                this_score = float(this_score)
                if this_score in tmp_dic:
                    if self.new_person_str in this_label:
                        continue
                    else:
                        tmp_dic[this_score] = element
                else:
                    tmp_dic[this_score] = element
            except:
                traceback.print_exc()
                continue
        result = tmp_dic.values()
        return result
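
    # Worked example (illustrative tuples): given
    #   [(0.9, 1, 'alice'), (0.9, 1, 'new_person_2'), (0.7, 0, 'bob')]
    # the 0.9 bucket keeps 'alice' and skips the new_person_* duplicate, so
    # the filtered result is [(0.9, 1, 'alice'), (0.7, 0, 'bob')]
    # (dict ordering aside).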


    def evaluate_result(self, result):
        '''
            :param result: [(cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label),
                            (cos_sim, same_person_result, label)]
            :return: this_id(Must_same, Must_not_same, May_same), this_label(person name)
        '''
        for index, element in enumerate(result):
            this_score, this_same_person_result, this_label = element
            if this_same_person_result == self.verification_same_person and this_score > self.same_pic_threshold:
                return self.same_pic_id, this_label
            if this_same_person_result == self.verification_same_person and this_score > self.upper_threshold:
                return self.must_be_same_id, this_label
            if this_same_person_result == self.verification_same_person and this_score > self.lower_threshold:
                return self.maybe_same_id, this_label
        return self.must_be_not_same_id, ''
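
    # Threshold sketch: with same_pic_threshold > upper_threshold >
    # lower_threshold, the first tuple whose verdict equals
    # self.verification_same_person is bucketed by its score; with
    # illustrative thresholds 0.95/0.85/0.70, a 0.9 score maps to
    # must_be_same_id, a 0.75 score to maybe_same_id, and anything
    # unverified falls through to must_be_not_same_id.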


    def recognize_online_cluster(self, image, image_id):
        '''
            :param image: recognize the received image and add it to the LSH Forest; compute proba from the distance (different distances map to different accuracies, with the threshold derived from the existing dist);
                            use the preset thresholds to decide whether this is a newly seen person, certainly a known person, or an uncertain match
            :return:
        '''
        start = time.time()
        need_add = False
        need_save = False
        current_day = get_current_day()
        log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a')
        log_file.write('\t'.join(map(str, ["receive image", image_id, time.time()])) + '\n')
        feature_str = ''
        try:
            image = base64.decodestring(image)
            image = zlib.decompress(image)
            im = cv2.imdecode(np.fromstring(image, dtype=np.uint8), 1)
            log_file.write('\t'.join(map(str, ['shape :', im.shape[0], im.shape[1]])) + '\n')
            # filter by image size
            if im.shape[0] < size_threshold or im.shape[1] < size_threshold:
                log_file.write('\t'.join(map(str, ['stat recognize_time :', (time.time() - start), 'small_size'])) + '\n')
                log_file.close()
                return self.unknown, 1.0, feature_str, need_save
            # filter by blurriness
            blur_sign, blur_var = is_blur(cv2.resize(im, (96, 96)))
            if blur_sign:
                log_file.write('\t'.join(map(str, ['stat recognize_time :', (time.time() - start), 'blur_filter', blur_var])) + '\n')
                log_file.close()
                return self.unknown, 1.0, feature_str, need_save
            #  save the received image
            # img_file = '/tmp/research_face/%s.jpg' %image_id
            time_slot = get_time_slot(image_id)
            if time_slot is None:
                time_slot = 'error'
            time_slot_dir = os.path.join(tmp_face_dir, time_slot)
            if not os.path.exists(time_slot_dir):
                os.makedirs(time_slot_dir)
            img_file = os.path.join(time_slot_dir, image_id+'.jpg')
            cv2.imwrite(img_file, im)
        except:
            traceback.print_exc()
            log_file.close()
            return self.unknown, 1.0, feature_str, need_save
        try:
            # pipeline: find the nearest images; compute prob; cluster online; add to the LSH Forest
            result = self.extract_pic_feature(img_file)
            if result is None:
                log_file.write('\t'.join(map(str, ['stat not_find_face', 'time :', (time.time() - start)]))+'\n')
                log_file.close()
                return self.unknown, 1.0, feature_str, need_save
            face_pic, im_feature = result

            try:
                # nearest_sim_list has the same format as dist_label_list, so the two lists can be merged and evaluated together (no need to factor in time)
                # after the person is recognized, the name and feature are pushed into self.nearest
                nearest_sim_list = self.cal_nearest_sim(current_feature=im_feature)
            except:
                traceback.print_exc()
                nearest_sim_list = []
            log_file.write('\t'.join(map(str, ['nearest_sim_list :', map(str, nearest_sim_list)])) + '\n')
            feature_str = base64.b64encode(msgpack_numpy.dumps(im_feature))
            log_file.write('\t'.join(map(str, ['extract_feature_time :', (time.time() - start)]))+'\n')
            # find the nearest images --- use the LSH Forest to fetch the 10 nearest images, then compute the distance to each

            tmp_list = self.find_k_neighbors_with_lsh(im_feature)
            if tmp_list is not None:
                nearest_sim_list.extend(tmp_list)
            # merge first, then sort, so LSH neighbors and recent frames are ranked together
            nearest_sim_list.sort(key=lambda x: x[0], reverse=True)
            dist_label_list = nearest_sim_list[:]

            # evaluate
            log_file.write('\t'.join(map(str, ['dist_label_list :', map(str, dist_label_list)])) + '\n')
            if not dist_label_list:
                this_id = self.must_be_not_same_id
                this_label = self.new_person_str + str(self.current_new_person_id)
            else:
                # compute prob --- derive prob from the distance
                this_id, this_label = self.evaluate_result(dist_label_list)
            # regardless of probability, always append the newest image to self.nearest
            self.nearest.append((this_label, im_feature))
            log_file.write('\t'.join(map(str, ['self.nearest :', map(str, self.nearest)])) + '\n')
            # online clustering --- decide from dist whether to create a new person or merge into an existing one
            if this_id == self.same_pic_id:
                need_add = False
            elif this_id == self.must_be_same_id:
                need_add = False
                need_save = True
                this_person_pic_folder = os.path.join(self.all_pic_data_folder, this_label+self.must_same_str)
                this_person_feature_folder = os.path.join(self.all_pic_feature_data_folder, this_label+self.must_same_str)
            elif this_id == self.must_be_not_same_id:
                this_label = self.new_person_str + str(self.current_new_person_id)
                self.current_new_person_id += 1
                this_person_pic_folder = os.path.join(self.all_pic_data_folder, this_label)
                this_person_feature_folder = os.path.join(self.all_pic_feature_data_folder, this_label)
                need_add = True
                need_save = True
            elif this_id == self.maybe_same_id:
                this_person_pic_folder = os.path.join(self.all_pic_data_folder, this_label + self.maybe_same_str)
                this_person_feature_folder = os.path.join(self.all_pic_feature_data_folder, this_label + self.maybe_same_str)
                need_add = False    # samples whose prob falls in the gray area are not added; all other cases are
                need_save = True
            else:
                log_file.write('\t'.join(map(str, ['error para :', this_id]))+'\n')
            if need_save:
                try:
                    if not os.path.exists(this_person_pic_folder):
                        os.makedirs(this_person_pic_folder)
                    if not os.path.exists(this_person_feature_folder):
                        os.makedirs(this_person_feature_folder)
                    # store the feature for the image directly, and save the image file as well
                    this_pic_feature_name = os.path.join(this_person_feature_folder, image_id+'.p')
                    msgpack_numpy.dump(im_feature, open(this_pic_feature_name, 'wb'))
                    this_pic_face_name = os.path.join(this_person_pic_folder, image_id+'.jpg')
                    cv2.imwrite(this_pic_face_name, face_pic)
                except:
                    traceback.print_exc()
                    return self.unknown, 1.0, feature_str, False
            # add to the LSH Forest --- partial_fit
            if need_add:
                self.add_one_pic(im_feature, this_label)
                # label and image_id determine the generated file name, deciding whether to store the file [it can be stored on both the server and locally]
            if this_id == self.same_pic_id or this_id == self.must_be_not_same_id or this_id == self.must_be_same_id:
                end = time.time()
                log_file.write('\t'.join(map(str, ['stat recognize_time :', (end - start), 'this_id :', self.trans_dic.get(this_id)]))+'\n')
                log_file.close()
                need_save = True
                return this_label.replace(self.must_same_str, ''), str(dist_label_list[0][0]), str(feature_str), str(need_save)
            else:
                # gray area: do not display a person name
                end = time.time()
                log_file.write('\t'.join(map(str, ['stat gray_area :', (end - start)]))+'\n')
                log_file.close()
                return self.unknown, str(dist_label_list[0][0]), str(feature_str), str(False)
        except:
            traceback.print_exc()
            log_file.close()
            return self.unknown, str(100.0), str(feature_str), str(False)
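
# End-to-end usage sketch (an assumption, not in the original source: the
# class depends on module-level config such as log_dir, port and
# verification_model being defined first). recognize_online_cluster expects
# a zlib-compressed, base64-encoded image, matching its decode path:
#   recognizer = FaceRecognition()
#   recognizer.load_all_data()
#   raw = open('query.jpg', 'rb').read()    # hypothetical image
#   packed = base64.encodestring(zlib.compress(raw))
#   name, score, feature_str, saved = recognizer.recognize_online_cluster(packed, 'cam0_001')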