def test_partial_fit():
    """Checks whether inserting an array is consistent with the fitted data.

    The `partial_fit` method should set all attribute values correctly.
    """
    n_samples = 12
    n_samples_partial_fit = 3
    n_features = 2
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    X_partial_fit = rng.rand(n_samples_partial_fit, n_features)

    lshf = LSHForest()

    # Test unfitted estimator: partial_fit on a fresh forest behaves like fit
    lshf.partial_fit(X)
    assert_array_equal(X, lshf._fit_X)

    lshf.fit(X)

    # Inserting data with the wrong dimension raises
    assert_raises(ValueError, lshf.partial_fit,
                  np.random.randn(n_samples_partial_fit, n_features - 1))

    lshf.partial_fit(X_partial_fit)

    # Size of _fit_X grows by n_samples_partial_fit after insertion
    assert_equal(lshf._fit_X.shape[0],
                 n_samples + n_samples_partial_fit)
    # Size of original_indices_[0] grows by n_samples_partial_fit
    assert_equal(len(lshf.original_indices_[0]),
                 n_samples + n_samples_partial_fit)
    # Size of trees_[1] grows by n_samples_partial_fit
    assert_equal(len(lshf.trees_[1]),
                 n_samples + n_samples_partial_fit)
def cal_acc(pack_file, stat_file, feature_dim):
    f = open(stat_file, 'w')
    f.write('train_pic_num' + '\t' + 'person_name' + '\t' + 'acc' + '\n')
    pic_num = range(1, max_person_num)
    for num in pic_num:
        all_train_data, all_train_label, all_valid_data, all_valid_label = \
            split_train_valid(pack_file, train_pic_num=num, feature_dim=feature_dim)
        lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5)
        for index in range(len(all_train_data)):
            try:
                if all_train_data[index] is None:
                    continue
                lshf.partial_fit(all_train_data[index], all_train_label[index])
            except:
                traceback.print_exc()
                continue
        # Track accuracy separately for each person
        person_acc_dic = {}   # number of correct matches per person
        person_all_dic = {}   # total number of matches per person
        filter_num = 0
        all_num = 0
        for index in range(len(all_valid_data)):
            try:
                if all_valid_data[index] is None:
                    continue
                all_find_distance, all_find_index = lshf.kneighbors(
                    all_valid_data[index], n_neighbors=5, return_distance=True)
                cos_sim = cosine_similarity(all_valid_data[index],
                                            all_train_data[all_find_index[0, 0]])
                label = all_train_label[all_find_index[0, 0]]
                # if cos_sim > sim_threshold:
                if True:
                    if label == all_valid_label[index]:
                        person_acc_dic[label] = person_acc_dic.get(label, 0) + 1
                        person_all_dic[label] = person_all_dic.get(label, 0) + 1
                    else:
                        person_all_dic[label] = person_all_dic.get(label, 0) + 1
                else:
                    filter_num += 1
                all_num += 1
            except:
                print all_valid_label[index]
                continue
        print 'train_num :', num, 'filter_rate: ', (filter_num * 1.0 / all_num)
        for person in person_all_dic:
            all_num = person_all_dic[person]
            right_num = person_acc_dic.get(person, 0)
            f.write('\t'.join(map(str, [num, person, (right_num * 1.0 / all_num)])) + '\n')
def cal_recall(pack_file, stat_file, feature_dim):
    # f_model = open('verf.txt', 'w')
    f = open(stat_file, 'w')
    f.write('train_pic_num' + '\t' + 'person_name' + '\t' + 'recall' + '\n')
    pic_num = range(1, max_person_num)
    for num in pic_num:
        all_train_data, all_train_label, all_valid_data, all_valid_label = \
            split_train_valid(pack_file, train_pic_num=num, feature_dim=feature_dim)
        lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5)
        for index in range(len(all_train_data)):
            try:
                if all_train_data[index] is None:
                    continue
                lshf.partial_fit(all_train_data[index], all_train_label[index])
            except:
                continue
        # Track recall separately for each person
        person_find_dic = {}   # number of correct matches per person
        person_all_dic = {}    # total number of matches per person
        for index in range(len(all_valid_data)):
            try:
                if all_valid_data[index] is None:
                    continue
                all_find_distance, all_find_index = lshf.kneighbors(
                    all_valid_data[index], n_neighbors=5, return_distance=True)
                cos_sim = cosine_similarity(all_valid_data[index],
                                            all_train_data[all_find_index[0, 0]])
                label = all_train_label[all_find_index[0, 0]]
                real_label = all_valid_label[index]
                # if cos_sim > sim_threshold:
                if True:
                    if label == real_label:
                        # f_model.write('0'+'\t'+str(cos_sim)+'\n')
                        person_find_dic[real_label] = person_find_dic.get(real_label, 0) + 1
                        person_all_dic[real_label] = person_all_dic.get(real_label, 0) + 1
                    else:
                        # f_model.write('1' + '\t' + str(cos_sim) + '\n')
                        person_all_dic[real_label] = person_all_dic.get(real_label, 0) + 1
            except:
                print all_valid_label[index]
                continue
        print 'train_num :', num
        for person in person_all_dic:
            all_num = person_all_dic[person]
            right_num = person_find_dic.get(person, 0)
            f.write('\t'.join(map(str, [num, person, (right_num * 1.0 / all_num)])) + '\n')
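Both statistics drivers above share the same call signature. A minimal, hypothetical invocation is sketched below; the file names and the 256-dimensional feature size are placeholders, not values from the original project.

# Hypothetical driver; paths and feature_dim are placeholders.
if __name__ == '__main__':
    cal_acc('features.pack', 'acc_stat.txt', feature_dim=256)
    cal_recall('features.pack', 'recall_stat.txt', feature_dim=256)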
def train_LSHForest(model, batch_size=1000, n_candidates=50, n_estimators=10):
    '''
    Given a large word2vec or GloVe model, we need to be able to efficiently
    map a vector back to a word. Current methods rely on inefficient search
    algorithms.

    Args
    ----
    model : gensim.model
        pretrained word2vec model
    batch_size : int
    n_candidates : int
        number of candidates for LSH to generate
    n_estimators : int
        number of LSH trees in the forest

    Returns
    -------
    lshf : LSHForest
    '''
    lshf = LSHForest(n_candidates=n_candidates, n_estimators=n_estimators)
    for batch in grouper(model.index2word, batch_size):
        array = np.array([model[word] for word in batch])
        lshf.partial_fit(array)
    return lshf
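train_LSHForest relies on a grouper helper and on the fact that vectors are inserted in model.index2word order, so a neighbor index maps straight back to a word. A sketch of both pieces, assuming grouper is the standard itertools recipe; the nearest_word helper is illustrative and not part of the original.

from itertools import zip_longest  # izip_longest on Python 2

import numpy as np


def grouper(iterable, n):
    # Standard itertools recipe: yield chunks of size n, trimming the fill
    # values from the final short chunk.
    args = [iter(iterable)] * n
    for chunk in zip_longest(*args):
        yield [item for item in chunk if item is not None]


def nearest_word(lshf, model, vector):
    # Vectors were partial_fit in model.index2word order, so the returned
    # index maps directly back to the vocabulary.
    _, indices = lshf.kneighbors(np.asarray(vector).reshape(1, -1), n_neighbors=1)
    return model.index2word[indices[0][0]]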
class FaceRecognition(): def __init__(self): self.unknown = '' self.same_person_num = 1 self.has_save_pic_feature = [] self.has_cal_dist = [] self.NeighbourNum = 10 self.all_pic_data_folder = '/data/liubo/face/self' self.other_dataset_para_add = 1 self.n_neighbors = 5 self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors) self.all_labels = [] self.all_pic_feature = [] self.same_pic_id = 2 self.must_be_same_id = 1 self.must_be_not_same_id = 0 self.maybe_same_id = 3 self.new_person_str = 'new_person_' self.current_new_person_id = self.find_current_new_person_id() self.must_same_str = '_Must_Same' self.maybe_same_str = '_Maybe_same' self.load_time = time.time() self.user_count = {} # 不同的模型阈值不相同 self.upper_threshold = upper_verif_threshold self.lower_threshold = lower_verif_threshold self.same_pic_threshold = same_pic_threshold self.pitch_threshold = 20 self.yaw_threshold = 20 self.roll_threshold = 20 # [(time, feature),...,(time, feature)] : 根据时间计算当前图片与前5张图片的相似度(如果时间相差很多, 不在计算) self.nearest = deque(maxlen=nearest_num) self.trans_dic = { self.same_pic_id: 'same_pic', self.must_be_same_id: 'must_same_id', self.must_be_not_same_id: 'must_not_same_id', self.maybe_same_id: 'maybe_same_id' } self.verification_same_person = 0 def cal_nearest_sim(self, current_feature): nearest_sim_list = [] try: length = len(self.nearest) for k in range(length): try: person_name, pre_feature = self.nearest[k] # 不在考虑时间, 只考虑图片的相似度 this_sim = pw.cosine_similarity( np.reshape(np.asarray(pre_feature), (1, len(pre_feature))), np.reshape(np.asarray(current_feature), (1, len(current_feature)))) nearest_sim_list.append( (this_sim, verification_model.predict(this_sim), person_name)) except: traceback.print_exc() continue return nearest_sim_list except: traceback.print_exc() return nearest_sim_list def find_current_new_person_id(self): current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a') old_person_id = [] person_list = os.listdir(self.all_pic_data_folder) for person in person_list: if person.startswith(self.new_person_str): tmp = person[len(self.new_person_str):].split('_') if len(tmp) > 0: this_id = int(tmp[0]) old_person_id.append(this_id) if len(old_person_id) == 0: current_new_person_id = 0 else: current_new_person_id = max(old_person_id) + 1 log_file.write('\t'.join( map(str, ['current_new_person_id :', current_new_person_id])) + '\n') log_file.close() return current_new_person_id def extract_pic_feature(self, pic_data, batch_size=1, feature_dim=FEATURE_DIM): ''' 用于提取多张图片的特征(用于处理load数据) :param pic_data: 图片数据 :param batch_size: :param feature_dim: 模型输出维度(vgg的输出是4096) :return: ''' pic_feature = np.zeros(shape=(pic_data.shape[0], feature_dim)) batch_num = pic_data.shape[0] / batch_size for index in range(batch_num): pic_feature[index*batch_size:(index+1)*batch_size, :] = \ extract_feature_from_numpy(pic_data[index*batch_size:(index+1)*batch_size]) if batch_num * batch_size < pic_data.shape[0]: pic_feature[batch_num*batch_size:, :] = \ extract_feature_from_numpy(pic_data[batch_num*batch_size:]) return pic_feature def load_all_data(self): # 将以前标记的数据全部读入,用LSH Forest保存,方便计算距离 current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a') train_data, train_label = load_train_data(self.all_pic_data_folder) if len(train_label) == 0: return pic_feature = self.extract_pic_feature(train_data) start = time.time() self.lshf.fit(pic_feature, train_label) self.all_pic_feature = list(pic_feature) self.all_labels = 
list(train_label) end = time.time() self.load_time = end self.user_count = Counter(self.all_labels) log_file.write('\t'.join( map(str, [self.user_count, 'fit all data time :', (end - start)])) + '\n') log_file.close() def add_all_new_pic(self): ''' 将从上次加载数据到当前新增的文件都加载到LSH Forest(有可能是新增加一个人,还有可能是对已有的人增加新图片) 遍历文件夹(self.all_pic_data_folder),根据文件的时间判断是否需要加入该图片 用户新加入的图片先进行人脸检测, 如果能够检测到人脸,使用检测结果, 否则使用用户的原始图片 ''' current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a') start = time.time() person_list = os.listdir(self.all_pic_data_folder) add_num = 0 for person in person_list: if self.must_same_str in person or self.maybe_same_str in person or self.new_person_str in person: continue person_path = os.path.join(self.all_pic_data_folder, person) if not os.path.isdir(person_path): continue pic_list = os.listdir(person_path) for pic in pic_list: pic_path = os.path.join(person_path, pic) last_modify_time = os.stat(pic_path).st_atime if last_modify_time > self.load_time: # 请求本地服务 request = { "label": person, "request_type": 'add', "one_pic_feature": pic_path } url = "http://127.0.0.1:%d/" % port result = image_request(request, url) try: add_flag = json.loads(result)["add"] if not add_flag: # 加载失败 log_file.write('\t'.join( map(str, ['no add file :', pic_path])) + '\n') else: add_num += 1 except: log_file.write( '\t'.join(map(str, ['no add file :', pic_path])) + '\n') traceback.print_exc() continue add_num += 1 end = time.time() if add_num > 0: self.load_time = end log_file.write( '\t'.join(map(str, ['self.load_time', self.load_time])) + '\n') log_file.write('\t'.join( map(str, [ 'add pic num :', add_num, 'Dynamic increase time :', (end - start) ])) + '\n') log_file.close() else: log_file.close() def add_one_new_pic(self, pic_path, label): try: # 读入数据时已经转换成需要的尺寸 im_feature = extract_feature_from_file(pic_path) self.add_one_pic(im_feature, label) return True except: traceback.print_exc() return False def add_one_pic(self, one_pic_feature, pic_label): ''' 将一个图像的特征加入到LSH Forest,同时将对应的标签加入到self.all_labels :param pic_feature: array shape :(1,1024) :param pic_label: (1,) :return: ''' self.lshf.partial_fit(one_pic_feature.reshape(1, FEATURE_DIM), pic_label) self.all_labels.append(pic_label) self.all_pic_feature.append( np.reshape(one_pic_feature, newshape=(1, one_pic_feature.size))) def find_k_neighbors_with_lsh(self, one_pic_feature): ''' :param one_pic_feature: 图像特征 :return: 需要返回neighbors的特征, 用于计算pariwise ''' try: tmp = self.lshf.kneighbors(one_pic_feature.reshape(1, FEATURE_DIM), n_neighbors=self.n_neighbors, return_distance=True) neighbors_label = np.asarray(self.all_labels)[tmp[1][0]] neighbors_feature = np.asarray(self.all_pic_feature)[tmp[1][0]] pair_score_list = [] cos_sim_list = [] for index in range(len(neighbors_feature)): pair_score = pw.cosine_similarity( neighbors_feature[index].reshape(1, FEATURE_DIM), one_pic_feature.reshape(1, FEATURE_DIM))[0][0] cos_sim_list.append(pair_score) pair_score_list.append(verification_model.predict(pair_score)) result = zip(cos_sim_list, pair_score_list, neighbors_label) # result = self.filter_result(result) # result.sort(key=lambda x:x[0], reverse=True) return result except: return None def filter_result(self, result): ''' :param result: [(cos_sim, same_person_result, label), (cos_sim, same_person_result, label), (cos_sim, same_person_result, label)] 按cos_sim降序排列 :return: this_id(Must_same, Must_not_same, May_same), this_label(人名) ''' # 分值相同的, 将new_person的删去 tmp_dic = {} for element in result: this_score, 
this_same_person_result, this_label = element if this_score in tmp_dic: if self.new_person_str in this_label: continue else: tmp_dic[this_score] = element else: tmp_dic[this_score] = element result = tmp_dic.values() return result def evaluate_result(self, result): ''' :param result: [(cos_sim, same_person_result, label), (cos_sim, same_person_result, label), (cos_sim, same_person_result, label)] :return: this_id(Must_same, Must_not_same, May_same), this_label(人名) ''' for index, element in enumerate(result): this_score, this_same_person_result, this_label = element if this_same_person_result == self.verification_same_person and this_score > self.same_pic_threshold: return self.same_pic_id, this_label if this_same_person_result == self.verification_same_person and this_score > self.upper_threshold: return self.must_be_same_id, this_label if this_same_person_result == self.verification_same_person and this_score > self.lower_threshold: return self.maybe_same_id, this_label return self.must_be_not_same_id, '' def check_face_img(self, face_img, image_id): # 计算角度 ''' :param face_img: 人脸对应的矩阵 :param image_id: 图片id :return: 是否进行识别(False:不进行识别) ''' # 姿势检测 current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a') face_img_str = base64.b64encode(msgpack_numpy.dumps(face_img)) request = { "request_type": 'check_pose', "face_img_str": face_img_str, "image_id": image_id, } url = "http://%s:%d/" % (check_ip, check_port) result = image_request(request, url) try: pose_predict = json.loads(result)["pose_predict"] if not pose_predict: # 加载失败 log_file.write( '\t'.join(map(str, [image_id, 'pose filter request'])) + '\n') log_file.close() return False else: pose_predict = msgpack_numpy.loads( base64.b64decode(pose_predict)) if pose_predict == None: log_file.write( '\t'.join(map(str, [image_id, 'pose filter detect'])) + '\n') log_file.close() return False pitch, yaw, roll = pose_predict[0] if math.fabs(pitch) < self.pitch_threshold and \ math.fabs(yaw) < self.yaw_threshold and \ math.fabs(roll) < self.roll_threshold: log_file.close() return True else: log_file.write('\t'.join( map(str, [image_id, 'pose filter threshold'])) + '\n') log_file.close() return False except: traceback.print_exc() log_file.close() return False def recognize_online_cluster(self, image, image_id): ''' :param image: 将得到的图片进行识别,加入的LSH Forest,根据距离计算proba(不同的距离对应不同的准确率,根据已有的dist计算阈值); 和已经设定的阈值判断是不是一个新出现的人,确定是原来已有的人,还是不确定是原来已有的人 # 增加统计的功能, 方便以后计算过滤原因和比例, 以及识别比例(same, not_same, maybe_same) :return: ''' start = time.time() need_add = False has_save_num = 0 current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a') log_file.write( '\t'.join(map(str, ["receive image", image_id, time.time()])) + '\n') try: image = base64.decodestring(image) image = zlib.decompress(image) im = cv2.imdecode(np.fromstring(image, dtype=np.uint8), 1) time_slot = get_time_slot(image_id) if time_slot == None: time_slot = 'error' time_slot_dir = os.path.join(tmp_face_dir, time_slot) if not os.path.exists(time_slot_dir): os.makedirs(time_slot_dir) tmp_pic_path = os.path.join(time_slot_dir, image_id + '.jpg') cv2.imwrite(tmp_pic_path, im) blur_result = is_blur(im) blur_sign, blur_var = blur_result if blur_sign: log_file.write('\t'.join( map(str, ['stat', 'blur_filter', blur_var, image_id])) + '\n') log_file.close() return self.unknown, 1.0, self.has_save_pic_feature, need_add align_face_img = align_face(tmp_pic_path) if align_face_img == None: log_file.write('\t'.join( map(str, ['stat', 
'detect_filter', blur_var, image_id])) + '\n') log_file.close() return self.unknown, 1.0, self.has_save_pic_feature, need_add else: # 使用重新检测并对对齐的人脸进行识别 im = align_face_img # 对检测到的人脸重新进行模糊检测 blur_result = is_blur(im) blur_sign, blur_var = blur_result if blur_sign: log_file.write('\t'.join( map(str, ['stat', 'blur_filter', blur_var, image_id])) + '\n') log_file.close() return self.unknown, 1.0, self.has_save_pic_feature, need_add need_process = self.check_face_img(im, image_id) if not need_process: log_file.write('\t'.join( map(str, ['stat', 'pose_filter', blur_var, image_id])) + '\n') log_file.close() return self.unknown, 1.0, self.has_save_pic_feature, need_add im = cv2.resize(im, (PIC_SHAPE[1], PIC_SHAPE[2]), interpolation=cv2.INTER_LINEAR) im = im[:, :, ::-1] * 1.0 im = im - avg im = im.transpose((2, 0, 1)) im = im[None, :] except: traceback.print_exc() return self.unknown, 1.0, self.has_save_pic_feature, need_add try: # 流程 : 找距离最近的图片 ; 计算prob ; 在线聚类 ; 加入LSH Forest im_feature = extract_feature_from_numpy(im) try: # nearest_sim_list的格式和dist_label_list的格式一样,这样可以将两个list合并,一起计算(这样不用考虑时间的因素) # 在识别出人名后将人名和feature放入到self.nearest nearest_sim_list = self.cal_nearest_sim( current_feature=im_feature) except: traceback.print_exc() nearest_sim_list = [] log_file.write('\t'.join( map(str, ['nearest_sim_list :', map(str, nearest_sim_list)])) + '\n') # 找距离最近的图片 --- 用LSH Forest 找出最近的10张图片,然后分别计算距离 dist_label_list = self.find_k_neighbors_with_lsh(im_feature) dist_label_list.extend(nearest_sim_list) dist_label_list = self.filter_result(dist_label_list) dist_label_list.sort(key=lambda x: x[0], reverse=True) # 计算 if dist_label_list == None: this_id = self.must_be_not_same_id this_label = self.new_person_str + str( self.current_new_person_id) else: # 计算prob --- 根据距离计算prob this_id, this_label = self.evaluate_result(dist_label_list) # 在线聚类 --- 根据dist确定是重新增加一个人还是加入到已有的人中 log_file.write('\t'.join( map(str, ['stat', 'recognize_id', blur_var, this_id])) + '\n') if dist_label_list != None and len(dist_label_list) > 0: log_file.write('\t'.join( map(str, ['dist_label_list :', map(str, dist_label_list)])) + '\n') need_save = False if this_id == self.same_pic_id: need_add = False elif this_id == self.must_be_same_id: need_add = False need_save = True this_person_folder = os.path.join( self.all_pic_data_folder, this_label + self.must_same_str) elif this_id == self.must_be_not_same_id: this_label = self.new_person_str + str( self.current_new_person_id) self.current_new_person_id += 1 this_person_folder = os.path.join(self.all_pic_data_folder, this_label) need_add = True need_save = True elif this_id == self.maybe_same_id: this_person_folder = os.path.join( self.all_pic_data_folder, this_label + self.maybe_same_str) need_add = False # prob在灰度区域的不如入,其余情况加入 need_save = True else: log_file.write('\t'.join(map(str, ['error para :', this_id])) + '\n') if need_save: try: if not os.path.exists(this_person_folder): os.makedirs(this_person_folder) os.chmod(this_person_folder, stat.S_IRWXG + stat.S_IRWXO + stat.S_IRWXU) this_pic_name = os.path.join(this_person_folder, image_id + '.png') imsave(this_pic_name, np.transpose(im[0], (1, 2, 0))) except: traceback.print_exc() return self.unknown, 1.0, has_save_num, False # 加入LSH Forest --- partial_fit if need_add: self.add_one_pic(im_feature, this_label) has_save_num += 1 # 根据label和image_id可以存生成文件名,确定是否要存储文件[可以选择在服务器和本地同时存储] if this_id == self.same_pic_id or this_id == self.must_be_not_same_id or this_id == self.must_be_same_id: end = time.time() log_file.write('\t'.join( map(str, [ 'stat 
recognize_time :', (end - start), 'this_id :', self.trans_dic.get(this_id) ])) + '\n') log_file.close() return this_label.replace(self.must_same_str, ''), \ str(dist_label_list[0][0]), str(has_save_num), str(need_add) else: # 灰度区域,不显示人名 end = time.time() log_file.write('\t'.join( map(str, ['gray area recog time :', (end - start)])) + '\n') log_file.close() # return this_label.replace(self.maybe_same_str, ''), \ # str(dist_label_list[0][0]), str(has_save_num), str(need_add) return self.unknown, str( dist_label_list[0][0]), str(has_save_num), str(need_add) except: traceback.print_exc() log_file.close() return self.unknown, str(100.0), str(has_save_num), str(False)
class Search():
    def __init__(self, model_type, n_estimators=20, n_candidates=200, n_neighbors=10):
        self.lshf = LSHForest(n_estimators=n_estimators, n_candidates=n_candidates,
                              n_neighbors=n_neighbors)
        if model_type == 'rgb_small':
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.small.rgb.deepid.model'
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.small.rgb.deepid.weight'
            self.part_func = None
            self.pic_shape = (50, 50, 3)
            self.feature_dim = 1024
        elif model_type == 'rgb_big':
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.big.rgb.deepid.weight'
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.big.rgb.deepid.model'
            self.part_func = None
            self.pic_shape = (128, 128, 3)
        elif model_type == 'rgb_small_right':
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.small.rgb.right_eye.deepid.weight'
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.small.rgb.right_eye.deepid.model'
            self.part_func = get_right_eye
            self.pic_shape = (50, 50, 3)
        elif model_type == 'rgb_small_left':
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.small.rgb.left_eye.deepid.weight'
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.small.rgb.left_eye.deepid.model'
            self.part_func = get_left_eye
            self.pic_shape = (50, 50, 3)
        elif model_type == 'rgb_small_nose':
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.all.rgb.nose.deepid.weight'
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.all_data.all.rgb.nose.deepid.model'
            self.part_func = get_nose
            self.pic_shape = (50, 50, 3)
        elif model_type == 'new_shape':
            self.deepid_model_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.new_shape.rgb.deepid.model'
            self.deepid_weight_file = '/data/liubo/face/vgg_face_dataset/model/vgg_face.small_data.new_shape.rgb.deepid.weight'
            self.pic_shape = (156, 124, 3)
            self.feature_dim = 256
            self.part_func = None
        self.model, self.get_Conv_FeatureMap = load_deepid_model(self.deepid_model_file,
                                                                 self.deepid_weight_file)
        self.all_label = None
        self.all_feature_data = None

    def extract_pic_feature(self, pic_data, batch_size=128, feature_dim=1024):
        pic_feature = np.zeros(shape=(pic_data.shape[0], feature_dim))
        batch_num = pic_data.shape[0] / batch_size
        for index in range(batch_num):
            # pic_feature[index*batch_size:(index+1)*batch_size, :] = \
            #     self.get_Conv_FeatureMap([pic_data[index*batch_size:(index+1)*batch_size], 0])[0]
            pic_feature[index*batch_size:(index+1)*batch_size, :] = \
                self.get_Conv_FeatureMap([np.transpose(pic_data[index*batch_size:(index+1)*batch_size],
                                                       (0, 3, 1, 2)), 0])[0]
        if batch_num * batch_size < pic_data.shape[0]:
            # pic_feature[batch_num*batch_size:, :] = \
            #     self.get_Conv_FeatureMap([pic_data[batch_num*batch_size:], 0])[0]
            pic_feature[batch_num*batch_size:, :] = \
                self.get_Conv_FeatureMap([np.transpose(pic_data[batch_num*batch_size:],
                                                       (0, 3, 1, 2)), 0])[0]
        return pic_feature

    def train_all_data(self, vgg_folder, person_num=100, batch_person_num=20, pic_num=10):
        # Add the first pic_num pictures of each person to the LSH Forest;
        # the remaining pictures are used to measure accuracy.
        for index in range(0 + train_person_start_index,
                           person_num + train_person_start_index,
                           batch_person_num):
            if index == 0 + train_person_start_index:
                pic_data, all_label = load_batch_train_data(vgg_folder, shape=self.pic_shape,
                                                            start_person_index=index, pic_num=pic_num,
                                                            batch_num=batch_person_num, is_train=True,
                                                            part_func=self.part_func)
                all_data_feature = self.extract_pic_feature(pic_data, feature_dim=self.feature_dim)
                self.lshf.fit(all_data_feature, all_label)
            else:
                pic_data, this_label = load_batch_train_data(vgg_folder, start_person_index=index,
                                                             pic_num=pic_num, shape=self.pic_shape,
                                                             batch_num=batch_person_num, is_train=True,
                                                             part_func=self.part_func)
                all_label = np.row_stack((np.reshape(all_label, (all_label.shape[0], 1)),
                                          np.reshape(this_label, (this_label.shape[0], 1))))
                pic_data_feature = self.extract_pic_feature(pic_data, feature_dim=self.feature_dim)
                all_data_feature = np.row_stack((pic_data_feature, all_data_feature))
                self.lshf.partial_fit(pic_data_feature, this_label)
        self.all_label = all_label
        self.all_feature_data = all_data_feature
        logging.info(' '.join(map(str, ['self.all_label.shape :', self.all_label.shape])))

    def partical_fit(self, pic_data, this_label):
        '''
        Incremental training; the batch is small, so it is fed in directly.
        :param data:
        :param label:
        :return:
        '''
        pic_data_feature = self.extract_pic_feature(pic_data, feature_dim=self.feature_dim)
        self.lshf.partial_fit(pic_data_feature, this_label)
        self.all_label = np.row_stack((np.reshape(self.all_label, (self.all_label.shape[0], 1)),
                                       np.reshape(this_label, (this_label.shape[0], 1))))

    def find_k_neighbors(self, pic_data):
        pic_data_feature = self.extract_pic_feature(pic_data, feature_dim=self.feature_dim)
        distances, indices = self.lshf.kneighbors(pic_data_feature, n_neighbors=1)
        predict_label = self.all_label[indices][:, 0, 0]
        return predict_label

    def valid_model(self, vgg_folder, person_num=100, batch_person_num=20, pic_num=10, topK_acc=1):
        # The first pictures of each person went into the LSH Forest;
        # the held-out pictures are used here to measure accuracy.
        right_num = 0
        wrong_num = 0
        clf = cPickle.load(open(clf_model_file, 'rb'))
        for index in range(0 + train_person_start_index,
                           person_num + train_person_start_index,
                           batch_person_num):
            pic_data, all_label = load_batch_train_data(vgg_folder, start_person_index=index,
                                                        pic_num=pic_num, shape=self.pic_shape,
                                                        batch_num=batch_person_num, is_train=False,
                                                        part_func=self.part_func)
            pic_data_feature = self.extract_pic_feature(pic_data, feature_dim=self.feature_dim)
            distances, indices = self.lshf.kneighbors(pic_data_feature, n_neighbors=10)
            train_data = self.all_feature_data[indices]
            predict_label = self.all_label[indices][:, 0, 0]
            for label_index in range(len(predict_label)):
                this_predict_data = np.abs(train_data[0] - pic_data_feature[0])
                this_result = clf.predict_proba(this_predict_data)
                print this_result
                # pdb.set_trace()
                if all_label[label_index] in self.all_label[indices][:, :, 0][label_index][:topK_acc]:
                    right_num += 1
                else:
                    wrong_num += 1
        acc = right_num * 1.0 / (right_num + wrong_num)
        logging.info(' '.join(map(str, ['model_type :', model_type, 'person_num :', person_num,
                                        'pic_num :', pic_num, 'acc :', acc])))
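A hypothetical driver for the Search class above. The dataset folder is a placeholder, and train_person_start_index, clf_model_file and model_type are assumed to be defined at module level, as the methods above expect.

if __name__ == '__main__':
    searcher = Search(model_type='new_shape', n_estimators=20,
                      n_candidates=200, n_neighbors=10)
    # Index the first 10 pictures of each of 100 people ...
    searcher.train_all_data(vgg_folder='/data/liubo/face/vgg_face_dataset/pictures',
                            person_num=100, pic_num=10)
    # ... then score top-1 accuracy on the held-out pictures.
    searcher.valid_model(vgg_folder='/data/liubo/face/vgg_face_dataset/pictures',
                         person_num=100, pic_num=10, topK_acc=1)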
'''
LSHash (locality-sensitive hashing)
https://blog.csdn.net/sinat_26917383/article/details/70243066
'''
from sklearn.neighbors import LSHForest

# X_train = [[5, 5, 99], [21, 5, 5], [1, 1, 1]]
# X_train1 = [[8, 9, 1], [6, 10, 2]]
# X_test = [[9, 1, 6], [3, 1, 10], [7, 10, 3]]
X_train = [[1, 1], [2, 3], [3, 2]]
X_test = [[3, 3]]

lshf = LSHForest(random_state=42)
lshf.fit(X_train)
# lshf.partial_fit(X_train1)

distances, indices = lshf.kneighbors(X_test, n_neighbors=2)
print('distances', distances)
print('indices', indices)

X_train1 = [[1, 1], [3, 2], [3, 3]]
lshf.partial_fit(X_train1)
distances1, indices1 = lshf.kneighbors(X_test, n_neighbors=2)
# lshf.partial_fit(X_test)
print('distances1', distances1)
print('indices1', indices1)
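LSHForest indexes only the vectors, so code that needs a label per neighbor keeps a parallel label list that grows in the same order as fit/partial_fit; the FaceRecognition and ClassifierLSHForest examples below use exactly this pattern. A minimal sketch with made-up labels for the script above:

# Labels live outside the forest and are appended in insertion order.
y_train = ['a', 'b', 'c']    # aligned with X_train
y_train += ['d', 'e', 'f']   # aligned with X_train1 passed to partial_fit
neighbor_labels = [y_train[i] for i in indices1[0]]
print('neighbor labels', neighbor_labels)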
class FaceRecognition(): def __init__(self): self.unknown = '' self.same_person_num = 1 self.has_cal_dist = [] self.NeighbourNum = 10 # 如果管理员加载图片, 把图片放到all_pic_data_folder下指定人的目录(图片文件和特征文件的文件名相同) self.all_pic_feature_data_folder = '/data/liubo/face/research_feature_self' # 研究院的模型直接存储特征 # 保存图片可以方便以后查看效果, 方便前端显示, 也方便管理员进行标注 self.all_pic_data_folder = '/data/liubo/face/research_self' if not os.path.exists(self.all_pic_data_folder): os.makedirs(self.all_pic_data_folder) if not os.path.exists(self.all_pic_feature_data_folder): os.makedirs(self.all_pic_feature_data_folder) self.n_neighbors = 10 self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors) self.all_labels = [] self.all_pic_feature = [] self.same_pic_id = 2 self.must_be_same_id = 1 self.must_be_not_same_id = 0 self.maybe_same_id = 3 self.new_person_str = 'new_person_' self.current_new_person_id = self.find_current_new_person_id() self.must_same_str = '_Must_Same' self.maybe_same_str = '_Maybe_same' self.load_time = time.time() self.user_count = {} self.upper_threshold = upper_verif_threshold self.lower_threshold = lower_verif_threshold self.same_pic_threshold = same_pic_threshold self.trans_dic = { self.same_pic_id: 'same_pic', self.must_be_same_id: 'must_same_id', self.must_be_not_same_id: 'must_not_same_id', self.maybe_same_id: 'maybe_same_id' } self.nearest = deque(maxlen=nearest_num) self.verification_same_person = 0 def cal_nearest_sim(self, current_feature): nearest_sim_list = [] current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a') try: length = len(self.nearest) for k in range(length): try: person_name, pre_feature = self.nearest[k] # 不在考虑时间, 只考虑图片的相似度 this_sim = pw.cosine_similarity( np.reshape(np.asarray(pre_feature), (1, len(pre_feature))), np.reshape(np.asarray(current_feature), (1, len(current_feature)))) nearest_sim_list.append( (this_sim, verification_model.predict(this_sim), person_name)) except: log_file.write('cal_nearest_sim error' + '\n') traceback.print_exc() continue return nearest_sim_list except: traceback.print_exc() return nearest_sim_list def load_train_data(self, data_folder): # 直接读取图片特征, 返回所有特征和label all_pic_feature = [] all_label = [] person_list = os.listdir(data_folder) for person in person_list: if person == self.unknown or self.must_same_str in person or self.maybe_same_str in person: continue person_path = os.path.join(data_folder, person) pic_feature_list = os.listdir(person_path) for pic_feature_path in pic_feature_list: pic_feature = msgpack_numpy.load( open(os.path.join(person_path, pic_feature_path), 'rb')) all_pic_feature.append(pic_feature) all_label.append(person) all_pic_feature = np.asarray(all_pic_feature) all_label = np.asarray(all_label) return all_pic_feature, all_label def find_current_new_person_id(self): current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a') old_person_id = [] # 保存的是原始图片 person_list = os.listdir(self.all_pic_data_folder) for person in person_list: if person.startswith(self.new_person_str): tmp = person[len(self.new_person_str):].split('_') if len(tmp) > 0: this_id = int(tmp[0]) old_person_id.append(this_id) if len(old_person_id) == 0: current_new_person_id = 0 else: current_new_person_id = max(old_person_id) + 1 log_file.write('\t'.join( map(str, ['current_new_person_id :', current_new_person_id])) + '\n') log_file.close() return current_new_person_id def extract_pic_feature(self, pic_path): try: result = extract_feature_from_binary_data(open(pic_path, 
'rb')) if result == None: return face_num, all_frames, all_feature = result biggest_face_index = find_big_face(all_frames) pic_frame = all_frames[biggest_face_index] pic_feature = all_feature[biggest_face_index] x, y, width, height = pic_frame face_pic = cv2.imread(pic_path)[y:y + width, x:x + height, :] return face_pic, pic_feature except: traceback.print_exc() return None def load_all_data(self): # 将以前标记的数据全部读入(直接读入的是特征), 用LSH Forest保存,方便计算距离 current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a') try: all_pic_feature, all_label = self.load_train_data( self.all_pic_feature_data_folder) train_label = np.asarray(all_label) if len(all_pic_feature) == len( train_label) and len(train_label) > 0: start = time.time() self.lshf.fit(all_pic_feature, train_label) self.all_pic_feature = list(all_pic_feature) self.all_labels = list(train_label) end = time.time() self.load_time = end self.user_count = Counter(self.all_labels) log_file.write('\t'.join( map(str, [ self.user_count, 'fit all data time :', (end - start) ])) + '\n') log_file.close() except: traceback.print_exc() log_file.close() return def save_pic_feature(self, pic_path, person_name): # 将已经存在的文件生成特征并保存到指定文件夹下, 用于管理员加入新的图片(加入新的图片后, 提取特征, 保存到指定文件夹) person_pic_folder_path = os.path.join(self.all_pic_data_folder, person_name) person_feature_folder_path = os.path.join( self.all_pic_feature_data_folder, person_name) if not os.path.exists(person_pic_folder_path): os.makedirs(person_pic_folder_path) if not os.path.exists(person_feature_folder_path): os.makedirs(person_feature_folder_path) pic_name = os.path.split(pic_path)[-1] # 特征文件 person_feature_path = os.path.join(person_feature_folder_path, pic_name) # 人脸文件 person_pic_path = os.path.join(person_pic_folder_path, pic_name) result = extract_feature_from_binary_data(open(pic_path, 'rb')) if result == None: return face_num, all_frames, all_feature = result biggest_face_index = find_big_face(all_frames) pic_frame = all_frames[biggest_face_index] pic_feature = all_feature[biggest_face_index] x, y, width, height = pic_frame face_pic = cv2.imread(pic_path)[y:y + width, x:x + height, :] cv2.imwrite(person_pic_path, face_pic) msgpack_numpy.dump(pic_feature, open(person_feature_path, 'wb')) def add_all_new_pic(self): ''' 将从上次加载数据到当前新增的文件都加载到LSH Forest(有可能是新增加一个人,还有可能是对已有的人增加新图片) 遍历文件夹(self.all_pic_feature_data_folder), 根据文件的时间判断是否需要加入该图片的特征 系统在管理员标注图片后, 将人脸图片和特征文件同时进行移动, 所以现在只需要将特征和对应的label加入LSH就可以了 ''' current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a') start = time.time() person_list = os.listdir(self.all_pic_data_folder) add_num = 0 for person in person_list: if self.must_same_str in person or self.maybe_same_str in person or self.new_person_str in person: continue person_path = os.path.join(self.all_pic_data_folder, person) if not os.path.isdir(person_path): continue pic_list = os.listdir(person_path) for pic in pic_list: pic_path = os.path.join(person_path, pic) last_modify_time = os.stat(pic_path).st_atime if last_modify_time > self.load_time: request = { "label": person, "request_type": 'add', "one_pic_feature": pic_path } url = "http://127.0.0.1:%d/" % port result = image_request(request, url) try: add_flag = json.loads(result)["add"] if not add_flag: # 加载失败 log_file.write('\t'.join( map(str, ['no add file :', pic_path])) + '\n') else: add_num += 1 except: log_file.write( '\t'.join(map(str, ['no add file :', pic_path])) + '\n') traceback.print_exc() continue add_num += 1 end = time.time() if add_num > 0: 
self.load_time = end log_file.write( '\t'.join(map(str, ['self.load_time', self.load_time])) + '\n') log_file.write('\t'.join( map(str, [ 'add pic num :', add_num, 'Dynamic increase time :', (end - start) ])) + '\n') log_file.close() else: log_file.close() def add_one_new_pic(self, pic_path, label): current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a') try: # 读入数据时已经转换成需要的尺寸 result = self.extract_pic_feature(pic_path) if result == None: return False face_pic, pic_feature = result self.add_one_pic(pic_feature, label) pic_name = os.path.split(pic_path)[1] this_person_pic_folder = os.path.join(self.all_pic_data_folder, label) this_person_feature_folder = os.path.join( self.all_pic_feature_data_folder, label) if not os.path.exists(this_person_pic_folder): os.makedirs(this_person_pic_folder) if not os.path.exists(this_person_feature_folder): os.makedirs(this_person_feature_folder) # 直接存储图片对应的特征, 同时保存图片文件 this_pic_feature_name = os.path.join(this_person_feature_folder, pic_name + '.p') msgpack_numpy.dump(pic_feature, open(this_pic_feature_name, 'wb')) this_pic_face_name = os.path.join(this_person_pic_folder, pic_name + '.jpg') cv2.imwrite(this_pic_face_name, face_pic) log_file.write( '\t'.join(map(str, [pic_path, this_pic_face_name])) + '\n') return True except: traceback.print_exc() return False def add_one_pic(self, one_pic_feature, pic_label): ''' 将一个图像的特征加入到LSH Forest,同时将对应的标签加入到self.all_labels :param pic_feature: array shape :(1,1024) :param pic_label: (1,) :return: ''' one_pic_feature = np.asarray(one_pic_feature) self.lshf.partial_fit(one_pic_feature.reshape(1, FEATURE_DIM), pic_label) self.all_labels.append(pic_label) self.all_pic_feature.append( np.reshape(one_pic_feature, newshape=(1, one_pic_feature.size))) def find_k_neighbors_with_lsh(self, one_pic_feature): ''' :param one_pic_feature: 图像特征 :return: 需要返回neighbors的特征,用于计算pariwise ''' try: one_pic_feature = np.asarray(one_pic_feature) tmp = self.lshf.kneighbors(one_pic_feature.reshape(1, FEATURE_DIM), n_neighbors=self.n_neighbors, return_distance=True) neighbors_label = np.asarray(self.all_labels)[tmp[1][0]] neighbors_feature = np.asarray(self.all_pic_feature)[tmp[1][0]] pair_score_list = [] cos_sim_list = [] for index in range(len(neighbors_feature)): pair_score = pw.cosine_similarity( neighbors_feature[index].reshape(1, FEATURE_DIM), one_pic_feature.reshape(1, FEATURE_DIM))[0][0] cos_sim_list.append(pair_score) pair_score_list.append(verification_model.predict(pair_score)) result = zip(cos_sim_list, pair_score_list, neighbors_label) # result = self.filter_result(result) # result.sort(key=lambda x:x[0], reverse=True) return result except: return None def filter_result(self, result): ''' :param result: [(cos_sim, same_person_result, label), (cos_sim, same_person_result, label), (cos_sim, same_person_result, label)] 按cos_sim降序排列 :return: this_id(Must_same, Must_not_same, May_same), this_label(人名) ''' # 分值相同的, 将new_person的删去 tmp_dic = {} for element in result: try: this_score, this_same_person_result, this_label = element this_score = float(this_score) if this_score in tmp_dic: if self.new_person_str in this_label: continue else: tmp_dic[this_score] = element else: tmp_dic[this_score] = element except: traceback.print_exc() continue result = tmp_dic.values() return result def evaluate_result(self, result): ''' :param result: [(cos_sim, same_person_result, label), (cos_sim, same_person_result, label), (cos_sim, same_person_result, label)] :return: this_id(Must_same, Must_not_same, May_same), 
this_label(人名) ''' for index, element in enumerate(result): this_score, this_same_person_result, this_label = element if this_same_person_result == self.verification_same_person and this_score > self.same_pic_threshold: return self.same_pic_id, this_label if this_same_person_result == self.verification_same_person and this_score > self.upper_threshold: return self.must_be_same_id, this_label if this_same_person_result == self.verification_same_person and this_score > self.lower_threshold: return self.maybe_same_id, this_label return self.must_be_not_same_id, '' def recognize_online_cluster(self, image, image_id): ''' :param image: 将得到的图片进行识别,加入的LSH Forest,根据距离计算proba(不同的距离对应不同的准确率,根据已有的dist计算阈值); 和已经设定的阈值判断是不是一个新出现的人,确定是原来已有的人,还是不确定是原来已有的人 :return: ''' start = time.time() need_add = False need_save = False current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day + '.txt'), 'a') log_file.write( '\t'.join(map(str, ["receive image", image_id, time.time()])) + '\n') feature_str = '' try: image = base64.decodestring(image) image = zlib.decompress(image) im = cv2.imdecode(np.fromstring(image, dtype=np.uint8), 1) log_file.write( '\t'.join(map(str, ['shape :', im.shape[0], im.shape[1]])) + '\n') # 图片尺寸过滤 if im.shape[0] < size_threshold or im.shape[1] < size_threshold: log_file.write('\t'.join( map(str, [ 'stat recognize_time :', (time.time() - start), 'small_size' ])) + '\n') log_file.close() return self.unknown, 1.0, feature_str, need_save # 清晰度过滤 blur_sign, blur_var = is_blur(cv2.resize(im, (96, 96))) if blur_sign: log_file.write('\t'.join( map(str, [ 'stat recognize_time :', (time.time() - start), 'blur_filter', blur_var ])) + '\n') log_file.close() return self.unknown, 1.0, feature_str, need_save # 保存传过来的图片 # img_file = '/tmp/research_face/%s.jpg' %image_id time_slot = get_time_slot(image_id) if time_slot == None: time_slot = 'error' time_slot_dir = os.path.join(tmp_face_dir, time_slot) if not os.path.exists(time_slot_dir): os.makedirs(time_slot_dir) img_file = os.path.join(time_slot_dir, image_id + '.jpg') cv2.imwrite(img_file, im) except: traceback.print_exc() log_file.close() return self.unknown, 1.0, feature_str, need_save try: # 流程 : 找距离最近的图片 ; 计算prob ; 在线聚类 ; 加入LSH Forest result = self.extract_pic_feature(img_file) if result == None: log_file.write('\t'.join( map(str, [ 'stat not_find_face', 'time :', (time.time() - start) ])) + '\n') log_file.close() return self.unknown, 1.0, feature_str, need_save face_pic, im_feature = result try: # nearest_sim_list的格式和dist_label_list的格式一样,这样可以将两个list合并,一起计算(这样不用考虑时间的因素) # 在识别出人名后将人名和feature放入到self.nearest nearest_sim_list = self.cal_nearest_sim( current_feature=im_feature) except: traceback.print_exc() nearest_sim_list = [] log_file.write('\t'.join( map(str, ['nearest_sim_list :', map(str, nearest_sim_list)])) + '\n') feature_str = base64.b64encode(msgpack_numpy.dumps(im_feature)) log_file.write('\t'.join( map(str, ['extract_feature_time :', (time.time() - start)])) + '\n') # 找距离最近的图片 --- 用LSH Forest 找出最近的10张图片,然后分别计算距离 tmp_list = self.find_k_neighbors_with_lsh(im_feature) nearest_sim_list.sort(key=lambda x: x[0], reverse=True) nearest_sim_list.extend(tmp_list) dist_label_list = nearest_sim_list[:] # 计算 log_file.write('\t'.join( map(str, ['dist_label_list :', map(str, dist_label_list)])) + '\n') if dist_label_list == None: this_id = self.must_be_not_same_id this_label = self.new_person_str + str( self.current_new_person_id) else: # 计算prob --- 根据距离计算prob this_id, this_label = self.evaluate_result(dist_label_list) # 不管概率, 
都要将最新的一张图片加入到self.nearest self.nearest.append((this_label, im_feature)) log_file.write( '\t'.join(map(str, ['self.nearest :', map(str, self.nearest)])) + '\n') # 在线聚类 --- 根据dist确定是重新增加一个人还是加入到已有的人中 if this_id == self.same_pic_id: need_add = False elif this_id == self.must_be_same_id: need_add = False need_save = True this_person_pic_folder = os.path.join( self.all_pic_data_folder, this_label + self.must_same_str) this_person_feature_folder = os.path.join( self.all_pic_feature_data_folder, this_label + self.must_same_str) elif this_id == self.must_be_not_same_id: this_label = self.new_person_str + str( self.current_new_person_id) self.current_new_person_id += 1 this_person_pic_folder = os.path.join(self.all_pic_data_folder, this_label) this_person_feature_folder = os.path.join( self.all_pic_feature_data_folder, this_label) need_add = True need_save = True elif this_id == self.maybe_same_id: this_person_pic_folder = os.path.join( self.all_pic_data_folder, this_label + self.maybe_same_str) this_person_feature_folder = os.path.join( self.all_pic_feature_data_folder, this_label + self.maybe_same_str) need_add = False # prob在灰度区域的不如入,其余情况加入 need_save = True else: log_file.write('\t'.join(map(str, ['error para :', this_id])) + '\n') if need_save: try: if not os.path.exists(this_person_pic_folder): os.makedirs(this_person_pic_folder) if not os.path.exists(this_person_feature_folder): os.makedirs(this_person_feature_folder) # 直接存储图片对应的特征, 同时保存图片文件 this_pic_feature_name = os.path.join( this_person_feature_folder, image_id + '.p') msgpack_numpy.dump(im_feature, open(this_pic_feature_name, 'wb')) this_pic_face_name = os.path.join(this_person_pic_folder, image_id + '.jpg') cv2.imwrite(this_pic_face_name, face_pic) except: traceback.print_exc() return self.unknown, 1.0, feature_str, False # 加入LSH Forest --- partial_fit if need_add: self.add_one_pic(im_feature, this_label) # 根据label和image_id可以存生成文件名,确定是否要存储文件[可以选择在服务器和本地同时存储] if this_id == self.same_pic_id or this_id == self.must_be_not_same_id or this_id == self.must_be_same_id: end = time.time() log_file.write('\t'.join( map(str, [ 'stat recognize_time :', (end - start), 'this_id :', self.trans_dic.get(this_id) ])) + '\n') log_file.close() need_save = True return this_label.replace(self.must_same_str, ''), str( dist_label_list[0][0]), str(feature_str), str(need_save) else: # 灰度区域,不显示人名 end = time.time() log_file.write( '\t'.join(map(str, ['stat gray_area :', (end - start)])) + '\n') log_file.close() return self.unknown, str( dist_label_list[0][0]), str(feature_str), str(False) except: traceback.print_exc() log_file.close() return self.unknown, str(100.0), str(feature_str), str(False)
import time
import numpy as np
from sklearn.datasets.samples_generator import make_blobs
from sklearn.neighbors import LSHForest
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import random

X_train = [[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1], [6, 10, 2]]
X_test = [[9, 1, 6], [3, 1, 10], [7, 10, 3]]

dim = 3000
lshf = LSHForest(random_state=42, n_estimators=65, n_candidates=200, n_neighbors=10)
X_train = [[random.randint(0, 4) for k in range(dim)] for i in range(50)]

for j in range(1000):
    X_test = [[random.randint(0, 4) for k in range(dim)] for i in range(1)]
    lshf.partial_fit(X_test)
    if j % 50 == 0:
        print(j)

distances, indices = lshf.kneighbors(X_test, n_neighbors=33)
print(distances, indices)
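NearestNeighbors is imported above but never used; one way to sanity-check the approximate result is to compare it against brute-force cosine neighbours over the same accumulated data, read back here from the forest's private _fit_X attribute (the attribute the test_partial_fit example inspects). This is an illustrative addition, not part of the original benchmark.

# Exact cosine neighbours for the same query, for comparison with LSHForest.
exact = NearestNeighbors(n_neighbors=33, algorithm='brute', metric='cosine')
exact.fit(lshf._fit_X)
exact_distances, exact_indices = exact.kneighbors(X_test)
print(exact_distances, exact_indices)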
from sklearn.base import ClassifierMixin
from sklearn.neighbors import LSHForest
from sklearn.utils.class_weight import compute_class_weight
import numpy as np


class ClassifierLSHForest(ClassifierMixin, LSHForest):
    """
    KNeighborsClassifier-like estimator with a partial_fit method for online
    learning. Memory-based classifier. Wrapper around LSHForest.
    """

    def __init__(self, n_estimators=10, radius=1.0, n_candidates=50, n_neighbors=5,
                 min_hash_match=4, radius_cutoff_ratio=0.9, random_state=None,
                 class_weights=None):
        self.lshf_ = LSHForest(n_estimators=n_estimators, radius=radius,
                               n_candidates=n_candidates, n_neighbors=n_neighbors,
                               min_hash_match=min_hash_match,
                               radius_cutoff_ratio=radius_cutoff_ratio,
                               random_state=random_state)
        self.y_ = None
        self.classes_ = list()
        self.class_weights_ = class_weights

    def fit(self, X, y):
        self.y_ = y
        self.classes_ = np.unique(y).tolist()
        self.lshf_.fit(X)
        print 'fitted'
        return self

    def partial_fit(self, X, y, *args, **kwargs):
        if self.y_ is None:
            self.y_ = y
        else:
            self.y_ = np.concatenate((self.y_, y))
        print self.y_.shape
        for yi in y:
            if yi not in self.classes_:
                self.classes_.append(yi)
        self.lshf_.partial_fit(X)
        return self

    def _kernel(self, x):
        return np.exp(-x)

    def _get_class_weights(self):
        return compute_class_weight(self.class_weights_, self.classes_, self.y_)

    def _compute_weights(self, X):
        dists, neighbors = self.lshf_.kneighbors(X, return_distance=True)
        result = np.zeros((neighbors.shape[0], len(self.classes_)))
        for i in xrange(neighbors.shape[0]):
            for cl_index, cl in enumerate(self.classes_):
                result[i, cl_index] = self._kernel(
                    dists[i][self.y_[neighbors[i]] == cl]).sum()
        if self.class_weights_ is not None:
            result *= self._get_class_weights()
        return result

    def predict(self, X):
        weights = self._compute_weights(X)
        max_indices = np.argmax(weights, axis=1)
        # Map the winning column index of each row back to its class label.
        return np.asarray([self.classes_[i] for i in max_indices])

    def predict_proba(self, X):
        weights = self._compute_weights(X)
        normalizer = weights.sum(axis=1)
        normalizer[normalizer == 0.0] = 1.0
        weights /= normalizer[:, np.newaxis]
        return weights
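A hypothetical smoke test for ClassifierLSHForest on random data; the shapes, class counts and random seed are made up for illustration.

import numpy as np

rng = np.random.RandomState(0)
clf = ClassifierLSHForest(n_estimators=10, n_neighbors=3, random_state=0)
clf.fit(rng.rand(30, 8), rng.randint(0, 3, size=30))          # initial batch
clf.partial_fit(rng.rand(10, 8), rng.randint(0, 3, size=10))  # online update
print(clf.predict(rng.rand(2, 8)))
print(clf.predict_proba(rng.rand(2, 8)))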
1.0, self.has_save_pic_feature, need_add else: # 使用重新检测并对对齐的人脸进行识别 im = align_face_img # 对检测到的人脸重新进行模糊检测 blur_result = is_blur(im) blur_sign, blur_var = blur_result if blur_sign: log_file.write('\t'.join(map(str, ['stat', 'blur_filter', blur_var, image_id]))+'\n') log_file.close() return self.unknown, 1.0, self.has_save_pic_feature, need_add need_process = self.check_face_img(im, image_id) if not need_process: log_file.write('\t'.join(map(str, ['stat', 'pose_filter', blur_var, image_id])) + '\n') log_file.close() return self.unknown, 1.0, self.has_save_pic_feature, need_add im = cv2.resize(im, (PIC_SHAPE[1], PIC_SHAPE[2]), interpolation=cv2.INTER_LINEAR) im = im[:, :, ::-1]*1.0 im = im - avg im = im.transpose((2, 0, 1)) im = im[None, :] except: traceback.print_exc() return self.unknown, 1.0, self.has_save_pic_feature, need_add try: # 流程 : 找距离最近的图片 ; 计算prob ; 在线聚类 ; 加入LSH Forest im_feature = extract_feature_from_numpy(im) try: # nearest_sim_list的格式和dist_label_list的格式一样,这样可以将两个list合并,一起计算(这样不用考虑时间的因素) # 在识别出人名后将人名和feature放入到self.nearest nearest_sim_list = self.cal_nearest_sim(current_feature=im_feature) except: traceback.print_exc() nearest_sim_list = [] log_file.write('\t'.join(map(str, ['nearest_sim_list :', map(str, nearest_sim_list)])) + '\n') # 找距离最近的图片 --- 用LSH Forest 找出最近的10张图片,然后分别计算距离 dist_label_list = self.find_k_neighbors_with_lsh(im_feature) dist_label_list.extend(nearest_sim_list) dist_label_list = self.filter_result(dist_label_list) dist_label_list.sort(key=lambda x: x[0], reverse=True) # 计算 if dist_label_list == None: this_id = self.must_be_not_same_id this_label = self.new_person_str + str(self.current_new_person_id) else: # 计算prob --- 根据距离计算prob this_id, this_label = self.evaluate_result(dist_label_list) # 在线聚类 --- 根据dist确定是重新增加一个人还是加入到已有的人中 log_file.write('\t'.join(map(str, ['stat', 'recognize_id', blur_var, this_id])) + '\n') if dist_label_list != None and len(dist_label_list) > 0: log_file.write('\t'.join(map(str, ['dist_label_list :', map(str, dist_label_list)])) + '\n') need_save = False if this_id == self.same_pic_id: need_add = False elif this_id == self.must_be_same_id: need_add = False need_save = True this_person_folder = os.path.join(self.all_pic_data_folder, this_label+self.must_same_str) elif this_id == self.must_be_not_same_id: this_label = self.new_person_str + str(self.current_new_person_id) self.current_new_person_id += 1 this_person_folder = os.path.join(self.all_pic_data_folder, this_label) need_add = True need_save = True elif this_id == self.maybe_same_id: this_person_folder = os.path.join(self.all_pic_data_folder, this_label+self.maybe_same_str) need_add = False # prob在灰度区域的不如入,其余情况加入 need_save = True else: log_file.write('\t'.join(map(str, ['error para :', this_id])) + '\n') if need_save: try: if not os.path.exists(this_person_folder): os.makedirs(this_person_folder) os.chmod(this_person_folder, stat.S_IRWXG + stat.S_IRWXO + stat.S_IRWXU) this_pic_name = os.path.join(this_person_folder, image_id+'.png') imsave(this_pic_name, np.transpose(im[0], (1, 2, 0))) except: traceback.print_exc() return self.unknown, 1.0, has_save_num, False # 加入LSH Forest --- partial_fit if need_add: self.add_one_pic(im_feature, this_label) has_save_num += 1 # 根据label和image_id可以存生成文件名,确定是否要存储文件[可以选择在服务器和本地同时存储] if this_id == self.same_pic_id or this_id == self.must_be_not_same_id or this_id == self.must_be_same_id: end = time.time() log_file.write('\t'.join(map(str, ['stat recognize_time :', (end - start), 'this_id :', self.trans_dic.get(this_id)])) + '\n') log_file.close() 
return this_label.replace(self.must_same_str, ''), \ str(dist_label_list[0][0]), str(has_save_num), str(need_add) else: # 灰度区域,不显示人名 end = time.time() log_file.write('\t'.join(map(str, ['gray area recog time :',(end - start)])) + '\n') log_file.close() # return this_label.replace(self.maybe_same_str, ''), \ # str(dist_label_list[0][0]), str(has_save_num), str(need_add) return self.unknown, str(dist_label_list[0][0]), str(has_save_num), str(need_add) except: traceback.print_exc() log_file.close() return self.unknown, str(100.0), str(has_save_num), str(False)
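# The recognition flow above reduces to a three-band threshold test on cosine
# similarity (evaluate_result). The standalone sketch below mirrors that logic;
# the threshold defaults are placeholders rather than the production
# same_pic_threshold / upper_verif_threshold / lower_verif_threshold values,
# and the id constants simply copy the ones set in FaceRecognition.__init__.
SAME_PIC_ID, MUST_BE_SAME_ID, MAYBE_SAME_ID, MUST_BE_NOT_SAME_ID = 2, 1, 3, 0

def evaluate_by_thresholds(result, same_pic_threshold=0.95,
                           upper_threshold=0.85, lower_threshold=0.70):
    # result: [(cos_sim, same_person_result, label), ...] sorted by cos_sim descending;
    # same_person_result == 0 means the verification model predicts "same person".
    for cos_sim, same_person_result, label in result:
        if same_person_result != 0:
            continue
        if cos_sim > same_pic_threshold:
            return SAME_PIC_ID, label
        if cos_sim > upper_threshold:
            return MUST_BE_SAME_ID, label
        if cos_sim > lower_threshold:
            return MAYBE_SAME_ID, label
    return MUST_BE_NOT_SAME_ID, ''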
class Analysis(): def __init__(self, conf): self.unknown = '' self.n_neighbors = 5 self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors) self.all_labels = [] self.all_pic_feature = [] self.same_pic_id = 2 self.must_be_same_id = 1 self.must_be_not_same_id = 0 self.maybe_same_id = 3 self.new_person_str = 'new_person_' self.current_new_person_id = self.find_current_new_person_id() self.must_same_str = '_Must_Same' self.maybe_same_str = '_Maybe_same' self.user_count = {} self.nearest_num = 5 # 只保存最近15秒的图片 self.nearest_time_threshold = 15 self.feature_url = conf.feature_url # 以后调模型时不在修改最后一个卷积层的维度 self.feature_dim = conf.feature_dim # 每次更换模型的时候需要修改这两个参数 self.same_pic_threshold = conf.same_pic_threshold self.upper_threshold = conf.upper_threshold self.lower_threshold = conf.lower_threshold self.pitch_threshold = 20 self.yaw_threshold = 20 self.roll_threshold = 20 self.max_dist_threshold = 100 # [(label, feature),...,(label, feature)] self.nearest = deque(maxlen=self.nearest_num) self.trans_dic = {self.must_be_same_id: 'must_same_id', self.same_pic_id: 'same_pic', self.must_be_not_same_id: 'not_same_id', self.maybe_same_id: 'maybe_same_id'} self.all_feature_label_file = conf.all_feature_label_file self.log_dir = conf.log_dir if not os.path.exists(self.log_dir): os.makedirs(self.log_dir) self.tmp_jpg_file = 'tmp.jpg' self.model_label = conf.model_label def find_current_new_person_id(self): all_id_name = get_all_name() current_new_person_id = -1 for id_name in all_id_name: name = id_name[1] if name.startswith(self.new_person_str): current_new_person_id = max(current_new_person_id, int(name.replace(self.new_person_str, ''))) current_new_person_id = current_new_person_id + 1 return current_new_person_id def cal_nearest_sim(self, current_time, current_feature): nearest_sim_list = [] try: length = len(self.nearest) for k in range(length): this_label, pre_feature, pre_time = self.nearest[k] if current_time - pre_time > self.nearest_time_threshold: continue this_sim = pw.cosine_similarity(pre_feature, current_feature) nearest_sim_list.append((this_sim, this_label)) return nearest_sim_list except: traceback.print_exc() return nearest_sim_list def extract_pic_feature(self, face_array): ''' # 传入半身照片,得到人脸照片(必须要做检测,因为有可能会更新检测模型,导致识别不准) # 用于人工添加图片加到LSHForest # 仍然使用人脸识别的接口, 解析得到的特征 :param face_array: 人脸图片(numpy格式) :return:face_frame, feature(numpy格式) ''' try: cv2.imwrite(self.tmp_jpg_file, face_array) result = requests.post(self.feature_url, open(self.tmp_jpg_file, 'rb').read()) if result.status_code == 200: try: content = result.content tmp = content.split('\n') if len(tmp) < 3: return None, None face_num = int(tmp[0].split(':')[1]) if face_num == 1: frame = map(float, tmp[1].split(',')) feature = map(float, tmp[2].split(',')[:-1]) if np.sum(feature) == 0: print 'filter' return None, None return frame, feature except: traceback.print_exc() return None, None else: return None, None except: traceback.print_exc() return None, None def load_all_data(self): # 将以前标记的数据全部读入,用LSH Forest保存,方便计算距离 # 使用半身照进行检测和识别(输入图片,得到content,解析content得到feature) current_day = get_current_day() log_file = open(os.path.join(self.log_dir, current_day + '.txt'), 'a') if not os.path.exists(self.all_feature_label_file): return start = time.time() # 从数据库中得到半身照和人名 half_pic_name_list = get_all_annotate_half() for element in half_pic_name_list: image, name = element im = cv2.imdecode(np.fromstring(base64.decodestring(image), dtype=np.uint8), 1) tmp_1 = self.extract_pic_feature(im) if tmp_1 == None: continue 
face_frame, im_feature = tmp_1 if im_feature == None or face_frame == None: continue if np.sum(im_feature) == 0: print im.shape, name, 'blur' continue print im.shape, name im_feature = list(im_feature) # type(im_feature) < type 'list' > ; len(im_feature) 256 this_label = name self.all_pic_feature.append(im_feature) self.all_labels.append(this_label) self.lshf.partial_fit(im_feature, this_label) end = time.time() self.user_count = Counter(self.all_labels) current_time = get_current_time() log_file.write('\t'.join(map(str, [current_time, self.user_count, 'fit all data time :', (end - start)])) + '\n') log_file.close() def add_one_pic(self, one_pic_feature, pic_label): ''' 将一个图像的特征加入到LSH Forest,同时将对应的标签加入到self.all_labels :param pic_feature: array shape :(1,512) :param pic_label: (1,) :return: ''' self.lshf.partial_fit(one_pic_feature.reshape(1, self.feature_dim), pic_label) self.all_labels.append(pic_label) self.all_pic_feature.append(np.reshape(one_pic_feature, newshape=(1, one_pic_feature.size))) def add_all_new_pic(self): ''' 遍历数据库(将修改过的数据加入LSHForest) 一分钟一次(避免频繁查数据库, 也不会造成太大的延迟) 使用研究院的模型时, 只能先保存特征, 直接移动特征(在数据库中加一列) ''' current_day = get_current_day() log_file = open(os.path.join(self.log_dir, current_day + '.txt'), 'a') start = time.time() add_num = 0 all_new_pic_name = get_all_new_face() for feature_str, person_name in all_new_pic_name: face_feature = np.reshape(msgpack_numpy.loads(base64.b64decode(feature_str)), (1, self.feature_dim)) self.add_one_pic(face_feature, person_name) add_num += 1 if add_num > 0: end = time.time() current_time = get_current_time() log_file.write('\t'.join(map(str, [current_time, 'add_pic_num :', add_num, 'Dynamic_increase_time :', (end - start)])) + '\n') log_file.close() else: log_file.close() def find_k_neighbors_with_lsh(self, one_pic_feature): ''' :param one_pic_feature: 图像特征 :return: 需要返回neighbors的特征, 用于计算pariwise ''' try: one_pic_feature = one_pic_feature.reshape(1, self.feature_dim) tmp = self.lshf.kneighbors(one_pic_feature, n_neighbors=self.n_neighbors, return_distance=True) neighbors_label = np.asarray(self.all_labels)[tmp[1][0]] neighbors_feature = np.asarray(self.all_pic_feature)[tmp[1][0]] cos_sim_list = [] for index in range(len(neighbors_feature)): pair_score = pw.cosine_similarity(neighbors_feature[index], one_pic_feature)[0][0] cos_sim_list.append(pair_score) result = zip(cos_sim_list, neighbors_label) result = self.filter_result(result) result.sort(key=lambda x: x[0], reverse=True) return result except: traceback.print_exc() return None def filter_result(self, result): ''' :param result: [(cos_sim, label), (cos_sim, label), (cos_sim, label)] 按cos_sim降序排列 :return: this_id(Must_same, Must_not_same, May_same), this_label(人名) ''' # 分值相同的, 将new_person的删去 tmp_dic = {} for element in result: this_score, this_label = element if this_score in tmp_dic: if self.new_person_str in this_label: continue else: tmp_dic[this_score] = element else: tmp_dic[this_score] = element result = tmp_dic.values() return result def evaluate_result(self, result): ''' :param result: [(cos_sim, same_person_result, label), (cos_sim, same_person_result, label), (cos_sim, same_person_result, label)] 程序中只根据cos_sim做判断, 不在使用same_person_result :return: this_id(Must_same, Must_not_same, May_same), this_label(人名) ''' for index, element in enumerate(result): this_score, this_label = element if this_score > self.same_pic_threshold: return self.same_pic_id, this_label if this_score > self.upper_threshold: return self.must_be_same_id, this_label if this_score > self.lower_threshold: 
return self.maybe_same_id, this_label return self.must_be_not_same_id, '' def recognize_one_feature(self, im_feature, image_id): ''' 根据特征确定label :param image_id : 大图的文件名+face_id(第几个人脸) --- 方便定位 ''' start = time.time() feature_str = base64.b64encode(msgpack_numpy.dumps(im_feature)) # im_feature = msgpack_numpy.loads(base64.b64decode(feature_str)) current_day = get_current_day() log_file = open(os.path.join(self.log_dir, current_day + '.txt'), 'a') current_time = get_current_time() log_file.write('\t'.join(map(str, [current_time, "receive image", image_id])) + '\n') try: # 流程 : 找距离最近的图片 ; 计算prob ; 在线聚类 ; 加入LSH Forest try: current_time = float(image_id) nearest_sim_list = self.cal_nearest_sim(current_time=current_time, current_feature=im_feature) # print 'current_time :', current_time, 'nearest_sim_list :', nearest_sim_list except: traceback.print_exc() nearest_sim_list = [] # 找距离最近的图片 --- 用LSH Forest 找出最近的10张图片,然后分别计算距离 dist_label_list = self.find_k_neighbors_with_lsh(im_feature) dist_label_list.extend(nearest_sim_list) dist_label_list = self.filter_result(dist_label_list) dist_label_list.sort(key=lambda x: x[0], reverse=True) # 计算 if dist_label_list == None: # 不考虑new_person的情况,小于阈值的都判断为new_person this_id = self.must_be_not_same_id this_label = 'new_person' # this_id = self.must_be_not_same_id # this_label = self.new_person_str + str(self.current_new_person_id) else: # 计算prob --- 根据距离计算prob this_id, this_label = self.evaluate_result(dist_label_list) # 在线聚类 --- 根据dist确定是重新增加一个人还是加入到已有的人中 if dist_label_list != None and len(dist_label_list) > 0: current_time = get_current_time() log_file.write('\t'.join(map(str, [current_time, 'dist_label_list :', map(str, dist_label_list)])) + '\n') # need_add 决定是否加入LSHForest ; need_save决定是否存入数据库 if this_id == self.same_pic_id: need_add = False need_save = True elif this_id == self.must_be_same_id: need_add = False need_save = True elif this_id == self.must_be_not_same_id: # 现在的版本不用加入新人, 不能识别的全部返回new_person this_label = 'new_person' need_save = True need_add = False # this_label = self.new_person_str + str(self.current_new_person_id) # self.current_new_person_id += 1 # need_add = True # need_save = True elif this_id == self.maybe_same_id: need_add = False need_save = False else: current_time = get_current_time() log_file.write('\t'.join(map(str, [current_time, 'error para :', this_id])) + '\n') return self.unknown, str(self.max_dist_threshold), feature_str, str(False) self.nearest.append((this_label, im_feature, image_id)) # 现在不在增加new_person # # 加入LSH Forest --- partial_fit # if need_add: # # 只将新人的图片加入LSHForest并保存到文件 # self.add_one_pic(im_feature, this_label) # write_start = time.time() # tmp_file = open(self.all_feature_label_file, 'a') # tmp_file.write(base64.b64encode(msgpack_numpy.dumps((im_feature, this_label)))+'\n') # tmp_file.close() # print 'write time :', (time.time() - write_start) # # 根据label和image_id可以存生成文件名,确定是否要存储文件[可以选择在服务器和本地同时存储] # 统计有多少图片在gray area log_file.write('\t'.join(map(str, ['stat', 'recognize_id', self.trans_dic[this_id], 'recog time :', (time.time() - start)])) + '\n') log_file.close() if this_id == self.same_pic_id or this_id == self.must_be_not_same_id or this_id == self.must_be_same_id: if this_label == None or dist_label_list == None: # 数据库里可能一个人也没有, 这时this_label = None return self.unknown, str(self.max_dist_threshold), feature_str, str(False) else: return this_label.replace(self.must_same_str, ''), str(dist_label_list[0][0]), feature_str, str(need_save) else: # 灰度区域,不显示人名 # return this_label.replace(self.maybe_same_str, ''), 
tr(dist_label_list[0][0]), str(has_save_num), str(need_add) return self.unknown, str(dist_label_list[0][0]), feature_str, str(need_save) except: traceback.print_exc() log_file.close() return self.unknown, str(self.max_dist_threshold), feature_str, str(False) def recognize_online_cluster(self, content, image_id): ''' 该程序不需要存储图片, 只需要将标志返回就可以 增加过滤, :param content: 检测识别返回的结果 :return: ''' tmp = content.split('\n') print 'len(tmp) :', len(tmp) if len(tmp) < 3: return None face_num = int(tmp[0].split(':')[1]) all_frames = [] all_recognize_result = [] for k in range(face_num): frame = map(float, tmp[2 * k + 1].split(',')) feature = np.reshape(np.asarray(map(float, tmp[2 * k + 2].split(',')[:-1])), (1, self.feature_dim)) person_name, score, has_save_pic_feature, need_save = self.recognize_one_feature(feature, image_id) all_recognize_result.append((person_name, score, has_save_pic_feature, need_save)) all_frames.append(frame) return zip(all_frames, all_recognize_result) def offline_add(self, folder): # 线下自己将文件夹中的数据导入(每个图片以label命名) pic_list = os.listdir(folder) pic_info = [] for pic in pic_list[:]: print 'pic :', pic label = pic.split('.')[0] label = label.decode('gbk').encode('utf-8') pic_path = os.path.join(folder, pic) img_array = cv2.imread(pic_path) try: tmp = self.extract_pic_feature(img_array) if tmp == None: continue face_frame, im_feature = tmp if face_frame == None or im_feature == None: continue except: traceback.print_exc() continue x, y, w, h = face_frame face = img_array[int(y):int(y + h), int(x):int(x + w), :] algorithm = self.model_label face_str = base64.encodestring(cv2.imencode('.jpg', face)[1].tostring()) img_str = base64.encodestring(cv2.imencode('.jpg', img_array)[1].tostring()) # tmp_array = cv2.imdecode(np.fromstring(base64.decodestring(img_str), dtype=np.uint8), 1) # cv2.imwrite(str(time.time())+'.jpg', tmp_array) pic_info.append((label, algorithm, face_str, img_str)) insert_pic_list(pic_info)
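# Analysis.find_k_neighbors_with_lsh above and the FaceRecognition variants share
# one retrieval pattern: pull approximate candidates from LSHForest.kneighbors,
# then re-rank them with exact cosine similarity. A minimal self-contained sketch
# of that pattern; the feature dimension, gallery size and label names are
# illustrative, and it assumes a scikit-learn version that still ships LSHForest.
import numpy as np
from sklearn.neighbors import LSHForest
from sklearn.metrics import pairwise as pw

feature_dim = 256            # illustrative; the class above reads conf.feature_dim
lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5)

gallery_features = np.random.rand(50, feature_dim)
gallery_labels = np.asarray(['person_%d' % (i % 5) for i in range(50)])
lshf.fit(gallery_features)

query = np.random.rand(1, feature_dim)
_, neighbor_idx = lshf.kneighbors(query, n_neighbors=5, return_distance=True)

# Exact cosine similarity on the approximate candidates, highest score first.
candidates = gallery_features[neighbor_idx[0]]
cos_sim = pw.cosine_similarity(candidates, query).ravel()
ranked = sorted(zip(cos_sim, gallery_labels[neighbor_idx[0]]), reverse=True)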
class FaceRecognition(): def __init__(self): self.unknown = '' self.same_person_num = 1 self.has_cal_dist = [] self.NeighbourNum = 10 # 如果管理员加载图片, 把图片放到all_pic_data_folder下指定人的目录(图片文件和特征文件的文件名相同) self.all_pic_feature_data_folder = '/data/liubo/face/research_feature_self' # 研究院的模型直接存储特征 # 保存图片可以方便以后查看效果, 方便前端显示, 也方便管理员进行标注 self.all_pic_data_folder = '/data/liubo/face/research_self' if not os.path.exists(self.all_pic_data_folder): os.makedirs(self.all_pic_data_folder) if not os.path.exists(self.all_pic_feature_data_folder): os.makedirs(self.all_pic_feature_data_folder) self.n_neighbors = 10 self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=self.n_neighbors) self.all_labels = [] self.all_pic_feature = [] self.same_pic_id = 2 self.must_be_same_id = 1 self.must_be_not_same_id = 0 self.maybe_same_id = 3 self.new_person_str = 'new_person_' self.current_new_person_id = self.find_current_new_person_id() self.must_same_str = '_Must_Same' self.maybe_same_str = '_Maybe_same' self.load_time = time.time() self.user_count = {} self.upper_threshold = upper_verif_threshold self.lower_threshold = lower_verif_threshold self.same_pic_threshold = same_pic_threshold self.trans_dic = {self.same_pic_id: 'same_pic', self.must_be_same_id: 'must_same_id', self.must_be_not_same_id: 'must_not_same_id', self.maybe_same_id: 'maybe_same_id'} self.nearest = deque(maxlen=nearest_num) self.verification_same_person = 0 def cal_nearest_sim(self, current_feature): nearest_sim_list = [] current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a') try: length = len(self.nearest) for k in range(length): try: person_name, pre_feature = self.nearest[k] # 不在考虑时间, 只考虑图片的相似度 this_sim = pw.cosine_similarity(np.reshape(np.asarray(pre_feature), (1, len(pre_feature))), np.reshape(np.asarray(current_feature), (1, len(current_feature)))) nearest_sim_list.append((this_sim, verification_model.predict(this_sim), person_name)) except: log_file.write('cal_nearest_sim error'+'\n') traceback.print_exc() continue return nearest_sim_list except: traceback.print_exc() return nearest_sim_list def load_train_data(self, data_folder): # 直接读取图片特征, 返回所有特征和label all_pic_feature = [] all_label = [] person_list = os.listdir(data_folder) for person in person_list: if person == self.unknown or self.must_same_str in person or self.maybe_same_str in person: continue person_path = os.path.join(data_folder, person) pic_feature_list = os.listdir(person_path) for pic_feature_path in pic_feature_list: pic_feature = msgpack_numpy.load(open(os.path.join(person_path, pic_feature_path), 'rb')) all_pic_feature.append(pic_feature) all_label.append(person) all_pic_feature = np.asarray(all_pic_feature) all_label = np.asarray(all_label) return all_pic_feature, all_label def find_current_new_person_id(self): current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a') old_person_id = [] # 保存的是原始图片 person_list = os.listdir(self.all_pic_data_folder) for person in person_list: if person.startswith(self.new_person_str): tmp = person[len(self.new_person_str):].split('_') if len(tmp) > 0: this_id = int(tmp[0]) old_person_id.append(this_id) if len(old_person_id) == 0: current_new_person_id = 0 else: current_new_person_id = max(old_person_id) + 1 log_file.write('\t'.join(map(str, ['current_new_person_id :', current_new_person_id]))+'\n') log_file.close() return current_new_person_id def extract_pic_feature(self, pic_path): try: result = extract_feature_from_binary_data(open(pic_path, 'rb')) if 
result == None: return face_num, all_frames, all_feature = result biggest_face_index = find_big_face(all_frames) pic_frame = all_frames[biggest_face_index] pic_feature = all_feature[biggest_face_index] x, y, width, height = pic_frame face_pic = cv2.imread(pic_path)[y:y+width, x:x+height, :] return face_pic, pic_feature except: traceback.print_exc() return None def load_all_data(self): # 将以前标记的数据全部读入(直接读入的是特征), 用LSH Forest保存,方便计算距离 current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a') try: all_pic_feature, all_label = self.load_train_data(self.all_pic_feature_data_folder) train_label = np.asarray(all_label) if len(all_pic_feature) == len(train_label) and len(train_label) > 0: start = time.time() self.lshf.fit(all_pic_feature, train_label) self.all_pic_feature = list(all_pic_feature) self.all_labels = list(train_label) end = time.time() self.load_time = end self.user_count = Counter(self.all_labels) log_file.write('\t'.join(map(str, [self.user_count, 'fit all data time :', (end - start)]))+'\n') log_file.close() except: traceback.print_exc() log_file.close() return def save_pic_feature(self, pic_path, person_name): # 将已经存在的文件生成特征并保存到指定文件夹下, 用于管理员加入新的图片(加入新的图片后, 提取特征, 保存到指定文件夹) person_pic_folder_path = os.path.join(self.all_pic_data_folder, person_name) person_feature_folder_path = os.path.join(self.all_pic_feature_data_folder, person_name) if not os.path.exists(person_pic_folder_path): os.makedirs(person_pic_folder_path) if not os.path.exists(person_feature_folder_path): os.makedirs(person_feature_folder_path) pic_name = os.path.split(pic_path)[-1] # 特征文件 person_feature_path = os.path.join(person_feature_folder_path, pic_name) # 人脸文件 person_pic_path = os.path.join(person_pic_folder_path, pic_name) result = extract_feature_from_binary_data(open(pic_path, 'rb')) if result == None: return face_num, all_frames, all_feature = result biggest_face_index = find_big_face(all_frames) pic_frame = all_frames[biggest_face_index] pic_feature = all_feature[biggest_face_index] x, y, width, height = pic_frame face_pic = cv2.imread(pic_path)[y:y+width, x:x+height, :] cv2.imwrite(person_pic_path, face_pic) msgpack_numpy.dump(pic_feature, open(person_feature_path, 'wb')) def add_all_new_pic(self): ''' 将从上次加载数据到当前新增的文件都加载到LSH Forest(有可能是新增加一个人,还有可能是对已有的人增加新图片) 遍历文件夹(self.all_pic_feature_data_folder), 根据文件的时间判断是否需要加入该图片的特征 系统在管理员标注图片后, 将人脸图片和特征文件同时进行移动, 所以现在只需要将特征和对应的label加入LSH就可以了 ''' current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a') start = time.time() person_list = os.listdir(self.all_pic_data_folder) add_num = 0 for person in person_list: if self.must_same_str in person or self.maybe_same_str in person or self.new_person_str in person: continue person_path = os.path.join(self.all_pic_data_folder, person) if not os.path.isdir(person_path): continue pic_list = os.listdir(person_path) for pic in pic_list: pic_path = os.path.join(person_path, pic) last_modify_time = os.stat(pic_path).st_atime if last_modify_time > self.load_time: request = { "label": person, "request_type": 'add', "one_pic_feature": pic_path } url = "http://127.0.0.1:%d/"%port result = image_request(request, url) try: add_flag = json.loads(result)["add"] if not add_flag: # 加载失败 log_file.write('\t'.join(map(str, ['no add file :', pic_path]))+'\n') else: add_num += 1 except: log_file.write('\t'.join(map(str, ['no add file :', pic_path]))+'\n') traceback.print_exc() continue add_num += 1 end = time.time() if add_num > 0: self.load_time = end 
log_file.write('\t'.join(map(str, ['self.load_time', self.load_time]))+'\n') log_file.write('\t'.join(map(str, ['add pic num :', add_num, 'Dynamic increase time :', (end - start)]))+'\n') log_file.close() else: log_file.close() def add_one_new_pic(self, pic_path, label): current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a') try: # 读入数据时已经转换成需要的尺寸 result = self.extract_pic_feature(pic_path) if result == None: return False face_pic, pic_feature = result self.add_one_pic(pic_feature, label) pic_name = os.path.split(pic_path)[1] this_person_pic_folder = os.path.join(self.all_pic_data_folder, label) this_person_feature_folder = os.path.join(self.all_pic_feature_data_folder, label) if not os.path.exists(this_person_pic_folder): os.makedirs(this_person_pic_folder) if not os.path.exists(this_person_feature_folder): os.makedirs(this_person_feature_folder) # 直接存储图片对应的特征, 同时保存图片文件 this_pic_feature_name = os.path.join(this_person_feature_folder, pic_name + '.p') msgpack_numpy.dump(pic_feature, open(this_pic_feature_name, 'wb')) this_pic_face_name = os.path.join(this_person_pic_folder, pic_name + '.jpg') cv2.imwrite(this_pic_face_name, face_pic) log_file.write('\t'.join(map(str, [pic_path, this_pic_face_name]))+'\n') return True except: traceback.print_exc() return False def add_one_pic(self, one_pic_feature, pic_label): ''' 将一个图像的特征加入到LSH Forest,同时将对应的标签加入到self.all_labels :param pic_feature: array shape :(1,1024) :param pic_label: (1,) :return: ''' one_pic_feature = np.asarray(one_pic_feature) self.lshf.partial_fit(one_pic_feature.reshape(1, FEATURE_DIM), pic_label) self.all_labels.append(pic_label) self.all_pic_feature.append(np.reshape(one_pic_feature, newshape=(1, one_pic_feature.size))) def find_k_neighbors_with_lsh(self, one_pic_feature): ''' :param one_pic_feature: 图像特征 :return: 需要返回neighbors的特征,用于计算pariwise ''' try: one_pic_feature = np.asarray(one_pic_feature) tmp = self.lshf.kneighbors(one_pic_feature.reshape(1, FEATURE_DIM), n_neighbors=self.n_neighbors, return_distance=True) neighbors_label = np.asarray(self.all_labels)[tmp[1][0]] neighbors_feature = np.asarray(self.all_pic_feature)[tmp[1][0]] pair_score_list = [] cos_sim_list = [] for index in range(len(neighbors_feature)): pair_score = pw.cosine_similarity(neighbors_feature[index].reshape(1, FEATURE_DIM), one_pic_feature.reshape(1, FEATURE_DIM))[0][0] cos_sim_list.append(pair_score) pair_score_list.append(verification_model.predict(pair_score)) result = zip(cos_sim_list, pair_score_list, neighbors_label) # result = self.filter_result(result) # result.sort(key=lambda x:x[0], reverse=True) return result except: return None def filter_result(self, result): ''' :param result: [(cos_sim, same_person_result, label), (cos_sim, same_person_result, label), (cos_sim, same_person_result, label)] 按cos_sim降序排列 :return: this_id(Must_same, Must_not_same, May_same), this_label(人名) ''' # 分值相同的, 将new_person的删去 tmp_dic = {} for element in result: try: this_score, this_same_person_result, this_label = element this_score = float(this_score) if this_score in tmp_dic: if self.new_person_str in this_label: continue else: tmp_dic[this_score] = element else: tmp_dic[this_score] = element except: traceback.print_exc() continue result = tmp_dic.values() return result def evaluate_result(self, result): ''' :param result: [(cos_sim, same_person_result, label), (cos_sim, same_person_result, label), (cos_sim, same_person_result, label)] :return: this_id(Must_same, Must_not_same, May_same), this_label(人名) ''' for index, element 
in enumerate(result): this_score, this_same_person_result, this_label = element if this_same_person_result == self.verification_same_person and this_score > self.same_pic_threshold: return self.same_pic_id, this_label if this_same_person_result == self.verification_same_person and this_score > self.upper_threshold: return self.must_be_same_id, this_label if this_same_person_result == self.verification_same_person and this_score > self.lower_threshold: return self.maybe_same_id, this_label return self.must_be_not_same_id, '' def recognize_online_cluster(self, image, image_id): ''' :param image: 将得到的图片进行识别,加入的LSH Forest,根据距离计算proba(不同的距离对应不同的准确率,根据已有的dist计算阈值); 和已经设定的阈值判断是不是一个新出现的人,确定是原来已有的人,还是不确定是原来已有的人 :return: ''' start = time.time() need_add = False need_save = False current_day = get_current_day() log_file = open(os.path.join(log_dir, current_day+'.txt'), 'a') log_file.write('\t'.join(map(str, ["receive image", image_id, time.time()])) + '\n') feature_str = '' try: image = base64.decodestring(image) image = zlib.decompress(image) im = cv2.imdecode(np.fromstring(image, dtype=np.uint8), 1) log_file.write('\t'.join(map(str, ['shape :', im.shape[0], im.shape[1]])) + '\n') # 图片尺寸过滤 if im.shape[0] < size_threshold or im.shape[1] < size_threshold: log_file.write('\t'.join(map(str, ['stat recognize_time :', (time.time() - start), 'small_size'])) + '\n') log_file.close() return self.unknown, 1.0, feature_str, need_save # 清晰度过滤 blur_sign, blur_var = is_blur(cv2.resize(im, (96, 96))) if blur_sign: log_file.write('\t'.join(map(str, ['stat recognize_time :', (time.time() - start), 'blur_filter', blur_var])) + '\n') log_file.close() return self.unknown, 1.0, feature_str, need_save # 保存传过来的图片 # img_file = '/tmp/research_face/%s.jpg' %image_id time_slot = get_time_slot(image_id) if time_slot == None: time_slot = 'error' time_slot_dir = os.path.join(tmp_face_dir, time_slot) if not os.path.exists(time_slot_dir): os.makedirs(time_slot_dir) img_file = os.path.join(time_slot_dir, image_id+'.jpg') cv2.imwrite(img_file, im) except: traceback.print_exc() log_file.close() return self.unknown, 1.0, feature_str, need_save try: # 流程 : 找距离最近的图片 ; 计算prob ; 在线聚类 ; 加入LSH Forest result = self.extract_pic_feature(img_file) if result == None: log_file.write('\t'.join(map(str, ['stat not_find_face', 'time :', (time.time() - start)]))+'\n') log_file.close() return self.unknown, 1.0, feature_str, need_save face_pic, im_feature = result try: # nearest_sim_list的格式和dist_label_list的格式一样,这样可以将两个list合并,一起计算(这样不用考虑时间的因素) # 在识别出人名后将人名和feature放入到self.nearest nearest_sim_list = self.cal_nearest_sim(current_feature=im_feature) except: traceback.print_exc() nearest_sim_list = [] log_file.write('\t'.join(map(str, ['nearest_sim_list :', map(str, nearest_sim_list)])) + '\n') feature_str = base64.b64encode(msgpack_numpy.dumps(im_feature)) log_file.write('\t'.join(map(str, ['extract_feature_time :', (time.time() - start)]))+'\n') # 找距离最近的图片 --- 用LSH Forest 找出最近的10张图片,然后分别计算距离 tmp_list = self.find_k_neighbors_with_lsh(im_feature) nearest_sim_list.sort(key=lambda x: x[0], reverse=True) nearest_sim_list.extend(tmp_list) dist_label_list = nearest_sim_list[:] # 计算 log_file.write('\t'.join(map(str, ['dist_label_list :', map(str, dist_label_list)])) + '\n') if dist_label_list == None: this_id = self.must_be_not_same_id this_label = self.new_person_str + str(self.current_new_person_id) else: # 计算prob --- 根据距离计算prob this_id, this_label = self.evaluate_result(dist_label_list) # 不管概率, 都要将最新的一张图片加入到self.nearest self.nearest.append((this_label, 
im_feature)) log_file.write('\t'.join(map(str, ['self.nearest :', map(str, self.nearest)])) + '\n') # 在线聚类 --- 根据dist确定是重新增加一个人还是加入到已有的人中 if this_id == self.same_pic_id: need_add = False elif this_id == self.must_be_same_id: need_add = False need_save = True this_person_pic_folder = os.path.join(self.all_pic_data_folder, this_label+self.must_same_str) this_person_feature_folder = os.path.join(self.all_pic_feature_data_folder, this_label+self.must_same_str) elif this_id == self.must_be_not_same_id: this_label = self.new_person_str + str(self.current_new_person_id) self.current_new_person_id += 1 this_person_pic_folder = os.path.join(self.all_pic_data_folder, this_label) this_person_feature_folder = os.path.join(self.all_pic_feature_data_folder, this_label) need_add = True need_save = True elif this_id == self.maybe_same_id: this_person_pic_folder = os.path.join(self.all_pic_data_folder, this_label + self.maybe_same_str) this_person_feature_folder = os.path.join(self.all_pic_feature_data_folder, this_label + self.maybe_same_str) need_add = False # prob在灰度区域的不如入,其余情况加入 need_save = True else: log_file.write('\t'.join(map(str, ['error para :', this_id]))+'\n') if need_save: try: if not os.path.exists(this_person_pic_folder): os.makedirs(this_person_pic_folder) if not os.path.exists(this_person_feature_folder): os.makedirs(this_person_feature_folder) # 直接存储图片对应的特征, 同时保存图片文件 this_pic_feature_name = os.path.join(this_person_feature_folder, image_id+'.p') msgpack_numpy.dump(im_feature, open(this_pic_feature_name, 'wb')) this_pic_face_name = os.path.join(this_person_pic_folder, image_id+'.jpg') cv2.imwrite(this_pic_face_name, face_pic) except: traceback.print_exc() return self.unknown, 1.0, feature_str, False # 加入LSH Forest --- partial_fit if need_add: self.add_one_pic(im_feature, this_label) # 根据label和image_id可以存生成文件名,确定是否要存储文件[可以选择在服务器和本地同时存储] if this_id == self.same_pic_id or this_id == self.must_be_not_same_id or this_id == self.must_be_same_id: end = time.time() log_file.write('\t'.join(map(str, ['stat recognize_time :',(end - start), 'this_id :', self.trans_dic.get(this_id)]))+'\n') log_file.close() need_save = True return this_label.replace(self.must_same_str, ''), str(dist_label_list[0][0]), str(feature_str), str(need_save) else: # 灰度区域,不显示人名 end = time.time() log_file.write('\t'.join(map(str, ['stat gray_area :',(end - start)]))+'\n') log_file.close() return self.unknown, str(dist_label_list[0][0]), str(feature_str), str(False) except: traceback.print_exc() log_file.close() return self.unknown, str(100.0), str(feature_str), str(False)
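# Hedged summary sketch of the online-clustering step shared by the recognizer
# classes above: when the best cosine match stays below the lower threshold, a new
# new_person_<id> label is minted and its feature is added to the forest with
# partial_fit. The class name, feature dimension and 0.70 threshold here are
# illustrative, and the flow is deliberately simplified (no verification model,
# no gray area handling, no logging or file storage).
import numpy as np
from sklearn.neighbors import LSHForest
from sklearn.metrics import pairwise as pw

class OnlineFaceIndex(object):
    """Simplified illustration of the add-or-match loop; not the production class."""

    def __init__(self, feature_dim=256, lower_threshold=0.70):
        self.feature_dim = feature_dim
        self.lower_threshold = lower_threshold
        self.new_person_str = 'new_person_'
        self.current_new_person_id = 0
        self.lshf = LSHForest(n_estimators=20, n_candidates=200, n_neighbors=5)
        self.all_labels = []
        self.all_pic_feature = []

    def add_one_pic(self, feature, label):
        feature = np.asarray(feature).reshape(1, self.feature_dim)
        self.lshf.partial_fit(feature, [label])
        self.all_labels.append(label)
        self.all_pic_feature.append(feature)

    def recognize(self, feature):
        feature = np.asarray(feature).reshape(1, self.feature_dim)
        if not self.all_labels:
            best_sim, best_label = -1.0, ''
        else:
            _, idx = self.lshf.kneighbors(feature, n_neighbors=1, return_distance=True)
            neighbor = self.all_pic_feature[idx[0][0]]
            best_sim = pw.cosine_similarity(neighbor, feature)[0][0]
            best_label = self.all_labels[idx[0][0]]
        if best_sim > self.lower_threshold:
            return best_label, best_sim
        # Below the threshold: treat it as a new person and index the feature.
        new_label = self.new_person_str + str(self.current_new_person_id)
        self.current_new_person_id += 1
        self.add_one_pic(feature, new_label)
        return new_label, best_sim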