Example #1
    def run(self):
        print "Master pid is :%d\n" % os.getpid()
        self.predictor.start()
        testenv = Environment(226)

        while True:
            self.global_t += 1
            if self.global_t % LOG_FREQ == 0:
                t_diff = time.time() - self.start_time
                print("{} steps in {} seconds, {} steps/h".format(
                    LOG_FREQ, t_diff, 3600 * LOG_FREQ / t_diff))
                self.start_time = time.time()

            if self.global_t % SAVE_FREQ == 0:
                self.network.save(self.global_t)

            identity, frame, reward, isover, frag_cnt, kdr = load(
                self.c2s_socket.recv(copy=False).bytes)
            if len(self.client[identity]) > 0:
                self.client[identity][-1].reward = reward

            #print 'frame received from {}'.format(identity)
            self._on_state(frame, identity)

            if isover:
                self._parse_memory(identity, 0, True)
            else:
                if len(self.client[identity]) == LOCAL_T_MAX + 1:
                    self._parse_memory(identity,
                                       self.client[identity][-1].value, False)

            if isover:
                self.network.log_eval(frag_cnt, kdr)
Example #2
def feature_trans_pca(src_pack_file, dst_pack_file):
    all_data = []

    person_feature_dic = msgpack_numpy.load(open(src_pack_file, 'rb'))
    for person_index, person in enumerate(person_feature_dic):
        feature_list = person_feature_dic.get(person)
        for index in range(len(feature_list)):
            try:
                if feature_list[index][1] is None:
                    continue
                this_feature = np.array(feature_list[index][1][0])
                all_data.append(this_feature)
            except:
                traceback.print_exc()
    all_data = np.asarray(all_data)
    pca = PCA(n_components=128)
    pca.fit(all_data)

    for person_index, person in enumerate(person_feature_dic):
        feature_list = person_feature_dic.get(person)
        for index in range(len(feature_list)):
            try:
                if feature_list[index][1] is None:
                    continue
                this_feature = np.array(feature_list[index][1][0])
                this_feature = np.reshape(this_feature, (1, this_feature.size))
                this_feature = pca.transform(this_feature)[0]
                feature_list[index][1][0] = this_feature
            except:
                traceback.print_exc()
    msgpack_numpy.dump(person_feature_dic, open(dst_pack_file, 'wb'))
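
# A minimal usage sketch (not part of the original example), assuming the source pack file holds roughly a
# {person: [(pic_name, [feature]), ...]} dict as the loop above implies; both paths are hypothetical.
src_pack_file = '/tmp/person_feature_dic.p'
dst_pack_file = '/tmp/person_feature_dic_pca128.p'
feature_trans_pca(src_pack_file, dst_pack_file)  # fits PCA on every feature and rewrites them as 128-d vectors
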
def extract_triplet_feature():
    lfw_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    new_lfw_feature_dic = {}
    model_file = '/data/liubo/face/vgg_face_model/annotate_siamese_graph.model'
    weight_file = '/data/liubo/face/vgg_face_model/annotate_siamese_graph.weight'
    model = model_from_json(open(model_file, 'r').read())
    opt = Adam()
    model.compile(optimizer=opt, loss=['categorical_crossentropy'])
    model.load_weights(weight_file)

    # pdb.set_trace()
    get_Conv_FeatureMap = K.function(
        [model.layers[2].layers[0].get_input_at(False),
         K.learning_phase()],
        [model.layers[2].layers[-1].get_output_at(False)])
    for person in lfw_feature_dic:
        # print person
        this_person_feature_list = lfw_feature_dic.get(person)
        this_person_triplet_feature_list = []
        for feature, path in this_person_feature_list:
            feature = np.reshape(feature, (1, feature.size))
            new_feature = get_Conv_FeatureMap([feature, 0])[0].copy()
            this_person_triplet_feature_list.append((new_feature, path))

        new_lfw_feature_dic[person] = this_person_triplet_feature_list
    msgpack_numpy.dump(new_lfw_feature_dic,
                       open(triplet_feature_pack_file, 'wb'))
Example #4
def split_train_valid(pack_file, train_pic_num, feature_dim):
    start = time()
    person_feature_dic = msgpack_numpy.load(open(pack_file, 'rb'))
    all_train_data = []
    all_train_label = []
    all_valid_data = []
    all_valid_label = []
    for person_index, person in enumerate(person_feature_dic):
        feature_list = person_feature_dic.get(person)
        np.random.shuffle(feature_list)
        if len(feature_list) < train_pic_num:
            continue
        else:
            for index in range(train_pic_num):
                pic_name, feature = feature_list[index]
                feature = np.asarray(feature)
                if feature.shape != (1, feature_dim):
                    continue
                all_train_data.append(feature)
                all_train_label.append(person)
            for index in range(train_pic_num, len(feature_list)):
                pic_name, feature = feature_list[index]
                feature = np.asarray(feature)
                if feature.shape != (1, feature_dim):
                    continue
                all_valid_data.append(feature)
                all_valid_label.append(person)
    all_train_data = np.asarray(all_train_data)
    all_train_label = np.asarray(all_train_label)
    all_valid_data = np.asarray(all_valid_data)
    all_valid_label = np.asarray(all_valid_label)
    return all_train_data, all_train_label, all_valid_data, all_valid_label
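
# A hedged usage sketch (not part of the original example); the pack file, per-person training count
# and feature dimensionality are illustrative values only.
train_x, train_y, valid_x, valid_y = split_train_valid(
    '/tmp/person_feature_dic.p', train_pic_num=10, feature_dim=256)
# train_x has shape (n_train, 1, 256): one (1, feature_dim) feature per kept picture;
# train_y / valid_y hold the corresponding person names.
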
Example #5
def main_distance():
    all_data = []
    all_label = []
    all_pic_path_list = []
    count = 0
    verif_path_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    for line in open(pair_file):
        if count % 100 == 0:
            print count
        count += 1
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            path1 = tmp[0]
            path2 = tmp[1]
            label = int(tmp[2])
            feature1 = verif_path_feature_dic.get(path1)
            feature2 = verif_path_feature_dic.get(path2)
            # pdb.set_trace()
            # predicts = pw.cosine_similarity(feature1, feature2)
            predicts = np.fabs(feature1-feature2)
            all_data.append(predicts)
            all_label.append(label)
            all_pic_path_list.append((path1, path2))

    data = np.asarray(all_data)
    # print data.shape
    # data = np.reshape(data, newshape=(data.shape[0], 1))
    data = np.reshape(data, newshape=(data.shape[0], data.shape[2]))
    label = np.asarray(all_label)
    print data.shape, label.shape
    msgpack_numpy.dump((data, label, all_pic_path_list), open('orl_verif_fc7_finetune_fc8.p', 'wb'))
Example #6
def word_length_stat():
    # each query word is at most 30 characters long
    word_dic = msgpack_numpy.load(
        open('/data/liubo/hotspot/all_query_dic.p', 'rb'))
    word_length_dic = {}
    for word in word_dic:
        word_length = len(word)
        word_length_dic[word_length] = word_length_dic.get(word_length, 0) + 1
    print word_length_dic
    def run(self):
        while True:
            for _ in tqdm(range(config.save_freq)):
                client_id, observations = load(
                    self.c2s_socket.recv(copy=False).bytes)
                self._on_state(client_id, observations)

            self.network.save()
            config.update()
def load_data():
    # the data, shuffled and split between train and test sets
    (data, label) = msgpack_numpy.load(open('/data/pictures_annotate_feature/annotate_data.p', 'rb'))
    digit_indices = [np.where(label == i)[0] for i in range(nb_class)]
    pairs_x, pairs_y = create_pairs(data, digit_indices)
    pairs_x = pairs_x[:10000]
    pairs_y = pairs_y[:10000]
    tr_pairs, te_pairs, tr_y, te_y = train_test_split(pairs_x, pairs_y, test_size=0.1)
    print tr_pairs.shape, te_pairs.shape, tr_y.shape, te_y.shape
    return tr_pairs, te_pairs, tr_y, te_y
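
# For reference, a hedged sketch of consuming load_data (not part of the original example);
# create_pairs and nb_class are assumed to be defined elsewhere in the same module.
tr_pairs, te_pairs, tr_y, te_y = load_data()
# tr_pairs holds (left, right) feature pairs for a siamese network; tr_y marks whether the two
# features belong to the same person (the exact encoding depends on create_pairs).
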
def create_train_valid_data(folder='/data/liubo/face/research_feature_self'):
    # train a face verification model from the features that already exist on disk
    person_list = os.listdir(folder)
    path_feature_dic = {}  #
    for person in person_list:
        person_path = os.path.join(folder, person)
        pic_feature_list = os.listdir(person_path)
        for pic_feature_path in pic_feature_list:
            pic_feature_path = os.path.join(person_path, pic_feature_path)
            pic_feature = msgpack_numpy.load(open(pic_feature_path, 'rb'))
            path_feature_dic[pic_feature_path] = pic_feature
    msgpack.dump(path_feature_dic, open('research_feature.p', 'wb'))
def feature_fusion():
    kf = KFold(n_folds=10)
    all_acc = []
    (data, label, all_pic_path_list) = msgpack_numpy.load(open('original_verif_fc7_finetune_fc8.p', 'rb'))
    error_file = 'error_pair.txt'
    f = open(error_file, 'w')
    all_pic_path_list = np.asarray(all_pic_path_list)

    for k, (train, valid) in enumerate(kf.split(data, label)):
        train_data = data[train]
        valid_data = data[valid]
        train_label = label[train]
        valid_label = label[valid]
        valid_path = all_pic_path_list[valid]

        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label,  clf.predict(valid_data))
        roc_auc = roc_auc_score(valid_label,  clf.predict(valid_data))
        # for index in range(len(valid_data)):
        #     if clf.predict(valid_data[index:index+1]) != valid_label[index]:
        #         f.write(valid_path[index][0]+'\t'+valid_path[index][1]+'\n')

        # rf_clf = RandomForestClassifier(n_estimators=500, n_jobs=15)
        # rf_clf.fit(train_data, train_label)
        # rf_predict_train_label_prob = rf_clf.predict_proba(train_data)
        # rf_predict_valid_label_prob = rf_clf.predict_proba(valid_data)
        #
        # gb_clf = GradientBoostingClassifier(learning_rate=0.05, n_estimators=500)
        # gb_clf.fit(train_data, train_label)
        # gb_predict_train_label_prob = gb_clf.predict_proba(train_data)
        # gb_predict_valid_label_prob = gb_clf.predict_proba(valid_data)
        # mf_clf = RandomForestClassifier()
        # mf_train_data = np.column_stack((rf_predict_train_label_prob, gb_predict_train_label_prob))
        # mf_valid_data = np.column_stack((rf_predict_valid_label_prob, gb_predict_valid_label_prob))
        # mf_clf.fit(mf_train_data, train_label)
        # acc = accuracy_score(valid_label, mf_clf.predict(mf_valid_data))
        # roc_auc = roc_auc_score(valid_label, mf_clf.predict(mf_valid_data))
        # acc = accuracy_score(valid_label, rf_clf.predict(valid_data))
        # roc_auc = roc_auc_score(valid_label, rf_clf.predict(valid_data))
        # acc = accuracy_score(valid_label, gb_clf.predict(valid_data))
        # roc_auc = roc_auc_score(valid_label, gb_clf.predict(valid_data))

        all_acc.append(acc)
        print acc, roc_auc

        # roc_auc = roc_auc_score(valid_label, clf.predict(valid_data))
        # print acc, roc_auc
        # cPickle.dump(clf, open('/data/liubo/face/vgg_face_dataset/model/lfw_verification_model', 'wb'))
    print 'mean :', np.mean(all_acc)
    f.close()
Example #11
def train_valid_verif_model():
    all_data = []
    all_label = []
    all_pic_path_list = []
    count = 0
    for line in open(pair_file):
        if count % 100 == 0:
            print count
        count += 1
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            path1 = tmp[0]
            path2 = tmp[1]
            if (os.path.exists(path1)) and (os.path.exists(path2)):
                feature1 = extract_feature_from_file(path1)
                feature2 = extract_feature_from_file(path2)
                predicts = pw.cosine_similarity(feature1, feature2)
                all_data.append(predicts)
                all_label.append(int(tmp[2]))
    msgpack_numpy.dump((all_data, all_label, all_pic_path_list),
                       open(feature_pack_file, 'wb'))
    (all_data, all_label,
     all_pic_path_list) = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    all_data = np.asarray(all_data)
    data = np.reshape(all_data,
                      newshape=(all_data.shape[0], all_data.shape[2]))
    label = np.asarray(all_label)
    print data.shape, label.shape

    kf = KFold(len(label), n_folds=10)
    all_acc = []
    for (train, valid) in kf:
        train_data = data[train]
        valid_data = data[valid]
        train_label = label[train]
        valid_label = label[valid]
        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        roc_auc = roc_auc_score(valid_label, clf.predict(valid_data))
        all_acc.append(acc)
        print acc, roc_auc
    print np.mean(all_acc)

    clf = LinearSVC()
    clf.fit(data, label)
    pdb.set_trace()
    cPickle.dump(clf, open(verification_model_file, 'wb'))
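
# A hedged sketch (not part of the original example) of scoring a new pair with the model saved above;
# the image paths are hypothetical, and the reshape mirrors the (n_pairs, 1) training layout built
# from the cosine similarities.
clf = cPickle.load(open(verification_model_file, 'rb'))
feature1 = extract_feature_from_file('/tmp/a.jpg')
feature2 = extract_feature_from_file('/tmp/b.jpg')
similarity = pw.cosine_similarity(feature1, feature2)      # shape (1, 1)
is_same = clf.predict(np.reshape(similarity, (1, -1)))[0]  # same label encoding as the third column of pair_file
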
Example #12
    def run(self):
        self.player = Environment(self.index * 113)
        context = zmq.Context()
        self.c2s_socket = context.socket(zmq.PUSH)
        self.c2s_socket.setsockopt(zmq.IDENTITY, self.identity)
        self.c2s_socket.connect(self.c2s)

        self.s2c_socket = context.socket(zmq.DEALER)
        self.s2c_socket.setsockopt(zmq.IDENTITY, self.identity)
        self.s2c_socket.connect(self.s2c)
        while True:
            obs = self.player.current_state()
            self.c2s_socket.send(dump((self.index, obs)), copy=False)
            if obs is not None:
                action = load(self.s2c_socket.recv(copy=False).bytes)
                self.player.action(action)
Example #13
def feature_fusion():
    kf = KFold(n_folds=10)
    all_acc = []
    (data, label, all_pic_path_list) = msgpack_numpy.load(open('orl_verif_fc7_finetune_fc8.p', 'rb'))
    error_file = 'error_pair.txt'
    f = open(error_file, 'w')
    all_pic_path_list = np.asarray(all_pic_path_list)

    for k, (train, valid) in enumerate(kf.split(data, label)):
        train_data = data[train]
        valid_data = data[valid]
        train_label = label[train]
        valid_label = label[valid]
        valid_path = all_pic_path_list[valid]

        # clf = LinearSVC()
        # clf.fit(train_data, train_label)
        # acc = accuracy_score(valid_label,  clf.predict(valid_data))
        # roc_auc = roc_auc_score(valid_label,  clf.predict(valid_data))
        # for index in range(len(valid_data)):
        #     if clf.predict(valid_data[index:index+1]) != valid_label[index]:
        #         f.write(valid_path[index][0]+'\t'+valid_path[index][1]+'\n')

        rf_clf = RandomForestClassifier(n_estimators=500, n_jobs=15)
        rf_clf.fit(train_data, train_label)
        rf_predict_train_label_prob = rf_clf.predict_proba(train_data)
        rf_predict_valid_label_prob = rf_clf.predict_proba(valid_data)
        gb_clf = GradientBoostingClassifier(learning_rate=0.05, n_estimators=500)
        gb_clf.fit(train_data, train_label)
        gb_predict_train_label_prob = gb_clf.predict_proba(train_data)
        gb_predict_valid_label_prob = gb_clf.predict_proba(valid_data)
        mf_clf = RandomForestClassifier()
        mf_train_data = np.column_stack((rf_predict_train_label_prob, gb_predict_train_label_prob))
        mf_valid_data = np.column_stack((rf_predict_valid_label_prob, gb_predict_valid_label_prob))
        mf_clf.fit(mf_train_data, train_label)
        acc = accuracy_score(valid_label, mf_clf.predict(mf_valid_data))
        roc_auc = roc_auc_score(valid_label, mf_clf.predict(mf_valid_data))

        all_acc.append(acc)
        print acc, roc_auc

        # roc_auc = roc_auc_score(valid_label, clf.predict(valid_data))
        # print acc, roc_auc
        # cPickle.dump(clf, open('/data/liubo/face/vgg_face_dataset/model/lfw_verification_model', 'wb'))
    print 'mean :', np.mean(all_acc)
    f.close()
 def load_train_data(self, data_folder):
     # read the image features directly from disk and return all features and labels
     all_pic_feature = []
     all_label = []
     person_list = os.listdir(data_folder)
     for person in person_list:
         if person == self.unknown or self.must_same_str in person or self.maybe_same_str in person:
             continue
         person_path = os.path.join(data_folder, person)
         pic_feature_list = os.listdir(person_path)
         for pic_feature_path in pic_feature_list:
             pic_feature = msgpack_numpy.load(open(os.path.join(person_path, pic_feature_path), 'rb'))
             all_pic_feature.append(pic_feature)
             all_label.append(person)
     all_pic_feature = np.asarray(all_pic_feature)
     all_label = np.asarray(all_label)
     return all_pic_feature, all_label
def train_valid_verif_model():
    all_data = []
    all_label = []
    all_pic_path_list = []
    count = 0
    for line in open(pair_file):
        if count % 100 == 0:
            print count
        count += 1
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            path1 = tmp[0]
            path2 = tmp[1]
            if (os.path.exists(path1)) and (os.path.exists(path2)):
                feature1 = extract_feature_from_file(path1)
                feature2 = extract_feature_from_file(path2)
                predicts = pw.cosine_similarity(feature1, feature2)
                all_data.append(predicts)
                all_label.append(int(tmp[2]))
    msgpack_numpy.dump((all_data, all_label, all_pic_path_list), open(feature_pack_file, 'wb'))
    (all_data, all_label, all_pic_path_list) = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    all_data = np.asarray(all_data)
    data = np.reshape(all_data, newshape=(all_data.shape[0], all_data.shape[2]))
    label = np.asarray(all_label)
    print data.shape, label.shape

    kf = KFold(len(label), n_folds=10)
    all_acc = []
    for (train, valid) in kf:
        train_data = data[train]
        valid_data = data[valid]
        train_label = label[train]
        valid_label = label[valid]
        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        roc_auc = roc_auc_score(valid_label, clf.predict(valid_data))
        all_acc.append(acc)
        print acc, roc_auc
    print np.mean(all_acc)

    clf = LinearSVC()
    clf.fit(data, label)
    pdb.set_trace()
    cPickle.dump(clf, open(verification_model_file, 'wb'))
 def load_train_data(self, data_folder):
     # read the image features directly from disk and return all features and labels
     all_pic_feature = []
     all_label = []
     person_list = os.listdir(data_folder)
     for person in person_list:
         if person == self.unknown or self.must_same_str in person or self.maybe_same_str in person:
             continue
         person_path = os.path.join(data_folder, person)
         pic_feature_list = os.listdir(person_path)
         for pic_feature_path in pic_feature_list:
             pic_feature = msgpack_numpy.load(
                 open(os.path.join(person_path, pic_feature_path), 'rb'))
             all_pic_feature.append(pic_feature)
             all_label.append(person)
     all_pic_feature = np.asarray(all_pic_feature)
     all_label = np.asarray(all_label)
     return all_pic_feature, all_label
Example #17
def feature_trans_autoencoder(src_pack_file, dst_pack_file):
    weight_file = '/data/liubo/face/annotate_face_model/skyeye_face_autoencoder.weight'
    model_file = '/data/liubo/face/annotate_face_model/skyeye_face_autoencoder.model'
    autoencoder =  model_from_json(open(model_file, 'r').read())
    autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
    autoencoder.load_weights(weight_file)
    get_Conv_FeatureMap = K.function([autoencoder.layers[0].get_input_at(False), K.learning_phase()],
                                     [autoencoder.layers[-2].get_output_at(False)])

    person_feature_dic = msgpack_numpy.load(open(src_pack_file, 'rb'))
    for person_index, person in enumerate(person_feature_dic):
        feature_list = person_feature_dic.get(person)
        for index in range(len(feature_list)):
            try:
                if feature_list[index][1] is None:
                    continue
                this_feature = np.array(feature_list[index][1][0])
                this_feature = np.reshape(this_feature, (1, this_feature.size))
                this_feature = get_Conv_FeatureMap([this_feature, 0])[0][0]
                feature_list[index][1][0] = this_feature
            except:
                traceback.print_exc()
    msgpack_numpy.dump(person_feature_dic, open(dst_pack_file, 'wb'))
Example #18
    def run(self):

        print "My pid is :%d\n" % os.getpid()

        self.player = Environment(self.index * 113)
        context = zmq.Context()
        self.c2s_socket = context.socket(zmq.PUSH)
        self.c2s_socket.setsockopt(zmq.IDENTITY, self.identity)
        self.c2s_socket.connect(self.c2s)

        self.s2c_socket = context.socket(zmq.DEALER)
        self.s2c_socket.setsockopt(zmq.IDENTITY, self.identity)
        self.s2c_socket.connect(self.s2c)
        rew, isover, frag, kdr = None, False, 0, 0

        while True:
            frame = self.player.current_state()
            self.c2s_socket.send(dump(
                (self.identity, [frame], rew, isover, frag, kdr)),
                                 copy=False)  #rew is last action's reward
            action = load(self.s2c_socket.recv(copy=False).bytes)
            rew, isover, frag, kdr = self.player.action(action)
            if isover:
                self.player.reset_stat()
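
# For orientation, a hedged sketch (not part of the original example) of the master-side sockets these
# workers imply: a PULL socket collecting the serialized (identity, frames, reward, ...) tuples and a
# ROUTER socket routing the chosen action back by worker identity. The addresses are hypothetical, and
# msgpack_numpy.packb/unpackb stand in for the dump/load helpers used in the snippets (Python 2 style,
# as in the workers above).
import zmq
import msgpack_numpy

context = zmq.Context()
c2s_socket = context.socket(zmq.PULL)    # experience flows worker -> master
c2s_socket.bind('ipc://c2s')             # hypothetical address
s2c_socket = context.socket(zmq.ROUTER)  # actions flow master -> worker, keyed by identity
s2c_socket.bind('ipc://s2c')             # hypothetical address

while True:
    identity, frames, rew, isover, frag, kdr = msgpack_numpy.unpackb(
        c2s_socket.recv(copy=False).bytes)
    action = 0  # placeholder: a real master would run the policy network here
    s2c_socket.send_multipart([identity, msgpack_numpy.packb(action)])
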
def extract_triplet_feature():
    lfw_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    new_lfw_feature_dic = {}
    model_file = '/data/liubo/face/vgg_face_model/annotate_siamese_graph.model'
    weight_file = '/data/liubo/face/vgg_face_model/annotate_siamese_graph.weight'
    model = model_from_json(open(model_file, 'r').read())
    opt = Adam()
    model.compile(optimizer=opt, loss=['categorical_crossentropy'])
    model.load_weights(weight_file)

    # pdb.set_trace()
    get_Conv_FeatureMap = K.function([model.layers[2].layers[0].get_input_at(False), K.learning_phase()],
                                     [model.layers[2].layers[-1].get_output_at(False)])
    for person in lfw_feature_dic:
        # print person
        this_person_feature_list = lfw_feature_dic.get(person)
        this_person_triplet_feature_list = []
        for feature, path in this_person_feature_list:
            feature = np.reshape(feature, (1, feature.size))
            new_feature = get_Conv_FeatureMap([feature, 0])[0].copy()
            this_person_triplet_feature_list.append((new_feature, path))

        new_lfw_feature_dic[person] = this_person_triplet_feature_list
    msgpack_numpy.dump(new_lfw_feature_dic, open(triplet_feature_pack_file, 'wb'))
Example #20
    def import_data(self, debug=False):
        '''
        method for importing and processing input data
        '''
        # Importing pickled wordvectors, dictionary, inputs and labels
        with open(self.base_path + '/wordvectors', 'rb') as vectors_file:
            print("Importing wordvectors...", end=' ', flush=True)
            word_vectors = msgpack_numpy.load(vectors_file)
            print("Done")
        with open(self.base_path + '/dictionary', 'rb') as dict_file:
            print("Importing dictionary...", end=' ', flush=True)
            dictionary = msgpack.load(dict_file, raw=False)
            print("Done")
        with open('inputs_slot_filling', 'rb') as data_inputs_file:
            print("Importing inputs...", end=' ', flush=True)
            sentences = msgpack.load(data_inputs_file, raw=False)
            print("Done")
        with open('outputs_slot_filling', 'rb') as data_outputs_file:
            print("Importing labels...", end=' ', flush=True)
            outputs = msgpack.load(data_outputs_file, raw=False)
            print("Done")


        ########################################################################################################################
        # Processing inputs
        ########################################################################################################################
        print("Modifying input sentences...")

        # importing progressbar
        bar = progressbar.ProgressBar(max_value=len(sentences), redirect_stdout=True, end=' ')
        # preassigning the inputs variable for faster processing
        data_inputs = np.zeros((len(sentences), self.n_steps), dtype=np.int32)
        # initiating all the inputs to index of zero vector (zerowordvec_idx = dictionary['zerowordvec'])
        zerowordvec_idx = dictionary['zerowordvec']
        data_inputs[:, :] = zerowordvec_idx
        # Processing inputs
        lengths = np.zeros(len(sentences), dtype=np.int32)
        i = 0
        no_words_not_found = 0
        for line in sentences:
            # Initializing an empty list of Indexes
            h = []
            # Iterating each word in the line over the dictionary and appending the indexes to a list
            for k in range(len(line)):
                try:
                    idx = dictionary[line[k]]
                except:
                    idx = zerowordvec_idx
                    no_words_not_found += 1
                    with open('words_not_found_in_dic', 'a') as f:
                        f.write(line[k] + '\n')
                # Appending the index(idx) of each word to the list h.
                h.append(idx)
            # appending the length of each line to the list lengths
            lengths[i] = len(line)
            # modify contents of the array
            data_inputs[i, :len(h)] = h
            # bar update
            bar.update(i)
            i = i + 1
        # bar finish
        bar.finish()
        # if words are not found in dictionary
        if no_words_not_found != 0:
            print('# of words not found in the dictionary (incl. repetitions) = {}'.format(no_words_not_found), end=' ', flush=True)

        # if debug print input sample to check if the input pipeline is correct
        if debug:
            print('Sample input data')
            print('=========================================================')
            print('input sentences are {}'.format(sentences[0:2]))
            print('[Vector]input sentence are {}'.format(data_inputs[0:2]))
            print('=========================================================')

        ########################################################################################################################
        # Processing labels
        ########################################################################################################################
        print("Modifying outputs...")

        # Pre assigning the data_outputs array
        data_outputs = np.zeros((self.n_examples, self.n_steps, len(self.available_slots)), dtype=np.int32)
        # Initiating all the one hot vectors to the default vector corresponding to the 'Outside' slot
        # Outside string 'O' is part of the naming convention (Begin, Inside and Outside) for the classification of words (Object, Source, etc.) in a sentence.
        # Ref: $ROS_WORKSPACE/mbot_natural_language_processing/mbot_nlu/ros/doc/pedro_thesis.pdf
        idx_outside = self.available_slots.index('O')
        data_outputs[:, :, idx_outside] = 1
        # Initiating progress bar
        bar = progressbar.ProgressBar(max_value=len(outputs), redirect_stdout=True, end=' ')
        # Index for line wise iteration
        v = 0
        # Process outputs
        for line in outputs:
            # Index for word wise iteration
            w = 0
            for output in line:
                # find slot if it exists in available slots list
                try:
                    idx_found = self.available_slots.index(output)
                    # print('index found is ' + str(idx_found))
                except ValueError:
                    raise Exception('Could not find this output = {}  in this sentence = {} in the available list of slots'.format(output, sentences[outputs.index(line)]))
                # modify array
                data_outputs[v][w][idx_outside] = 0
                data_outputs[v][w][idx_found] = 1
                w = w + 1
            # Incrementing line index
            v = v + 1
            # Progress bar update
            bar.update(v)
        # Progress bar finished
        bar.finish()

        # debug printing
        if debug:
            print('Sample output data')
            print('=========================================================')
            print('output labels are {}'.format(outputs[0:2]))
            print('[Vector]output labels are {}'.format(data_outputs[0:2]))
            print('=========================================================')

        return word_vectors, data_inputs, data_outputs, lengths
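
# A hedged sketch (not part of the original example) of how the returned arrays line up; nlu is assumed
# to be an instance of the class this method belongs to.
word_vectors, data_inputs, data_outputs, lengths = nlu.import_data(debug=True)
# data_inputs:  (n_sentences, n_steps) int32 word indices, padded with dictionary['zerowordvec']
# data_outputs: (n_examples, n_steps, n_slots) one-hot slot labels, defaulting to the 'O' (Outside) slot
# lengths:      (n_sentences,) true sentence lengths, used to mask the padded time steps
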
Example #21
def train_valid_verif_model():
    all_data = []
    all_label = []
    all_pic_path_list = []
    count = 0
    path_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    not_in = 0
    not_in_pair = {}
    for line in open(pair_file):
        if count % 100 == 0:
            print count
        count += 1
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            path1 = tmp[0]
            path2 = tmp[1]
            label = int(tmp[2])
            if path1 in path_feature_dic and path2 in path_feature_dic:
                try:
                    feature1 = np.asarray(path_feature_dic.get(path1))
                    feature2 = np.asarray(path_feature_dic.get(path2))
                    predicts = pw.cosine_similarity(feature1, feature2)
                    all_data.append(predicts)
                    all_label.append(label)
                    all_pic_path_list.append((path1, path2))
                except:
                    traceback.print_exc()
            else:
                traceback.print_exc()
    msgpack_numpy.dump((all_data, all_label, all_pic_path_list),
                       open(feature_pack_file, 'wb'))

    (all_data, all_label,
     all_pic_path_list) = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    pdb.set_trace()
    all_data = np.asarray(all_data)
    all_data = np.reshape(all_data,
                          newshape=(all_data.shape[0], all_data.shape[2]))
    all_label = np.asarray(all_label)
    all_pic_path_list = np.asarray(all_pic_path_list)
    print all_data.shape, all_label.shape

    all_acc = []

    kf = KFold(n_folds=10)
    all_acc = []
    f = open('research_verif_result.txt', 'w')
    for k, (train, valid) in enumerate(
            kf.split(all_data, all_label, all_pic_path_list)):
        train_data = all_data[train]
        valid_data = all_data[valid]
        train_label = all_label[train]
        valid_label = all_label[valid]
        train_path_list = all_pic_path_list[train]
        valid_path_list = all_pic_path_list[valid]

        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        for k in range(len(valid_path_list)):
            f.write(
                os.path.split(valid_path_list[k][0])[1] + '\t' +
                os.path.split(valid_path_list[k][1])[1] + '\t' +
                str(valid_data[k][0]) + '\t' + str(valid_label[k]) + '\n')
        all_acc.append(acc)
        print acc
    print 'mean_acc :', np.mean(all_acc)
    f.close()
    clf = LinearSVC()
    clf.fit(all_data, all_label)
    pdb.set_trace()
    cPickle.dump(clf, open(verification_model_file, 'wb'))
    parser = OptionParser()

    parser.add_option("-n", "--num_class", dest="num_class", help="classify label num")
    parser.add_option("-m", "--model_file", dest="model_file", help="model file")
    parser.add_option("-w", "--weight_file", dest="weight_file", help="weight file")
    parser.add_option("-a", "--need_augment", dest="need_augment", help="need_augment")
    parser.add_option("-l", "--train_valid_sample_list_file", dest="train_valid_sample_list_file",
                      help="train_valid_sample_list_file")

    (options, args) = parser.parse_args()

    if options.need_augment.rstrip() == 'True':
        need_augment = True
    else:
        need_augment = False

    print 'need_augment :', need_augment
    model_file = options.model_file
    weight_file = options.weight_file
    nb_classes = int(options.num_class)
    train_valid_sample_list_file = options.train_valid_sample_list_file
    if K.image_dim_ordering() == 'th':
        pic_shape = (96, 96, 3)  # input shape used by the center-loss model
    else:
        # pic_shape = (3, 96, 96)
        pic_shape = (96, 96, 3)
    (train_sample_list, valid_sample_list) = msgpack_numpy.load(open(train_valid_sample_list_file, 'rb'))
    train_valid_model(train_sample_list, valid_sample_list, pic_shape, nb_classes, model_file, weight_file)


    # person_list = os.listdir(folder)
    # all_pic_path = []
    # all_person = []
    # for person in person_list:
    #     if person == 'unknown' or person.startswith('new_person'):
    #         continue
    #     person_path = os.path.join(folder, person)
    #     pic_list = os.listdir(person_path)
    #     for pic in pic_list:
    #         pic_path = os.path.join(person_path, pic)
    #         all_pic_path.append(pic_path)
    #         all_person.append(person)
    # all_score, all_label = cal_pic_distance(all_pic_path, all_person)
    # msgpack_numpy.dump((all_score, all_label), open('all_score_label.p','wb'))
    #
    all_score, all_label = msgpack_numpy.load(open('all_score_label.p', 'rb'))
    count = Counter(all_label)
    print count
    all_score = np.reshape(np.asarray(all_score), (len(all_score), 1))
    all_label = np.asarray(all_label)
    gnb = GaussianNB()
    train_data, test_data, train_label, test_label = train_test_split(
        all_score, all_label)

    gnb.fit(train_data, train_label)
    gnb.predict_proba(test_data)
    print accuracy_score(test_label, gnb.predict(test_data))
    cPickle.dump(
        gnb, open('/data/liubo/face/vgg_face_dataset/model/dist_prob.p', 'wb'))

    pdb.set_trace()
Example #24
    model.compile(optimizer=opt, loss=[contrastive_loss])
    print 'load weights'
    model.load_weights(weight_file)

    # train_model(person_feature_list_dic, model)
    # last_acc = valid_model(valid_person_feature_list_dic, model)
    # print 'first_acc :', last_acc
    for epoch_index in range(nb_epoch):
        print('-'*40)
        print('Training ', 'current epoch :', epoch_index, 'total epochs :', nb_epoch)
        train_model(train_person_feature_list_dic, model)
        # this_acc = valid_model(valid_path_list, model, pic_shape)
        # print 'this_acc :', this_acc, 'last_acc :', last_acc
        # if this_acc > last_acc:
        #     model.save_weights(weight_file, overwrite=True)
        #     print ('save_model')
        #     last_acc = this_acc


if __name__ == '__main__':
    model_file = '/data/liubo/face/vgg_face_dataset/model/facenet.model'
    weight_file = '/data/liubo/face/vgg_face_dataset/model/facenet.weight'
    model = build_model(feature_dim=4096)
    print model.summary()
    print model.layers[2].summary()
    model.save_weights(weight_file, overwrite=True)
    open(model_file,'w').write(model.to_json())

    person_feature_list_dic = msgpack_numpy.load(open('/data/pictures_annotate_feature/person_feature_list_dic.p', 'rb'))
    train_valid_model(person_feature_list_dic, person_feature_list_dic, model_file, weight_file)
def train_valid_verif_model():
    all_data = []
    all_label = []
    all_pic_path_list = []
    count = 0
    path_feature_dic = msgpack.load(open('research_feature.p', 'rb'))
    not_in = 0
    not_in_pair = {}
    for line in open(pair_file):
        if count % 100 == 0:
            print count
        count += 1
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            path1 = tmp[0]
            path2 = tmp[1]
            label = int(tmp[2])
            if path1 in path_feature_dic and path2 in path_feature_dic:
                try:
                    feature1 = np.asarray(path_feature_dic.get(path1))
                    feature2 = np.asarray(path_feature_dic.get(path2))
                    if len(feature1) < 100 or len(feature2) < 100:
                        print path1, path2
                        not_in += 1
                        not_in_pair[(path1, path2)] = 1
                        continue
                    feature1 = np.reshape(feature1, newshape=(1, feature1.shape[0]))
                    feature2 = np.reshape(feature2, newshape=(1, feature2.shape[0]))
                    predicts = pw.cosine_similarity(feature1, feature2)
                    all_data.append(predicts)
                    all_label.append(label)
                    all_pic_path_list.append((path1, path2))
                except:
                    traceback.print_exc()
                    # pdb.set_trace()
            else:
                traceback.print_exc()
                # pdb.set_trace()
    msgpack_numpy.dump((all_data, all_label, all_pic_path_list), open(feature_pack_file, 'wb'))

    (all_data, all_label, all_pic_path_list) = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    all_data = np.asarray(all_data)
    all_data = np.reshape(all_data, newshape=(all_data.shape[0], all_data.shape[2]))
    all_label = np.asarray(all_label)
    print all_data.shape, all_label.shape


    kf = KFold(len(all_label), n_folds=10)
    all_acc = []
    for (train, valid) in kf:
        train_data = all_data[train]
        valid_data = all_data[valid]
        train_label = all_label[train]
        valid_label = all_label[valid]

        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        roc_auc = roc_auc_score(valid_label, clf.predict(valid_data))
        all_acc.append(acc)
        print acc, roc_auc
    print 'mean_acc :', np.mean(all_acc)
    clf = LinearSVC()
    clf.fit(all_data, all_label)
    cPickle.dump(clf, open(verification_model_file, 'wb'))
def main(args):

    network = importlib.import_module(args.model_def, 'inference')

    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir)
    # create the log and model directories
    if not os.path.isdir(log_dir):  # Create the log directory if it doesn't exist
        os.makedirs(log_dir)
    model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir)
    if not os.path.isdir(model_dir):  # Create the model directory if it doesn't exist
        os.makedirs(model_dir)

    # Store some git revision info in a text file in the log directory
    src_path, _ = os.path.split(os.path.realpath(__file__))
    facenet.store_revision_info(src_path, log_dir, ' '.join(sys.argv))

    np.random.seed(seed=args.seed)
    train_set = facenet.get_dataset(args.data_dir)
    nrof_classes = len(train_set)

    print('Model directory: %s' % model_dir)
    print('Log directory: %s' % log_dir)
    pretrained_model = None
    if args.pretrained_model:
        pretrained_model = os.path.expanduser(args.pretrained_model)
        print('Pre-trained model: %s' % pretrained_model)

    if args.lfw_dir:
        print('LFW directory: %s' % args.lfw_dir)
        # Read the file containing the pairs used for testing
        pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))
        # Get the paths for the corresponding images
        lfw_paths, actual_issame = lfw.get_paths(os.path.expanduser(args.lfw_dir), pairs, args.lfw_file_ext)
    if args.baihe_pack_file:
        print('load baihe dataset')
        lfw_paths, actual_issame = msgpack_numpy.load(open(args.baihe_pack_file))

    with tf.Graph().as_default():
        tf.set_random_seed(args.seed)
        # global step (number of training iterations); different stages can use different learning rates
        global_step = tf.Variable(0, trainable=False)

        # Get a list of image paths and their labels
        image_list, label_list = facenet.get_image_paths_and_labels(train_set)

        # Read data and apply label preserving distortions
        image_batch, label_batch = facenet.read_and_augment_data(image_list, label_list, args.image_size,
            args.batch_size, args.max_nrof_epochs, args.random_crop, args.random_flip, args.random_rotate,
            args.nrof_preprocess_threads)
        print('Total number of classes: %d' % nrof_classes)
        print('Total number of examples: %d' % len(image_list))

        print('Building training graph')

        # Placeholder for the learning rate
        learning_rate_placeholder = tf.placeholder(tf.float32, name='learning_rate')

        # Build the inference graph; returns the network structure
        prelogits, _ = network.inference(image_batch, args.keep_probability, phase_train=True,
                                         weight_decay=args.weight_decay)
        # initialize weights with truncated normal noise, standard deviation 0.1
        # tf.truncated_normal_initializer(stddev=0.1)
        logits = slim.fully_connected(prelogits, len(train_set), activation_fn=None,
                                      weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
                                      weights_regularizer=slim.l2_regularizer(args.weight_decay),
                                      scope='Logits', reuse=False)

        # Add DeCov regularization loss
        if args.decov_loss_factor > 0.0:
            logits_decov_loss = facenet.decov_loss(logits) * args.decov_loss_factor
            # add the DeCov loss to the tf.GraphKeys.REGULARIZATION_LOSSES collection
            tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, logits_decov_loss)

        # Add center loss (the center loss joins the collections as a regularization term)
        if args.center_loss_factor > 0.0:
            prelogits_center_loss, _ = facenet.center_loss(prelogits, label_batch, args.center_loss_alfa, nrof_classes)
            # add the center loss to the tf.GraphKeys.REGULARIZATION_LOSSES collection
            tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, prelogits_center_loss * args.center_loss_factor)

        # apply exponential decay to the learning rate
        learning_rate = tf.train.exponential_decay(learning_rate_placeholder, global_step,
            args.learning_rate_decay_epochs*args.epoch_size, args.learning_rate_decay_factor, staircase=True)
        tf.scalar_summary('learning_rate', learning_rate)

        # Calculate the average cross entropy loss across the batch
        # compute softmax and cross entropy in one op to get the final loss; this is more efficient
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits, label_batch, name='cross_entropy_per_example')
        cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
        tf.add_to_collection('losses', cross_entropy_mean)

        # Calculate the total losses
        # fetch the regularization losses
        regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = tf.add_n([cross_entropy_mean] + regularization_losses, name='total_loss')

        # Build a Graph that trains the model with one batch of examples and updates the model parameters
        train_op = facenet.train(total_loss, global_step, args.optimizer,
                                 learning_rate, args.moving_average_decay, tf.all_variables(), args.log_histograms)

        # Evaluation
        print('Building evaluation graph')
        lfw_label_list = range(0, len(lfw_paths))
        assert (len(lfw_paths) % args.lfw_batch_size == 0), \
            "The number of images in the LFW test set need to be divisible by the lfw_batch_size"
        eval_image_batch, eval_label_batch = facenet.read_and_augment_data(lfw_paths, lfw_label_list, args.image_size,
                                                                            args.lfw_batch_size, None, False, False,
                                                                            False, args.nrof_preprocess_threads,
                                                                            shuffle=False)
        # Node for input images
        eval_image_batch.set_shape((None, args.image_size, args.image_size, 3))
        eval_image_batch = tf.identity(eval_image_batch, name='input')
        eval_prelogits, _ = network.inference(eval_image_batch, 1.0,
                                              phase_train=False, weight_decay=0.0, reuse=True)
        eval_embeddings = tf.nn.l2_normalize(eval_prelogits, 1, 1e-10, name='embeddings')

        # Create a saver
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=10)
        # saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Start running operations on the Graph.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False))
        # sess.run(tf.global_variables_initializer())
        # sess.run(tf.local_variables_initializer())
        sess.run(tf.initialize_all_variables())
        sess.run(tf.initialize_local_variables())
        summary_writer = tf.train.SummaryWriter(log_dir, sess.graph)
        tf.train.start_queue_runners(sess=sess)

        # Starting the queue runners sets the queues running and returns the started threads.
        # Note that input_queue is enqueued first and dequeued later; since the enqueue inputs are placeholders,
        # the dequeue threads block until sess.run(enqueue_op) inside train() feeds values into the queue,
        # after which the dequeue ops have data and each queue starts executing.

        with sess.as_default():

            if pretrained_model:
                print('Restoring pretrained model: %s' % pretrained_model)
                saver.restore(sess, pretrained_model)

            # Training and validation loop
            print('Running training')
            epoch = 0
            while epoch < args.max_nrof_epochs:
                try:
                    step = sess.run(global_step, feed_dict=None)
                    epoch = step // args.epoch_size
                    # Train for one epoch
                    train(args, sess, epoch, learning_rate_placeholder, global_step, total_loss, train_op, summary_op,
                          summary_writer, regularization_losses, args.learning_rate_schedule_file)

                    # Save variables and the metagraph if it doesn't exist already
                    save_variables_and_metagraph(sess, saver, summary_writer, model_dir, subdir, step)

                    # Evaluate on LFW
                    if args.lfw_dir:
                        evaluate(sess, eval_embeddings, eval_label_batch, actual_issame, args.lfw_batch_size, args.seed,
                                 args.lfw_nrof_folds, log_dir, step, summary_writer)
                    # Evaluate on baihe_data
                    if args.baihe_pack_file:
                        evaluate(sess, eval_embeddings, eval_label_batch, actual_issame, args.lfw_batch_size, args.seed,
                                 args.lfw_nrof_folds, log_dir, step, summary_writer)
                except:
                    traceback.print_exc()
                    continue
    return model_dir
def find_max_min():
    # within the same person find the least similar pairs; across different people find the most similar pairs
    lfw_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    person_list = lfw_feature_dic.keys()
    same_person_score = []
    same_person_score_pair_dic = {}   # {score:[(path1,path2), ...,(path1,path2)]}
    no_same_person_score = []
    no_same_person_score_pair_dic = {}   # {score:[(path1,path2), ...,(path1,path2)]}
    heapq.heapify(same_person_score)
    pair_threshold = 3000

    for person_index, person in enumerate(person_list):
        start = time()
        path_feature_list = lfw_feature_dic.get(person)
        # enumerate every possible pair for this person --- the smaller the score the better (the least similar photos of the same person)
        # each round the largest score is dropped and a smaller one inserted; scores are negated on insertion so the heap top is the originally largest score
        length = len(path_feature_list)
        for index_i in range(length):
            for index_j in range(index_i, length):
                feature1, path1 = path_feature_list[index_i]
                feature2, path2 = path_feature_list[index_j]
                feature1 = np.reshape(feature1, newshape=(1, feature1.shape[0]))
                feature2 = np.reshape(feature2, newshape=(1, feature2.shape[0]))
                this_score = 0 - pw.cosine_similarity(feature1, feature2)[0][0]
                if len(same_person_score) > pair_threshold:
                    top_item = same_person_score[0]
                    if this_score < top_item:    # even less similar, add it
                        heapq.heappop(same_person_score)
                        heapq.heappush(same_person_score, this_score)
                        # drop the old pair and record the current one (a single score may correspond to several pairs)
                        if top_item in same_person_score_pair_dic:
                            same_person_score_pair_dic.pop(top_item)
                        pair_list = same_person_score_pair_dic.get(this_score, [])
                        pair_list.append((path1, path2))
                        same_person_score_pair_dic[this_score] = pair_list
                else:
                    heapq.heappush(same_person_score, this_score)
                    pair_list = same_person_score_pair_dic.get(this_score, [])
                    pair_list.append((path1, path2))
                    same_person_score_pair_dic[this_score] = pair_list

        # enumerate all possible dissimilar (cross-person) pairs

        for other_person_index, other_person in enumerate(person_list[person_index+1:], start=person_index+1):
            other_path_feature_list = lfw_feature_dic.get(other_person)
            if other_person == person:
                continue
            other_length = len(other_path_feature_list)
            for index_i in range(length):
                for index_j in range(other_length):
                    feature1, path1 = path_feature_list[index_i]
                    feature2, path2 = other_path_feature_list[index_j]
                    feature1 = np.reshape(feature1, newshape=(1, feature1.shape[0]))
                    feature2 = np.reshape(feature2, newshape=(1, feature2.shape[0]))
                    this_score = pw.cosine_similarity(feature1, feature2)[0][0]
                    if len(no_same_person_score) > pair_threshold:
                        top_item = no_same_person_score[0]
                        if this_score < top_item:    # even more similar, add it
                            heapq.heappop(no_same_person_score)
                            heapq.heappush(no_same_person_score, this_score)
                            # drop the old pair and record the current one (a single score may correspond to several pairs)
                            if top_item in no_same_person_score_pair_dic:
                                no_same_person_score_pair_dic.pop(top_item)
                            pair_list = no_same_person_score_pair_dic.get(this_score, [])
                            pair_list.append((path1, path2))
                            no_same_person_score_pair_dic[this_score] = pair_list
                    else:
                        heapq.heappush(no_same_person_score, this_score)
                        pair_list = no_same_person_score_pair_dic.get(this_score, [])
                        pair_list.append((path1, path2))
                        no_same_person_score_pair_dic[this_score] = pair_list
        end = time()
        print person_index, person, (end - start), length
    msgpack_numpy.dump((same_person_score_pair_dic, same_person_score, no_same_person_score_pair_dic, no_same_person_score),
                       open(new_pair_pack_file, 'wb'))
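
# A hedged sketch (not part of the original example) of unpacking the dumped result; new_pair_pack_file
# is the same variable used above, and the "hard" pairs are simply all pairs kept in the two heaps.
(same_pair_dic, same_scores, diff_pair_dic,
 diff_scores) = msgpack_numpy.load(open(new_pair_pack_file, 'rb'))
hard_positive_pairs = [p for score in same_pair_dic for p in same_pair_dic[score]]   # least similar same-person pairs
hard_negative_pairs = [p for score in diff_pair_dic for p in diff_pair_dic[score]]   # most similar cross-person pairs
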
def main_distance():
    # lfw_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    lfw_feature_dic = msgpack_numpy.load(open(triplet_feature_pack_file, 'rb'))
    data = []
    label = []
    pic_path_list = []
    for line in open(pair_file):
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            person = tmp[0]  # take two feature vectors of this person
            this_person_feature_list = lfw_feature_dic.get(person, [])
            index_list = range(len(this_person_feature_list))
            np.random.shuffle(index_list)
            filter_path(this_person_feature_list, index_list)
            if len(index_list) < 2:
                continue
            feature1, path1 = this_person_feature_list[index_list[0]]
            feature2, path2 = this_person_feature_list[index_list[1]]
            feature1 = np.reshape(feature1, newshape=(1, feature1.size))
            feature2 = np.reshape(feature2, newshape=(1, feature2.size))
            predicts = pw.cosine_similarity(feature1, feature2)
            label.append(0)
            data.append(predicts)
            pic_path_list.append('\t'.join([path1, path2]))
        elif len(tmp) == 4:
            person1 = tmp[0]
            person2 = tmp[2]
            # take one feature vector from each person
            this_person_feature_list1 = lfw_feature_dic.get(person1, [])
            this_person_feature_list2 = lfw_feature_dic.get(person2, [])
            index_list1 = range(len(this_person_feature_list1))
            index_list2 = range(len(this_person_feature_list2))
            np.random.shuffle(index_list1)
            np.random.shuffle(index_list2)
            filter_path(this_person_feature_list1, index_list1)
            filter_path(this_person_feature_list2, index_list2)
            if len(index_list1) < 1 or len(index_list2) < 1:
                continue
            index_list1 = np.arange(len(this_person_feature_list1))
            index_list2 = np.arange(len(this_person_feature_list2))
            np.random.shuffle(index_list1)
            np.random.shuffle(index_list2)
            feature1, path1 = this_person_feature_list1[index_list1[0]]
            feature2, path2 = this_person_feature_list2[index_list2[0]]
            feature1 = np.reshape(feature1, newshape=(1, feature1.size))
            feature2 = np.reshape(feature2, newshape=(1, feature2.size))
            predicts = pw.cosine_similarity(feature1, feature2)
            label.append(1)
            data.append(predicts)
            pic_path_list.append('\t'.join([path1, path2]))
    data = np.asarray(data)
    print data.shape
    data = np.reshape(data, newshape=(len(data), 1))
    label = np.asarray(label)
    print data.shape, label.shape

    kf = KFold(n_folds=10)
    all_acc = []
    for k, (train, valid) in enumerate(kf.split(data, label)):

        train_data = data[train]
        valid_data = data[valid]
        train_label = label[train]
        valid_label = label[valid]

        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        all_acc.append(acc)
        print 'acc :', acc

    print 'mean acc :', np.mean(all_acc)
Example #29
def train_valid_verif_model():
    all_data = []
    all_label = []
    all_pic_path_list = []
    count = 0
    path_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    not_in = 0
    not_in_pair = {}
    for line in open(pair_file):
        if count % 100 == 0:
            print count
        count += 1
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            path1 = tmp[0]
            path2 = tmp[1]
            label = int(tmp[2])
            if path1 in path_feature_dic and path2 in path_feature_dic:
                try:
                    feature1 = np.asarray(path_feature_dic.get(path1))
                    feature2 = np.asarray(path_feature_dic.get(path2))
                    predicts = pw.cosine_similarity(feature1, feature2)
                    all_data.append(predicts)
                    all_label.append(label)
                    all_pic_path_list.append((path1, path2))
                except:
                    traceback.print_exc()
            else:
                traceback.print_exc()
    msgpack_numpy.dump((all_data, all_label, all_pic_path_list), open(feature_pack_file, 'wb'))

    (all_data, all_label, all_pic_path_list) = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    pdb.set_trace()
    all_data = np.asarray(all_data)
    all_data = np.reshape(all_data, newshape=(all_data.shape[0], all_data.shape[2]))
    all_label = np.asarray(all_label)
    all_pic_path_list = np.asarray(all_pic_path_list)
    print all_data.shape, all_label.shape

    all_acc = []

    kf = KFold(n_folds=10)
    all_acc = []
    f = open('research_verif_result.txt', 'w')
    for k, (train, valid) in enumerate(kf.split(all_data, all_label, all_pic_path_list)):
        train_data = all_data[train]
        valid_data = all_data[valid]
        train_label = all_label[train]
        valid_label = all_label[valid]
        train_path_list = all_pic_path_list[train]
        valid_path_list = all_pic_path_list[valid]

        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        for k in range(len(valid_path_list)):
            f.write(os.path.split(valid_path_list[k][0])[1] + '\t' + os.path.split(valid_path_list[k][1])[1] +
                    '\t' + str(valid_data[k][0])+ '\t' + str(valid_label[k]) + '\n')
        all_acc.append(acc)
        print acc
    print 'mean_acc :', np.mean(all_acc)
    f.close()
    clf = LinearSVC()
    clf.fit(all_data, all_label)
    pdb.set_trace()
    cPickle.dump(clf, open(verification_model_file, 'wb'))
Пример #30
0
                this_patience = 1


def extract_feature(model_file, weight_file):
    print 'model_file :', model_file
    print 'weight_file :', weight_file
    model = model_from_json(open(model_file, 'r').read())
    model.load_weights(weight_file)
    get_Conv_FeatureMap = K.function(
        [model.layers[0].get_input_at(False),
         K.learning_phase()], [model.layers[-2].get_output_at(False)])
    return model, get_Conv_FeatureMap


if __name__ == '__main__':
    model_file = '/data/liubo/face/vgg_face_dataset/model/originalimages.model'
    weight_file = '/data/liubo/face/vgg_face_dataset/model/originalimages.weight'
    # extract_feature(model_file, weight_file)
    # model = deep_net(pic_shape=(3, 128, 128), nb_classes=NB_CLASS)
    # model.compile('rmsprop', 'categorical_crossentropy')

    model_data, model_label = msgpack_numpy.load(
        open('/data/liubo/face/originalimages/originalimages_model.p', 'rb'))
    model_data = np.transpose(model_data, (0, 3, 1, 2))
    X_train, X_test, y_train, y_test = train_test_split(model_data,
                                                        model_label,
                                                        test_size=0.1)

    train_valid_model(X_train, y_train, X_test, y_test, NB_CLASS, model_file,
                      weight_file)
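
    # Hedged usage sketch: the K.function returned by extract_feature takes the
    # input batch plus the Keras learning phase (0 = test) and returns a list of
    # output arrays from the penultimate layer.
    model, get_Conv_FeatureMap = extract_feature(model_file, weight_file)
    sample_batch = X_test[:1].astype('float32')
    sample_feature = get_Conv_FeatureMap([sample_batch, 0])[0]
    print 'feature shape :', sample_feature.shape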


def find_max_min():
    # find the least similar pair within the same person and the most similar pair across different people
    lfw_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    person_list = lfw_feature_dic.keys()
    same_person_score = []
    same_person_score_pair_dic = {}  # {score: [(path1, path2), ..., (path1, path2)]}
    no_same_person_score = []
    no_same_person_score_pair_dic = {}  # {score: [(path1, path2), ..., (path1, path2)]}
    heapq.heapify(same_person_score)
    pair_threshold = 3000

    for person_index, person in enumerate(person_list):
        start = time()
        path_feature_list = lfw_feature_dic.get(person)
        # enumerate all candidate pairs within this person --- the smaller the score the better (the least similar photos of the same person)
        # scores are negated before insertion, so the top of the min-heap corresponds to the largest original score, which is the one evicted when a smaller score arrives
        length = len(path_feature_list)
        for index_i in range(length):
            for index_j in range(index_i + 1, length):  # skip pairing a photo with itself
                feature1, path1 = path_feature_list[index_i]
                feature2, path2 = path_feature_list[index_j]
                feature1 = np.reshape(feature1,
                                      newshape=(1, feature1.shape[0]))
                feature2 = np.reshape(feature2,
                                      newshape=(1, feature2.shape[0]))
                this_score = 0 - pw.cosine_similarity(feature1, feature2)[0][0]
                if len(same_person_score) > pair_threshold:
                    top_item = same_person_score[0]
                    if this_score > top_item:  # less similar than the most similar kept pair, so keep it
                        heapq.heappop(same_person_score)
                        heapq.heappush(same_person_score, this_score)
                        # drop the evicted pair and record the current pair (one score may map to several pairs)
                        if top_item in same_person_score_pair_dic:
                            same_person_score_pair_dic.pop(top_item)
                        pair_list = same_person_score_pair_dic.get(
                            this_score, [])
                        pair_list.append((path1, path2))
                        same_person_score_pair_dic[this_score] = pair_list
                else:
                    heapq.heappush(same_person_score, this_score)
                    pair_list = same_person_score_pair_dic.get(this_score, [])
                    pair_list.append((path1, path2))
                    same_person_score_pair_dic[this_score] = pair_list

        # enumerate all candidate cross-person (dissimilar) pairs

        for other_person_index, other_person in enumerate(
                person_list[person_index + 1:], start=person_index + 1):
            other_path_feature_list = lfw_feature_dic.get(other_person)
            if other_person == person:
                continue
            other_length = len(other_path_feature_list)
            for index_i in range(length):
                for index_j in range(other_length):
                    feature1, path1 = path_feature_list[index_i]
                    feature2, path2 = other_path_feature_list[index_j]
                    feature1 = np.reshape(feature1,
                                          newshape=(1, feature1.shape[0]))
                    feature2 = np.reshape(feature2,
                                          newshape=(1, feature2.shape[0]))
                    this_score = pw.cosine_similarity(feature1, feature2)[0][0]
                    if len(no_same_person_score) > pair_threshold:
                        top_item = no_same_person_score[0]
                        if this_score > top_item:  # more similar than the least similar kept pair, so keep it
                            heapq.heappop(no_same_person_score)
                            heapq.heappush(no_same_person_score, this_score)
                            # drop the evicted pair and record the current pair (one score may map to several pairs)
                            if top_item in no_same_person_score_pair_dic:
                                no_same_person_score_pair_dic.pop(top_item)
                            pair_list = no_same_person_score_pair_dic.get(
                                this_score, [])
                            pair_list.append((path1, path2))
                            no_same_person_score_pair_dic[
                                this_score] = pair_list
                    else:
                        heapq.heappush(no_same_person_score, this_score)
                        pair_list = no_same_person_score_pair_dic.get(
                            this_score, [])
                        pair_list.append((path1, path2))
                        no_same_person_score_pair_dic[this_score] = pair_list
        end = time()
        print person_index, person, (end - start), length
    msgpack_numpy.dump((same_person_score_pair_dic, same_person_score,
                        no_same_person_score_pair_dic, no_same_person_score),
                       open(new_pair_pack_file, 'wb'))
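
# Standalone sketch of the heap bookkeeping assumed above: keep the K smallest
# scores by pushing negated values onto a min-heap (its top then tracks the
# largest kept score), and keep the K largest scores with a plain min-heap.
# Purely illustrative values; heapq.heapreplace pops the top and pushes the new item.
import heapq
import random

K = 5
values = [random.random() for _ in range(100)]

smallest = []                 # stores negated values, so it behaves like a max-heap
largest = []                  # plain min-heap
for v in values:
    if len(smallest) < K:
        heapq.heappush(smallest, -v)
    elif -v > smallest[0]:    # v is smaller than the largest value currently kept
        heapq.heapreplace(smallest, -v)
    if len(largest) < K:
        heapq.heappush(largest, v)
    elif v > largest[0]:      # v is larger than the smallest value currently kept
        heapq.heapreplace(largest, v)

print sorted(-s for s in smallest)    # the K smallest values
print sorted(largest)                 # the K largest values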


def main_distance():
    # lfw_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    lfw_feature_dic = msgpack_numpy.load(open(triplet_feature_pack_file, 'rb'))
    data = []
    label = []
    pic_path_list = []
    for line in open(pair_file):
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            person = tmp[0]  # take two feature vectors for this person
            this_person_feature_list = lfw_feature_dic.get(person, [])
            index_list = range(len(this_person_feature_list))
            np.random.shuffle(index_list)
            filter_path(this_person_feature_list, index_list)
            if len(index_list) < 2:
                continue
            feature1, path1 = this_person_feature_list[index_list[0]]
            feature2, path2 = this_person_feature_list[index_list[1]]
            feature1 = np.reshape(feature1, newshape=(1, feature1.size))
            feature2 = np.reshape(feature2, newshape=(1, feature2.size))
            predicts = pw.cosine_similarity(feature1, feature2)
            label.append(0)
            data.append(predicts)
            pic_path_list.append('\t'.join([path1, path2]))
        elif len(tmp) == 4:
            person1 = tmp[0]
            person2 = tmp[2]
            # take one feature vector from each person
            this_person_feature_list1 = lfw_feature_dic.get(person1, [])
            this_person_feature_list2 = lfw_feature_dic.get(person2, [])
            index_list1 = range(len(this_person_feature_list1))
            index_list2 = range(len(this_person_feature_list2))
            np.random.shuffle(index_list1)
            np.random.shuffle(index_list2)
            filter_path(this_person_feature_list1, index_list1)
            filter_path(this_person_feature_list2, index_list2)
            if len(index_list1) < 1 or len(index_list2) < 1:
                continue
            feature1, path1 = this_person_feature_list1[index_list1[0]]
            feature2, path2 = this_person_feature_list2[index_list2[0]]
            feature1 = np.reshape(feature1, newshape=(1, feature1.size))
            feature2 = np.reshape(feature2, newshape=(1, feature2.size))
            predicts = pw.cosine_similarity(feature1, feature2)
            label.append(1)
            data.append(predicts)
            pic_path_list.append('\t'.join([path1, path2]))
    data = np.asarray(data)
    print data.shape
    data = np.reshape(data, newshape=(len(data), 1))
    label = np.asarray(label)
    print data.shape, label.shape

    kf = KFold(n_splits=10)     # sklearn.model_selection.KFold provides .split()
    all_acc = []
    for train, valid in kf.split(data, label):

        train_data = data[train]
        valid_data = data[valid]
        train_label = label[train]
        valid_label = label[valid]

        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        all_acc.append(acc)
        print 'acc :', acc

    print 'mean acc :', np.mean(all_acc)
Пример #33
0
def msgpack_load_text(stream):
    return msgpack.load(stream, encoding='utf-8')
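
# Hedged round-trip sketch for msgpack_load_text: encoding='utf-8' is the
# msgpack-python < 1.0 spelling (on msgpack >= 1.0 the read-side equivalent is
# raw=False, used elsewhere in these examples); text comes back as unicode.
import io
import msgpack

_buf = io.BytesIO()
msgpack.dump({u'text': u'hello world'}, _buf, use_bin_type=True)
_buf.seek(0)
print msgpack_load_text(_buf)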
Пример #34
0
    def import_data(self, debug=False):
        '''
        method for importing and processing input data
        '''
        # Importing pickled wordvectors, dictionary, inputs and labels
        with open(self.base_path + '/wordvectors', 'rb') as vectors_file:
            print("Importing wordvectors...", end=' ', flush=True)
            word_vectors = msgpack_numpy.load(vectors_file)
            print("Done")
        with open(self.base_path + '/dictionary', 'rb') as dict_file:
            print("Importing dictionary...", end=' ', flush=True)
            dictionary = msgpack.load(dict_file, raw=False)
            print("Done")
        with open('inputs', 'rb') as data_inputs_file:
            print("Importing inputs...", end=' ', flush=True)
            sentences = msgpack.load(data_inputs_file, raw=False)
            print("Done")
        with open('outputs', 'rb') as data_outputs_file:
            print("Importing labels...", end=' ', flush=True)
            outputs = msgpack.load(data_outputs_file, raw=False)
            print("Done")

        ########################################################################################################################
        # Processing inputs
        ########################################################################################################################
        print('Modifying input sentences...')

        # importing progressbar
        bar = progressbar.ProgressBar(max_value=len(sentences),
                                      redirect_stdout=True,
                                      end=' ')

        # preassigning the inputs variable for faster processing
        data_inputs = np.zeros((len(sentences), self.n_steps), dtype=np.int32)
        # initiating all the inputs to index of zero vector (zerowordvec_idx = dictionary['zerowordvec'])
        zerowordvec_idx = dictionary['zerowordvec']
        data_inputs[:, :] = zerowordvec_idx
        # Modifying the input sentences for training.
        lengths = []
        i = 0
        no_words_not_found = 0
        for line in sentences:
            line = line.lower()  # All the sentences to lower case
            line = line.strip('\n')  # Strip the trailing \n
            line = line.replace(',', '')  # Removing ","
            line = line.split(' ')  # Split the sentence into a list of strings (words)
            # Initializing an empty list
            h = []
            # Iterating each word in the line over the dictionary and appending the indexes to a list
            for k in range(len(line)):
                # searching the index of each word in the dictionary and saving the number to the variable "idx"
                try:
                    idx = dictionary[line[k]]
                except KeyError:
                    # Exporting the words not found in the dictionary to a file (for reference)
                    idx = zerowordvec_idx
                    with open('words_not_found_in_dic', 'a') as f:
                        f.write(line[k] + '\n')
                    no_words_not_found += 1
                # Appending the index(idx) of each word to the list h.
                h.append(idx)
            # appending the length of each line to the list lengths
            lengths.append(len(line))
            # modifying the array
            data_inputs[i, :len(h)] = h
            # bar update
            bar.update(i)
            i = i + 1
        # bar finish
        bar.finish()
        # if words are not found in dictionary
        if no_words_not_found != 0:
            print(
                '\nNo. of words not found in the dict = {}, pls. check words_not_found_in_dic file\n'
                .format(no_words_not_found),
                end='',
                flush=True)

        # if debug print input sample to check if the input pipeline is correct
        if debug:
            print('Sample input data')
            print('=========================================================')
            print('input sentence are {}'.format(sentences[0:2]))
            print('input lengths are {}'.format(lengths[0:2]))
            print('[Vector]input sentence are {}'.format(data_inputs[0:2]))
            print('=========================================================')
        ########################################################################################################################
        # Processing labels
        ########################################################################################################################
        print("Modifying labels...")

        # initiating progress bar
        bar = progressbar.ProgressBar(max_value=len(outputs),
                                      redirect_stdout=True,
                                      end=' ')
        # preassigning outputs variable for faster processing
        data_outputs = np.zeros((len(outputs), len(self.available_intents)),
                                dtype=np.int32)
        # Iterating over the outputs list and setting the corresponding one-hot position in data_outputs
        v = 0
        for output in outputs:
            # find intent if it exists in available intents list
            try:
                idx_found = self.available_intents.index(output)
            except ValueError:
                raise Exception(
                    'Could not find this output = {} in the available list of intents'
                    .format(output))
            # modifying the output array
            data_outputs[v, idx_found] = 1
            # bar update
            bar.update(v)
            v = v + 1
        # bar finish
        bar.finish()

        # debug printing
        if debug:
            print('Sample output data')
            print('=========================================================')
            print('output labels are {}'.format(outputs[0:2]))
            print('[Vector]output labels are {}'.format(data_outputs[0:2]))
            print('=========================================================')

        return word_vectors, data_inputs, data_outputs, lengths
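
# Condensed sketch of the sentence-encoding step above, on hypothetical toy data:
# every sentence becomes a fixed-length row of dictionary indices, padded with the
# index of the 'zerowordvec' entry (names and values here are illustrative only).
import numpy as np

toy_dictionary = {'zerowordvec': 0, 'turn': 1, 'on': 2, 'the': 3, 'light': 4}
toy_sentences = ['Turn on the light', 'turn on']
n_steps = 6

toy_inputs = np.full((len(toy_sentences), n_steps), toy_dictionary['zerowordvec'], dtype=np.int32)
toy_lengths = []
for i, line in enumerate(toy_sentences):
    words = line.lower().replace(',', '').split()
    idxs = [toy_dictionary.get(w, toy_dictionary['zerowordvec']) for w in words]
    toy_inputs[i, :len(idxs)] = idxs
    toy_lengths.append(len(words))
print(toy_inputs)
print(toy_lengths)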
Пример #35
0
import sys
import msgpack_numpy
from sklearn.svm import LinearSVC
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score, roc_auc_score
import pdb

reload(sys)
sys.setdefaultencoding("utf-8")
# fileConfig('logger_config.ini')
# logger_error = logging.getLogger('errorhandler')


if __name__ == '__main__':
    (paths, emb_array, actual_issame) = msgpack_numpy.load(open('lfw_feature.p', 'rb'))
    data = []
    pair_paths = []
    for index in range(len(actual_issame)):
        data.append(cosine_similarity(emb_array[2*index:2*index+1], emb_array[2*index+1:2*index+2])[0][0])
        pair_paths.append(str(paths[2*index]) + '\t' + str(paths[2*index+1]))
    data = np.reshape(np.array(data), (len(data), 1))
    label = np.reshape(np.array(actual_issame), (len(actual_issame), 1))
    pair_paths = np.array(pair_paths)

    kf = KFold(len(label), n_folds=10)
    all_acc = []
    f = open('error.txt', 'w')
    for (train, valid) in kf:
        train_data = data[train]
        valid_data = data[valid]

# read as txt file
with open(common_words, 'r') as common_words_file:
    content = common_words_file.readlines()
    common_words_l = [x.strip() for x in content]

print("read {} words from common words file".format(len(common_words_l)))

print("loading serialized big_dictionary")

with open(sys.argv[1] + '/big_dictionary', 'rb') as big_dictionary_file:
    big_dictionary = msgpack.load(big_dictionary_file, raw=False)

print("loading serialized big_wordvectors")

with open(sys.argv[1] + '/big_wordvectors', 'rb') as big_wordvectors_file:
    big_wordvectors = msgpack_numpy.load(big_wordvectors_file)

print("Finding most common words in big_dictionary and generating reduced size dictionary")
print("--------")

i = 0
dictionary = {}
wordvectors = []

number_of_loops = len(common_words_l)

for common_word in common_words_l:
    # get index
    try:
        index = big_dictionary[common_word]
        # from the index we get the corresponding vector
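        # (the original snippet is cut off here; the lines below are a hedged guess
        #  at the rest of the loop: copy the vector across and give the word a new,
        #  compact index; the KeyError handling is an assumption)
        wordvectors.append(big_wordvectors[index])
        dictionary[common_word] = i
        i += 1
    except KeyError:
        # word not present in big_dictionary; skip it
        pass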
Пример #37
0
                break

        print('Testing...')
        Y_predict_batch = model.predict(valid_data, batch_size=batch_size, verbose=1)
        test_acc = accuracy_score(np.argmax(valid_label, axis=1), np.argmax(Y_predict_batch, axis=1))
        Y_train_predict_batch = model.predict(train_data, batch_size=batch_size, verbose=1)
        train_acc = accuracy_score(np.argmax(train_label, axis=1), np.argmax(Y_train_predict_batch, axis=1))
        print 'train_acc :', train_acc, 'test_acc :', test_acc
        if last_crps < test_acc:
            this_patience = 0
            model.save_weights(weight_file, overwrite=True)
            print ('save_model')
            last_crps = test_acc
        else:
            if this_patience >= patience:
                break
            else:
                this_patience += 1


if __name__ == '__main__':
    model = deepface(pic_shape=(512, 7, 7), class_num=168)
    model_file = '/data/liubo/face/vgg_face_dataset/model/deepface_test.model'
    weight_file = '/data/liubo/face/vgg_face_dataset/model/deepface_test.weight'
    data, label = msgpack_numpy.load(open('/home/liubo-it/FaceRecognization/FineTune/v2/hanlin.p', 'rb'))
    data = np.asarray(data)
    label = np.asarray(label)
    train_data, valid_data, train_label, valid_label = train_test_split(data, label, test_size=0.2)
    print train_data.shape, valid_data.shape
    train_valid(train_data, valid_data, train_label, valid_label, model_file, weight_file)
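
# Minimal sketch of the early-stopping bookkeeping assumed in train_valid above:
# reset the counter whenever validation accuracy improves, otherwise increment it,
# and stop once `patience` consecutive epochs bring no improvement (toy numbers).
best_acc = 0.0
patience = 3
wait = 0
for epoch, val_acc in enumerate([0.60, 0.62, 0.61, 0.62, 0.61, 0.60]):
    if val_acc > best_acc:
        best_acc = val_acc
        wait = 0          # improvement: this is where the weights would be saved
    else:
        wait += 1
        if wait >= patience:
            print 'early stop at epoch', epoch
            break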
Пример #38
0
    # train_model(person_feature_list_dic, model)
    # last_acc = valid_model(valid_person_feature_list_dic, model)
    # print 'first_acc :', last_acc
    for epoch_index in range(nb_epoch):
        print('-' * 40)
        print('Training ', 'current epoch :', epoch_index, 'all epochs :',
              nb_epoch)
        train_model(train_person_feature_list_dic, model)
        # this_acc = valid_model(valid_path_list, model, pic_shape)
        # print 'this_acc :', this_acc, 'last_acc :', last_acc
        # if this_acc > last_acc:
        #     model.save_weights(weight_file, overwrite=True)
        #     print ('save_model')
        #     last_acc = this_acc


if __name__ == '__main__':
    model_file = '/data/liubo/face/vgg_face_dataset/model/facenet.model'
    weight_file = '/data/liubo/face/vgg_face_dataset/model/facenet.weight'
    model = build_model(feature_dim=4096)
    print model.summary()
    print model.layers[2].summary()
    model.save_weights(weight_file, overwrite=True)
    open(model_file, 'w').write(model.to_json())

    person_feature_list_dic = msgpack_numpy.load(
        open('/data/pictures_annotate_feature/person_feature_list_dic.p',
             'rb'))
    train_valid_model(person_feature_list_dic, person_feature_list_dic,
                      model_file, weight_file)
Пример #39
0
    train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.1)
    clf = LinearSVC()
    print len(x), len(y)
    clf.fit(train_x, train_y)
    acc = accuracy_score(valid_y, clf.predict(valid_x))
    print acc
    clf = DecisionTreeClassifier()
    clf.fit(train_x, train_y)
    acc = accuracy_score(valid_y, clf.predict(valid_x))
    print acc


if __name__ == '__main__':
    # main_feature()
    # main()
    # cal_acc('dist.p')
    # main_max_min()
    # cal_acc('dist_max_min.p')

    (data, label) = msgpack_numpy.load(open('lfw_data_label.p', 'rb'))
    data = np.asarray(data)
    label = np.asarray(label)
    train_x, valid_x, train_label, valid_label = train_test_split(
        data, label, test_size=0.1)
    print train_x.shape, valid_x.shape, train_label.shape, valid_label.shape
    clf = RandomForestClassifier(n_estimators=1000, n_jobs=15)
    clf.fit(train_x, train_label)
    acc = accuracy_score(valid_label, clf.predict(valid_x))
    train_acc = accuracy_score(train_label, clf.predict(train_x))
    print acc, train_acc
Пример #40
0
def main_distance():
    lfw_feature_dic = msgpack_numpy.load(open(feature_pack_file, 'rb'))
    data = []
    label = []
    pic_path_list = []
    for line in open(pair_file):
        tmp = line.rstrip().split()
        if len(tmp) == 3:
            person = tmp[0]         # take two feature vectors for this person
            index1 = int(tmp[1])
            index2 = int(tmp[2])
            this_person_feature_dic = lfw_feature_dic.get(person, {})
            if index1 in this_person_feature_dic and index2 in this_person_feature_dic:
                feature1, path1 = this_person_feature_dic[index1]
                feature2, path2 = this_person_feature_dic[index2]
                predicts = pw.cosine_similarity(feature1, feature2)
                label.append(0)
                data.append(predicts)
                pic_path_list.append('\t'.join([path1, path2]))
        elif len(tmp) == 4:
            person1 = tmp[0]
            index1 = int(tmp[1])
            person2 = tmp[2]
            index2 = int(tmp[3])
            # take one feature vector from each person
            this_person_feature_dic1 = lfw_feature_dic.get(person1, {})
            this_person_feature_dic2 = lfw_feature_dic.get(person2, {})
            if index1 in this_person_feature_dic1 and index2 in this_person_feature_dic2:
                feature1, path1 = this_person_feature_dic1[index1]
                feature2, path2 = this_person_feature_dic2[index2]
                predicts = pw.cosine_similarity(feature1, feature2)
                label.append(1)
                data.append(predicts)
                pic_path_list.append('\t'.join([path1, path2]))
    data = np.asarray(data)
    # data = np.reshape(data, newshape=(data.shape[0], data.shape[-1]))
    data = np.reshape(data, newshape=(data.shape[0], 1))
    label = np.asarray(label)

    pic_path_list = np.asarray(pic_path_list)

    kf = KFold(len(label), n_folds=10)
    all_acc = []
    f = open('error.txt', 'w')
    for (train, valid) in kf:
        train_data = data[train]
        valid_data = data[valid]
        train_label = label[train]
        valid_label = label[valid]
        train_path = pic_path_list[train]
        valid_path = pic_path_list[valid]

        clf = LinearSVC()
        clf.fit(train_data, train_label)
        acc = accuracy_score(valid_label, clf.predict(valid_data))
        roc_auc = roc_auc_score(valid_label, clf.predict(valid_data))
        for index in range(len(valid_data)):
            if valid_label[index] != clf.predict(np.reshape(valid_data[index], (1, 1))):
                f.write(str(index)+'\t'+valid_path[index]+'\n')
        all_acc.append(acc)
        print acc, roc_auc
    f.close()
    all_acc.sort(reverse=True)
    print 'mean_acc :', np.mean(all_acc[:])
Пример #41
0
if __name__ == '__main__':
    # folder = '/data/liubo/face/self'
    # person_list = os.listdir(folder)
    # all_pic_path = []
    # all_person = []
    # for person in person_list:
    #     if person == 'unknown' or person.startswith('new_person'):
    #         continue
    #     person_path = os.path.join(folder, person)
    #     pic_list = os.listdir(person_path)
    #     for pic in pic_list:
    #         pic_path = os.path.join(person_path, pic)
    #         all_pic_path.append(pic_path)
    #         all_person.append(person)
    # all_score, all_label = cal_pic_distance(all_pic_path, all_person)
    # msgpack_numpy.dump((all_score, all_label), open('all_score_label.p','wb'))
    #
    all_score, all_label = msgpack_numpy.load(open('all_score_label.p','rb'))
    count = Counter(all_label)
    print count
    all_score = np.reshape(np.asarray(all_score),(len(all_score), 1))
    all_label = np.asarray(all_label)
    gnb = GaussianNB()
    train_data, test_data, train_label, test_label = train_test_split(all_score, all_label)

    gnb.fit(train_data, train_label)
    gnb.predict_proba(test_data)
    print accuracy_score(test_label, gnb.predict(test_data))
    cPickle.dump(gnb, open('/data/liubo/face/vgg_face_dataset/model/dist_prob.p','wb'))

    # pdb.set_trace()
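
    # Hedged usage sketch: reload the distance-to-probability model saved above and
    # turn a single similarity score into class probabilities (the 0.75 score is
    # just an illustration).
    gnb_loaded = cPickle.load(open('/data/liubo/face/vgg_face_dataset/model/dist_prob.p', 'rb'))
    sample_score = np.reshape(np.asarray([0.75]), (1, 1))
    print gnb_loaded.predict_proba(sample_score), gnb_loaded.predict(sample_score)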
        this_acc = valid_model(valid_path_list, model, nb_classes, pic_shape)
        train_acc = valid_model(train_path_list, model, nb_classes, pic_shape)
        print 'this_acc :', this_acc, 'last_acc :', last_acc, 'train_acc :', train_acc
        if this_acc > last_acc:
            model.save_weights(weight_file, overwrite=True)
            print ('save_model')
            last_acc = this_acc


if __name__ == '__main__':
    parser = OptionParser()

    parser.add_option("-n", "--num_class", dest="num_class", help="classify label num")
    parser.add_option("-m", "--model_file", dest="model_file", help="model file")
    parser.add_option("-w", "--weight_file", dest="weight_file", help="weight file")
    parser.add_option("-l", "--train_valid_sample_list_file", dest="train_valid_sample_list_file",
                      help="train_valid_sample_list_file")

    (options, args) = parser.parse_args()

    model_file = options.model_file
    weight_file = options.weight_file
    nb_classes = int(options.num_class)
    train_valid_sample_list_file = options.train_valid_sample_list_file
    pic_shape = (96, 96, 3)  # input shape for inception_v4
    (train_sample_list, valid_sample_list) = msgpack_numpy.load(open(train_valid_sample_list_file, 'rb'))
    print 'len(train_sample_list) :', len(train_sample_list), 'len(valid_sample_list) :', len(valid_sample_list)
    train_valid_model(train_sample_list, valid_sample_list, pic_shape, nb_classes, model_file, weight_file)


Пример #43
0
            last_crps = test_acc
        else:
            if this_patience >= patience:
                break
            else:
                this_patience += 1


def extract_feature(model_file, weight_file):
    print 'model_file :', model_file
    print 'weight_file :', weight_file
    model = model_from_json(open(model_file, 'r').read())
    model.load_weights(weight_file)
    get_Conv_FeatureMap = K.function([model.layers[0].get_input_at(False), K.learning_phase()],
                                     [model.layers[-2].get_output_at(False)])
    return model, get_Conv_FeatureMap


if __name__ == '__main__':
    model_file = '/data/liubo/face/vgg_face_dataset/model/originalimages.model'
    weight_file = '/data/liubo/face/vgg_face_dataset/model/originalimages.weight'
    # extract_feature(model_file, weight_file)
    # model = deep_net(pic_shape=(3, 128, 128), nb_classes=NB_CLASS)
    # model.compile('rmsprop', 'categorical_crossentropy')

    model_data, model_label = msgpack_numpy.load(open('/data/liubo/face/originalimages/originalimages_model.p', 'rb'))
    model_data = np.transpose(model_data, (0, 3, 1, 2))
    X_train, X_test, y_train, y_test = train_test_split(model_data, model_label, test_size=0.1)

    train_valid_model(X_train, y_train, X_test, y_test, NB_CLASS, model_file, weight_file)
Пример #44
0
import sys
import pdb
import numpy as np
import cPickle
import msgpack_numpy
import scipy.cluster.hierarchy as sch

reload(sys)
sys.setdefaultencoding("utf-8")
# fileConfig('logger_config.ini')
# logger_error = logging.getLogger('errorhandler')

if __name__ == '__main__':
    threshold = float(sys.argv[1])
    method = sys.argv[2]
    day = sys.argv[3]

    (query_list, all_dist) = msgpack_numpy.load(
        open(
            '/data/liubo/hotspot/query_search/all_query_dist_beijing_{}.p'.
            format(day), 'rb'))
    query_dist_dic = cPickle.load(
        open(
            '/data/liubo/hotspot/query_search/beijing_query_dist_dic_{}.p'.
            format(day), 'rb'))

    linkage = sch.linkage(all_dist, method=method)
    cluster_result = sch.fcluster(linkage, t=threshold)
    cluster_result_dic = {}
    f_result = open(
        '/data/liubo/hotspot/query_search/beijing_{}_cluster_result_{}_{}.txt'.
        format(day, threshold, method), 'w')
    for index in range(len(cluster_result)):
        this_cluster_id = cluster_result[index]
        this_cluster_query_list = cluster_result_dic.get(this_cluster_id, [])
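        # (the original snippet is cut off here; the remaining lines are a hedged
        #  guess at the rest of the bookkeeping: group the queries by cluster id
        #  and write each cluster to the result file)
        this_cluster_query_list.append(query_list[index])
        cluster_result_dic[this_cluster_id] = this_cluster_query_list
    for cluster_id in cluster_result_dic:
        f_result.write(str(cluster_id) + '\t' + '\t'.join(map(str, cluster_result_dic[cluster_id])) + '\n')
    f_result.close()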
Пример #45
0
def load_data(data_path):
    feature, label = msgpack_numpy.load(open(data_path, 'rb'))
    return feature, label