Example #1
def write_to_file(path, data, call):
    parent_dir = FileUtil.getparentdir(path)
    FileUtil.mkdirs(parent_dir)

    with open(path, "w") as fp:
        s = call(data)
        if isinstance(s, basestring):
            fp.write(s)
        else:
            # call() returned an iterable of string chunks; write each one
            for s0 in s:
                fp.write(s0)
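
A hypothetical call, just to show the contract: call may return either a single string or an iterable of string chunks (to_lines and the paths below are illustrative, and FileUtil is assumed to be the project's path helper):

def to_lines(rows):
    return [",".join(row) + "\n" for row in rows]

write_to_file("out/data.csv", [["a", "1"], ["b", "2"]], to_lines)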
Example #2
    def __process_img(self, img_urls):
        dir_ = os.path.join(ImageClassification.image_train_path, "img")
        FileUtil.mkdirs(dir_)

        def copy_img(img_url):
            # re-encode the source image into the training img/ directory
            filename = os.path.split(img_url)[1]
            filepath = os.path.join(dir_, filename)
            cv2.imwrite(filepath, cv2.imread(img_url))
            return filepath

        return [copy_img(img_url) for img_url in img_urls]
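
One caveat with the cv2 round-trip above: cv2.imread returns None instead of raising on an unreadable file, and the failure only surfaces when cv2.imwrite rejects the None. A defensive standalone sketch (copy_img_safe is a hypothetical name, not part of the original class):

import os
import cv2

def copy_img_safe(img_url, dir_):
    img = cv2.imread(img_url)
    if img is None:
        # unreadable path or unsupported image format
        raise IOError("cannot read image: %s" % img_url)
    filepath = os.path.join(dir_, os.path.split(img_url)[1])
    cv2.imwrite(filepath, img)
    return filepath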
Example #3
    def __save_result(self, sentences):
        FileUtil.mkdirs(ImageClassification.image_train_path)
        current = time.strftime('%Y-%m-%d %H:%M:%S')
        # the ':' characters in the timestamp are fine in POSIX file names
        # but are not valid on Windows
        out = os.path.join(ImageClassification.image_train_path, current + ".txt")

        with open(out, "w") as fp:
            for sentence in sentences:
                s = ("sentence:" + sentence.get("sentence") + "\n" +
                     "img:" + ",".join(sentence.get("img")) + "\n" +
                     "label:" + sentence.get("label")) + "\n"
                fp.write(s)
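
The loop reads each element of sentences as a dict of strings; a hypothetical record shaped the way the code consumes it:

sentence = {
    "sentence": "some raw text",   # the text itself
    "img": ["img/0001.jpg"],       # list of image paths, joined with ","
    "label": "Y",                  # class label
}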
Example #4
def process_img(urls):
    dir_ = os.path.join(RESOURCE_BASE_URL, "collect/img")
    FileUtil.mkdirs(dir_)

    if isinstance(urls, basestring):
        urls = [[urls]]
    filepath = []
    for row in urls:
        p = []
        for url in row:
            filename = FileUtil.getfilename(url)
            filepath0 = os.path.join(dir_, filename)
            try:
                r = requests.get(url, timeout=1)
                if r.status_code == 200:
                    img = Image.open(StringIO(r.content))
                    if img.mode != "RGB":
                        img = img.convert("RGB")
                    img.save(filepath0)
                    p.append(filepath0)
            except (requests.exceptions.Timeout,
                    requests.exceptions.ConnectionError):
                print "Request failed: %s" % url
        filepath.append(p)
    return filepath
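
A hypothetical call, assuming RESOURCE_BASE_URL is configured and the URLs (illustrative here) resolve to images; urls is a list of rows, each row a list of URLs, and a bare string is promoted to a one-element grid:

paths = process_img([
    ["http://example.com/a.jpg", "http://example.com/b.jpg"],
    ["http://example.com/c.jpg"],
])
# paths mirrors the input shape, minus any downloads that failed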
Example #5
def classifict(feature, sentences, incr=False, out=False):
    if isinstance(sentences, basestring):
        sentences = [sentences]

    # get the subjective/objective classifier
    feature.subjective = False
    objective_clf = get_objective_classification(feature)

    # test set: subjective/objective part
    test_datas_objective, c_true_objective, danger_index_objective = feature.get_key_words(sentences)

    test_objective = test_datas_objective
    if not sp.issparse(test_datas_objective):
        test_objective = feature.cal_weight_improve(test_datas_objective, c_true_objective)

    c_pred_objective = objective_clf.predict(test_objective)

    # get the emotion classifier
    feature.subjective = True
    emotion_clf = get_emotion_classification(feature, incr=incr)

    # test set: emotion part
    test_datas, c_true, danger_index = feature.get_key_words(sentences)

    test = test_datas
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, c_true)

    c_pred = []
    for i in range(len(sentences)):
        if i not in danger_index_objective and i not in danger_index:
            before_i_in_danger_obj = np.sum(np.asarray(danger_index_objective) < i)
            before_i_in_danger_ = np.sum(np.asarray(danger_index) < i)

            c = emotion_clf.predict(test[i - before_i_in_danger_])[0] if c_pred_objective[i - before_i_in_danger_obj] == "Y"\
                else c_pred_objective[i - before_i_in_danger_obj]
            c_pred.append(c)

    if out:
        dir_ = os.path.join(OUT_BASE_URL, "out0")
        FileUtil.mkdirs(dir_)
        current = time.strftime('%Y-%m-%d %H:%M:%S')
        o = os.path.join(dir_, current + ".xml")

        with open(o, "w") as fp:
            for i, s in enumerate(sentences):
                if i not in danger_index_objective and i not in danger_index:
                    before_i_in_danger_obj = np.sum(np.asarray(danger_index_objective) < i)
                    before_i_in_danger_ = np.sum(np.asarray(danger_index) < i)
                    fp.write(
                        """<weibo emotion-type="%s">
    <sentence emotion-1-type="%s" emotion-2-type="none" emotion-tag="%s">
        %s
    </sentence>
</weibo>
""" % (c_pred[i - before_i_in_danger_], c_pred[i - before_i_in_danger_], "N" if c_pred_objective[i - before_i_in_danger_obj] == "N" else "Y", s))
                else:
                    fp.write(
                        """<weibo emotion-type="%s">
    <sentence emotion-1-type="%s" emotion-2-type="none" emotion-tag="%s">
        %s
    </sentence>
</weibo>
""" % ("None", "None", "N", s + "\n Can't recognize because it has insufficient key_words"))

    else:
        print c_pred
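
The offset bookkeeping above maps an index in the original sentence list to its slot in the filtered prediction arrays by counting how many dropped ("danger") indices precede it; a minimal standalone illustration:

import numpy as np

danger_index = [1, 4]   # indices dropped during feature extraction
i = 5                   # position in the original sentence list
offset = np.sum(np.asarray(danger_index) < i)   # 2 dropped entries precede i
filtered_pos = i - offset                       # sentence 5 sits at slot 3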
Example #6
    def get_incr_classificator_thread(self, incr_datas, incr_class_label,
                                      test_datas, test_class_label):
        """
        对增量式贝叶斯的增量集部分进行处理
        :param incr_datas: [{"emorion-1-type": value, "sentence": {}},...]
                            (emotion-1-type and sentence are optional)
        :param incr_class_label:
        :param test_datas:
        :param test_class_label:
        :return:
        """
        def func1(i0):
            c_true0 = incr_class_label[i0:i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = self.predict(text0)[0]

            if c_true0 == c_pred0:
                loss0 = 0
            else:
                clf0 = copy.deepcopy(self)
                clf0.bayes.class_log_prior_, clf0.bayes.feature_log_prob_ = clf0.bayes.update(
                    c_pred0, text0, copy=True)
                loss0 = clf0.metrics_my_zero_one_loss(test_datas)

                # clf0.bayes.class_log_prior_ = origin_class_log_prob_
                # clf0.bayes.feature_log_prob_ = origin_feature_log_prob_

            # append under the lock so concurrent workers don't interleave
            with lock1:
                text.append(text0)
                c_pred.append(c_pred0)
                loss.append(loss0)

        def func(i0):
            c_true0 = incr_class_label[i0:i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = self.predict(text0)[0]

            if c_true0 == c_pred0:
                loss0 = 0
            else:
                with lock0:
                    self.bayes.class_log_prior_, self.bayes.feature_log_prob_ = self.bayes.update(
                        c_pred0, text0, copy=True)
                    loss0 = self.metrics_my_zero_one_loss(test_datas)

                    # restore the original parameters after measuring the loss
                    self.bayes.class_log_prior_ = origin_class_log_prob_
                    self.bayes.feature_log_prob_ = origin_feature_log_prob_

            with lock1:
                text.append(text0)
                c_pred.append(c_pred0)
                loss.append(loss0)

        print "Begin Increment Classification: ", time.strftime(
            '%Y-%m-%d %H:%M:%S')
        # write/read the classifier parameters
        dir_ = os.path.join(TEXT_OUT, "bayes_args")
        FileUtil.mkdirs(dir_)

        class_count_out = os.path.join(dir_, "class_count.txt")
        class_log_prob_out = os.path.join(dir_, "class_log_prob.txt")
        feature_count_out = os.path.join(dir_, "feature_count.txt")
        feature_log_prob_out = os.path.join(dir_, "feature_log_prob.txt")

        out = (class_count_out, class_log_prob_out, feature_count_out,
               feature_log_prob_out)

        if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
            if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(
                    self.bayes, "class_log_prior_"):
                raise ValueError(
                    "please call get_classificator() to train the classifier first"
                )

            fit_incr_datas = self.fit_data(incr_datas)
            n_samples, _ = fit_incr_datas.shape
            incr_class_label = np.array(incr_class_label)

            lock0 = threading.Lock()
            lock1 = threading.Lock()

            # threadpool
            poolsize = 30
            pool = ThreadPool(poolsize)

            for i in range(n_samples):
                if i % 5 == 0:
                    print "Begin Increment Classification_%d: %s" % (
                        i / 5, time.strftime('%Y-%m-%d %H:%M:%S'))
                # classification losses of the candidates; the minimum wins
                loss = []
                # candidate texts from the increment set
                text = []
                # predicted classes of those candidate texts
                c_pred = []
                # indices of those candidate texts
                # index = 0

                origin_class_log_prob_ = self.bayes.class_log_prior_
                origin_feature_log_prob_ = self.bayes.feature_log_prob_

                # threadpool
                requests = makeRequests(func, range(fit_incr_datas.shape[0]))
                for req in requests:
                    pool.putRequest(req)
                pool.wait()
                #                for i0 in range(fit_incr_datas.shape[0]):
                #                    threading.Thread(target=func, args=(i0, )).start()

                minindex = np.argmin(loss)
                self.bayes.update(c_pred[minindex], text[minindex])
                fit_incr_datas = sp.vstack([
                    fit_incr_datas[:minindex, :],
                    fit_incr_datas[minindex + 1:, :]
                ])

            bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                          self.bayes.feature_count_,
                          self.bayes.feature_log_prob_)
            for path, arr in zip(out, bayes_args):
                np.savetxt(path, arr)
        else:
            self.bayes.class_count_ = np.loadtxt(out[0])
            self.bayes.class_log_prior_ = np.loadtxt(out[1])
            self.bayes.feature_count_ = np.loadtxt(out[2])
            self.bayes.feature_log_prob_ = np.loadtxt(out[3])

        print "Increment Classification Done: ", time.strftime(
            '%Y-%m-%d %H:%M:%S')
        return self
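
The makeRequests / putRequest / wait calls above follow the third-party threadpool package. For readers without that dependency, a minimal standard-library sketch of the same fan-out-and-wait pattern (a semaphore caps concurrency at poolsize; the names here are illustrative):

import threading

def fan_out(func, indices, poolsize=30):
    sem = threading.Semaphore(poolsize)

    def worker(i):
        try:
            func(i)
        finally:
            sem.release()

    threads = []
    for i in indices:
        sem.acquire()           # block while poolsize workers are running
        t = threading.Thread(target=worker, args=(i,))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()                # wait for all workers, like pool.wait()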
Example #7
    def cross_validation(self, train_datas, class_label, score='precision'):
        """
        K-Fold Cross Validation
        采用交叉验证的方式来优化贝叶斯参数
        选出具有最佳 score 的训练集和测试集
        此时训练集和测试集就不需要事先选好,交给交叉验证来完成
        :param train_datas:
        :param class_label:
        :param score:
        :return:
        """
        score_options = ('precision', 'recall', 'f1', 'accuracy')
        if score not in score_options:
            raise ValueError('score has to be one of ' + str(score_options))

        # fit data
        fit_train_datas = self.fit_data(train_datas)

        n_samples = fit_train_datas.shape[0]
        class_label = np.array(class_label)

        max_result = []
        max_index = []
        max_ = 0
        i = 0
        while (max_ < 0.6 and i <= 200):
            i += 1
            print "Seeking %d; max: %f; %s" % (
                i, max_, time.strftime('%Y-%m-%d %H:%M:%S'))

            result = []
            index = []
            cv = cross_validation.KFold(n_samples, n_folds=4, shuffle=True)

            for train_index, test_index in cv:
                train0, train0_label = fit_train_datas[
                    train_index], class_label[train_index]
                test0, test0_label = fit_train_datas[test_index], class_label[
                    test_index]
                self.get_classificator(train0, train0_label)
                c_pred0 = self.predict(test0)

                if score == "precision":
                    result.append(self.metrics_precision(test0_label, c_pred0))
                    index.append((train_index, test_index))
                elif score == "recall":
                    result.append(self.metrics_recall(test0_label, c_pred0))
                    index.append((train_index, test_index))
                elif score == "f1":
                    result.append(self.metrics_f1(test0_label, c_pred0))
                    index.append((train_index, test_index))
                else:
                    result.append(self.metrics_accuracy(test0_label, c_pred0))
                    index.append((train_index, test_index))

            max_ = max(result)
            max_result.append(max_)
            max_index.append(index[np.argmax(result)])

        argmax = np.argmax(max_result)
        print "Seeking Done; max: %f; %s" % (
            max_result[argmax], time.strftime('%Y-%m-%d %H:%M:%S'))

        # retrain on the best split to obtain the optimal parameters
        self.get_classificator(fit_train_datas[max_index[argmax][0]],
                               class_label[max_index[argmax][0]])

        dir_ = os.path.join(TEXT_OUT, "best_train_test_index")
        FileUtil.mkdirs(dir_)
        train_index_out = os.path.join(dir_, "train_index.txt")
        test_index_out = os.path.join(dir_, "test_index.txt")

        # max_index[argmax] is the best (train_index, test_index) pair
        for path, arr in zip((train_index_out, test_index_out),
                             max_index[argmax]):
            np.savetxt(path, arr, fmt="%d")
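
Note that cross_validation.KFold(n_samples, n_folds=...) is the pre-0.18 scikit-learn API, removed in 0.20. Under current scikit-learn the equivalent split loop would look roughly like this sketch (same variable names as above):

from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True)
for train_index, test_index in kf.split(fit_train_datas):
    train0, train0_label = fit_train_datas[train_index], class_label[train_index]
    test0, test0_label = fit_train_datas[test_index], class_label[test_index]
    # fit and score exactly as in the loop above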
Example #8
    def get_incr_classificator(self,
                               incr_datas,
                               incr_class_label,
                               test_datas,
                               test_class_label,
                               method="first"):
        """
        对增量式贝叶斯的增量集部分进行处理
        :param incr_datas: [{"emorion-1-type": value, "sentence": {}},...]
                            (emotion-1-type and sentence are optional)
        :param incr_class_label:
        :param test_datas:
        :param test_class_label:
        :return:
        """
        def func(x, y):
            block.append(fit_incr_datas[x[3] + 1:y[3], :])
            label_block.append(incr_class_label[x[3] + 1:y[3]])
            block0.append(fit_incr_datas[y[3]:y[3] + 1, :])
            return y

        def handle(clf, method):
            if method == "zero":
                return handle_zero(clf)
            elif method == "first":
                return handle_first(clf)
            elif method == "second":
                return handle_second(clf)
            elif method == "third":
                return handle_third(clf)
            elif method == "four":
                return handle_four(clf)
            elif method == "five":
                return handle_five(clf)
            else:
                pass

        def handle_zero(clf):
            """
            寻找当前分类器下预测正确的样本
            :param clf:
            :return:
            """
            incr_pre_label = clf.predict(fit_incr_datas)
            # 选出预测正确的下标
            true_index = (incr_class_label == incr_pre_label).nonzero()

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)

            res = []
            for i0 in true_index[0]:
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = incr_pre_label[i0]

                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                    c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(
                    origin_proba, test_proba)

                res.append((loss0, text0, c_pred0, i0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return res

        def handle_first(clf):
            # the original classification-loss computation
            # classification loss; the minimum wins
            loss = 9999
            # candidate text from the increment set
            text = None
            # predicted class of that candidate text
            c_pred = None
            # index of that candidate text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0:i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                        c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_my_zero_one_loss(test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_second(clf):
            # an alternative classification-loss computation
            #            predict_true = handle(clf, "zero")
            #            if predict_true:
            #                return predict_true

            # classification loss; the minimum wins
            loss = 9999
            # candidate text from the increment set
            text = None
            # predicted class of that candidate text
            c_pred = None
            # index of that candidate text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0:i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]

                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                        c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_another_zero_one_loss(
                        origin_proba, test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_third(clf):
            # TODO: how to choose a suitable threshold
            def get_fit(e0):
                # return a suitable threshold (currently a fixed value)
                return 20
#                while len((r >= e0).nonzero()[0]) == 0:
#                    e0 = int(e0 / 2)
#                return e0

            # assumes a module-level threshold variable `e`
            global e
            # class-support computation
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
            # support ratio: top probability over the runner-up
            r = np.divide(max_proba, second_max_proba)
            # threshold
            e = get_fit(e)
            # select samples whose support ratio clears the threshold
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice,
                     max_proba[indice][0]) for indice in select_indices[0]]

        def handle_third_another(clf):
            # class-support computation
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
            # support ratio: top probability over the remaining mass
            r = np.divide(max_proba, leave_proba)
            # threshold
            e = 5
            # select samples whose support ratio clears the threshold
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice,
                     max_proba[indice][0]) for indice in select_indices[0]]

        def handle_four(clf):
            # My Own Idea
            # holds the results on the test set
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                    c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # account for the predicted class itself: a sample may have a
                # high top probability both before and after the update, yet
                # the two predicted classes may differ
                smooth = np.asarray([
                    1 if origin_label[j] == label[j] else -1
                    for j in range(len(origin_label))
                ])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(
                        origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        def handle_five(clf):
            """
            Combine class support with the no-significant-difference test.
            :param clf:
            :return:
            """
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            fit_for_class_support = handle(clf, "third")
            print "The result of class-support: %d samples" % len(
                fit_for_class_support)

            #            fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
            #            print "The result of class-support: %d samples" % len(fit_for_class_support)
            # My Own Idea
            # holds the results on the test set
            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)

            for i0 in range(len(fit_for_class_support)):
                text0 = fit_for_class_support[i0][1]
                c_pred0 = fit_for_class_support[i0][2]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                    c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # account for the predicted class itself: a sample may have a
                # high top probability both before and after the update, yet
                # the two predicted classes may differ
                smooth = np.asarray([
                    1 if origin_label[j] == label[j] else -1
                    for j in range(len(origin_label))
                ])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(
                        origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        method_options = ("first", "second", "third", "four", "five")
        if method not in method_options:
            raise ValueError("method has to be one of " + str(method_options))

        print "Begin Increment Classification: ", time.strftime(
            '%Y-%m-%d %H:%M:%S')
        # write/read the classifier parameters
        dir_ = os.path.join(TEXT_OUT, "bayes_args")
        FileUtil.mkdirs(dir_)

        suffix = ".blp"
        class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
        class_log_prob_out = os.path.join(dir_,
                                          "class_log_prob_" + method + suffix)
        feature_count_out = os.path.join(dir_,
                                         "feature_count_" + method + suffix)
        feature_log_prob_out = os.path.join(
            dir_, "feature_log_prob_" + method + suffix)

        out = (class_count_out, class_log_prob_out, feature_count_out,
               feature_log_prob_out)

        if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
            if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(
                    self.bayes, "class_log_prior_"):
                raise ValueError(
                    "please call get_classificator() to train the classifier first"
                )

            fit_incr_datas = self.fit_data(incr_datas)
            incr_class_label = np.asanyarray(incr_class_label)
            # data to be appended to the key_words.txt document
            add_to_key_words = []

            i = 0
            while fit_incr_datas.nnz > 0:
                print
                print "Begin Increment Classification_%d: %s" % (
                    i, time.strftime('%Y-%m-%d %H:%M:%S'))

                need_to_update = handle(self, method)
                # if nothing can be updated, the remaining increment set does
                # not suit the current classifier, so it is discarded
                # the increment set shrinks with each update
                block = []
                label_block = []
                # the training set grows with each update
                block0 = []
                if need_to_update:
                    # sort by loss, ascending
                    accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                    for data in accord_to_loss:
                        self.bayes.update(data[2], data[1])
                    # sort by index
                    accord_to_index = sorted(need_to_update,
                                             key=lambda x: x[3])

                    #                    index = [index0[3] for index0 in accord_to_index]
                    #                    [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
                    #                    raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]

                    block0.append(test_datas)
                    reduce(func, accord_to_index, (0.0, "", "", -1))
                    block.append(fit_incr_datas[accord_to_index[-1][3] +
                                                1:, :])
                    label_block.append(
                        incr_class_label[accord_to_index[-1][3] + 1:])
                    test_datas = sp.vstack(block0)
                    print "This times updates %d samples" % len(need_to_update)
                else:
                    block.append(fit_incr_datas[0:0, :])
                    label_block.append(incr_class_label[0:0])
                    print "Finally leaving %d samples that unnecessary added to train sets" % fit_incr_datas.shape[
                        0]
                fit_incr_datas = sp.vstack(block)
                incr_class_label = np.concatenate(label_block)
                i += 1

            bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                          self.bayes.feature_count_,
                          self.bayes.feature_log_prob_)
            # persist the parameters to disk
            for arr, path in zip(bayes_args, out):
                bp.pack_ndarray_file(arr, path)
            # append to the key-words file
#            path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
#            FileUtil.write(path, add_to_key_words, "a")
        else:
            # speed up
            self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
            self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
            self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
            self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])


#            self.bayes.class_count_ = np.loadtxt(out[0])
#            self.bayes.class_log_prior_ = np.loadtxt(out[1])
#            self.bayes.feature_count_ = np.loadtxt(out[2])
#            self.bayes.feature_log_prob_ = np.loadtxt(out[3])

        print "Increment Classification Done: ", time.strftime(
            '%Y-%m-%d %H:%M:%S')
        print
        return self
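
The class-support rule in handle_third keeps only samples whose top class probability dominates the runner-up by at least the threshold e; a small numeric sketch of the selection:

import numpy as np

proba = np.array([[0.90, 0.05, 0.05],    # confident row: ratio 18.0
                  [0.40, 0.35, 0.25]])   # ambiguous row: ratio ~1.14
max_proba = np.max(proba, axis=1).reshape(-1, 1)
second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
r = np.divide(max_proba, second_max_proba)
selected = (r >= 5).nonzero()[0]         # only row 0 clears the threshold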