def write_to_file(path, data, call):
    parent_dir = FileUtil.getparentdir(path)
    FileUtil.mkdirs(parent_dir)
    with open(path, "w") as fp:
        s = call(data)
        if isinstance(s, basestring):
            fp.write(s)
        else:
            for s0 in s:
                fp.write(s0)
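# A minimal usage sketch (hypothetical path and callback): `call` maps the
# raw data to either a single string or an iterable of strings.
#
#     def to_lines(rows):
#         return [",".join(row) + "\n" for row in rows]
#
#     write_to_file("out/result.csv", [["a", "1"], ["b", "2"]], to_lines)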
def __process_img(self, img_urls):
    dir_ = os.path.join(ImageClassification.image_train_path, "img")
    FileUtil.mkdirs(dir_)

    def copy_img(img_url):
        filename = os.path.split(img_url)[1]
        filepath = os.path.join(dir_, filename)
        cv2.imwrite(filepath, cv2.imread(img_url))
        return filepath

    return [copy_img(img_url) for img_url in img_urls]
def __save_result(self, sentences):
    FileUtil.mkdirs(ImageClassification.image_train_path)
    current = time.strftime('%Y-%m-%d %H:%M:%S')
    out = os.path.join(ImageClassification.image_train_path, current + ".txt")
    with open(out, "w") as fp:
        for sentence in sentences:
            s = ("sentence:" + sentence.get("sentence") + "\n" +
                 "img:" + ",".join(sentence.get("img")) + "\n" +
                 "label:" + sentence.get("label") + "\n")
            fp.write(s)
def process_img(urls):
    dir_ = os.path.join(RESOURCE_BASE_URL, "collect/img")
    FileUtil.mkdirs(dir_)
    if isinstance(urls, basestring):
        urls = [[urls]]
    filepath = []
    for row in urls:
        p = []
        for url in row:
            filename = FileUtil.getfilename(url)
            filepath0 = os.path.join(dir_, filename)
            try:
                r = requests.get(url, timeout=1)
                if r.status_code == 200:
                    img = Image.open(StringIO(r.content))
                    if img.mode != "RGB":
                        img = img.convert("RGB")
                    img.save(filepath0)
                    p.append(filepath0)
            except (requests.exceptions.Timeout,
                    requests.exceptions.ConnectionError):
                print "Timeout"
        filepath.append(p)
    return filepath
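# Usage sketch (hypothetical URLs): a bare string is wrapped into a single
# row, and one list of saved local paths is returned per input row.
#
#     rows = process_img([["http://example.com/a.jpg",
#                          "http://example.com/b.jpg"]])
#     single = process_img("http://example.com/a.jpg")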
def classifict(feature, sentences, incr=False, out=False):
    if isinstance(sentences, basestring):
        sentences = [sentences]

    # get the subjective/objective classifier
    feature.subjective = False
    objective_clf = get_objective_classification(feature)

    # test set: subjective/objective part
    test_datas_objective, c_true_objective, danger_index_objective = \
        feature.get_key_words(sentences)
    test_objective = test_datas_objective
    if not sp.issparse(test_datas_objective):
        test_objective = feature.cal_weight_improve(test_datas_objective,
                                                    c_true_objective)
    c_pred_objective = objective_clf.predict(test_objective)

    # get the emotion classifier
    feature.subjective = True
    emotion_clf = get_emotion_classification(feature, incr=incr)

    # test set: emotion part
    test_datas, c_true, danger_index = feature.get_key_words(sentences)
    test = test_datas
    if not sp.issparse(test_datas):
        test = feature.cal_weight_improve(test_datas, c_true)

    c_pred = []
    for i in range(len(sentences)):
        if i not in danger_index_objective and i not in danger_index:
            before_i_in_danger_obj = np.sum(np.asarray(danger_index_objective) < i)
            before_i_in_danger_ = np.sum(np.asarray(danger_index) < i)
            c = emotion_clf.predict(test[i - before_i_in_danger_])[0] \
                if c_pred_objective[i - before_i_in_danger_obj] == "Y" \
                else c_pred_objective[i - before_i_in_danger_obj]
            c_pred.append(c)

    if out:
        dir_ = os.path.join(OUT_BASE_URL, "out0")
        FileUtil.mkdirs(dir_)
        current = time.strftime('%Y-%m-%d %H:%M:%S')
        o = os.path.join(dir_, current + ".xml")
        with open(o, "w") as fp:
            for i, s in enumerate(sentences):
                if i not in danger_index_objective and i not in danger_index:
                    before_i_in_danger_obj = np.sum(np.asarray(danger_index_objective) < i)
                    before_i_in_danger_ = np.sum(np.asarray(danger_index) < i)
                    fp.write(
                        """<weibo emotion-type="%s">
    <sentence emotion-1-type="%s" emotion-2-type="none" emotion-tag="%s">
        %s
    </sentence>
</weibo>
""" % (c_pred[i - before_i_in_danger_],
       c_pred[i - before_i_in_danger_],
       "N" if c_pred_objective[i - before_i_in_danger_obj] == "N" else "Y",
       s))
                else:
                    fp.write(
                        """<weibo emotion-type="%s">
    <sentence emotion-1-type="%s" emotion-2-type="none" emotion-tag="%s">
        %s
    </sentence>
</weibo>
""" % ("None", "None", "N",
       s + "\n Can't recognize because it has insufficient key_words"))
    else:
        print c_pred
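# A hedged usage sketch, assuming a feature extractor (e.g. the CHIFeature
# referenced elsewhere in this module) that exposes the attributes used
# above (`subjective`, `get_key_words`, `cal_weight_improve`); with
# out=True the predictions are written as XML under OUT_BASE_URL/out0:
#
#     feature = CHIFeature()
#     classifict(feature, [u"今天真开心", u"下雨了"], incr=True, out=True)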
def get_incr_classificator_thread(self, incr_datas, incr_class_label,
                                  test_datas, test_class_label):
    """
    Process the increment set for the incremental Bayes classifier (threaded version).
    :param incr_datas: [{"emotion-1-type": value, "sentence": {}},...]
        (emotion-1-type and sentence are optional)
    :param incr_class_label:
    :param test_datas:
    :param test_class_label:
    :return:
    """
    def func1(i0):
        c_true0 = incr_class_label[i0:i0 + 1][0]
        text0 = fit_incr_datas.getrow(i0)
        c_pred0 = self.predict(text0)[0]
        if c_true0 == c_pred0:
            loss0 = 0
        else:
            clf0 = copy.deepcopy(self)
            clf0.bayes.class_log_prior_, clf0.bayes.feature_log_prob_ = \
                clf0.bayes.update(c_pred0, text0, copy=True)
            loss0 = clf0.metrics_my_zero_one_loss(test_datas)
            # clf0.bayes.class_log_prior_ = origin_class_log_prob_
            # clf0.bayes.feature_log_prob_ = origin_feature_log_prob_
        with lock1:
            text.append(text0)
            c_pred.append(c_pred0)
            loss.append(loss0)

    def func(i0):
        c_true0 = incr_class_label[i0:i0 + 1][0]
        text0 = fit_incr_datas.getrow(i0)
        c_pred0 = self.predict(text0)[0]
        if c_true0 == c_pred0:
            loss0 = 0
        else:
            with lock0:
                self.bayes.class_log_prior_, self.bayes.feature_log_prob_ = \
                    self.bayes.update(c_pred0, text0, copy=True)
                loss0 = self.metrics_my_zero_one_loss(test_datas)
                self.bayes.class_log_prior_ = origin_class_log_prob_
                self.bayes.feature_log_prob_ = origin_feature_log_prob_
        with lock1:
            text.append(text0)
            c_pred.append(c_pred0)
            loss.append(loss0)

    print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
    # write/read the Bayes parameters
    dir_ = os.path.join(TEXT_OUT, "bayes_args")
    FileUtil.mkdirs(dir_)
    class_count_out = os.path.join(dir_, "class_count.txt")
    class_log_prob_out = os.path.join(dir_, "class_log_prob.txt")
    feature_count_out = os.path.join(dir_, "feature_count.txt")
    feature_log_prob_out = os.path.join(dir_, "feature_log_prob.txt")
    out = (class_count_out, class_log_prob_out,
           feature_count_out, feature_log_prob_out)

    if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
        if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
            raise ValueError("please call get_classificator() to build the classifier first")
        fit_incr_datas = self.fit_data(incr_datas)
        n_samples, _ = fit_incr_datas.shape
        incr_class_label = np.array(incr_class_label)
        lock0 = threading.Lock()
        lock1 = threading.Lock()
        # thread pool
        poolsize = 30
        pool = ThreadPool(poolsize)

        for i in range(n_samples):
            if i % 5 == 0:
                print "Begin Increment Classification_%d: %s" % (
                    i / 5, time.strftime('%Y-%m-%d %H:%M:%S'))
            # classification losses; the minimum is chosen
            loss = []
            # texts in the increment set preferred for updating the classifier parameters
            text = []
            # the predicted labels of those texts
            c_pred = []
            # the indices of those texts
            # index = 0
            origin_class_log_prob_ = self.bayes.class_log_prior_
            origin_feature_log_prob_ = self.bayes.feature_log_prob_
            # thread pool
            requests = makeRequests(func, range(fit_incr_datas.shape[0]))
            for req in requests:
                pool.putRequest(req)
            pool.wait()
            # for i0 in range(fit_incr_datas.shape[0]):
            #     threading.Thread(target=func, args=(i0, )).start()
            minindex = np.argmin(loss)
            self.bayes.update(c_pred[minindex], text[minindex])
            fit_incr_datas = sp.vstack([fit_incr_datas[:minindex, :],
                                        fit_incr_datas[minindex + 1:, :]])

        bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                      self.bayes.feature_count_, self.bayes.feature_log_prob_)
        map(lambda x: np.savetxt(x[0], x[1]), zip(out, bayes_args))
    else:
        self.bayes.class_count_ = np.loadtxt(out[0])
        self.bayes.class_log_prior_ = np.loadtxt(out[1])
        self.bayes.feature_count_ = np.loadtxt(out[2])
        self.bayes.feature_log_prob_ = np.loadtxt(out[3])

    print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    return self
def cross_validation(self, train_datas, class_label, score='precision'):
    """
    K-Fold Cross Validation
    Uses cross validation to tune the Bayes parameters: it picks the
    train/test split with the best score, so the split does not have to
    be chosen in advance; cross validation takes care of it.
    :param train_datas:
    :param class_label:
    :param score:
    :return:
    """
    score_options = ('precision', 'recall', 'f1', 'accuracy')
    if score not in score_options:
        raise ValueError('score has to be one of ' + str(score_options))

    # fit data
    fit_train_datas = self.fit_data(train_datas)
    n_samples = fit_train_datas.shape[0]
    class_label = np.array(class_label)

    max_result = []
    max_index = []
    max_ = 0
    i = 0
    while max_ < 0.6 and i <= 200:
        i += 1
        print "Seeking %d; max: %f; %s" % (
            i, max_, time.strftime('%Y-%m-%d %H:%M:%S'))
        result = []
        index = []
        cv = cross_validation.KFold(n_samples, n_folds=4, shuffle=True)
        for train_index, test_index in cv:
            train0, train0_label = fit_train_datas[train_index], class_label[train_index]
            test0, test0_label = fit_train_datas[test_index], class_label[test_index]
            self.get_classificator(train0, train0_label)
            c_pred0 = self.predict(test0)
            if score == "precision":
                result.append(self.metrics_precision(test0_label, c_pred0))
            elif score == "recall":
                result.append(self.metrics_recall(test0_label, c_pred0))
            elif score == "f1":
                result.append(self.metrics_f1(test0_label, c_pred0))
            else:
                result.append(self.metrics_accuracy(test0_label, c_pred0))
            index.append((train_index, test_index))
        max_ = max(result)
        max_result.append(max_)
        max_index.append(index[np.argmax(result)])

    argmax = np.argmax(max_result)
    print "Seeking Done; max: %f; %s" % (
        max_result[argmax], time.strftime('%Y-%m-%d %H:%M:%S'))

    # retrain once on the best split to obtain the optimal parameters
    self.get_classificator(fit_train_datas[max_index[argmax][0]],
                           class_label[max_index[argmax][0]])

    dir_ = os.path.join(TEXT_OUT, "best_train_test_index")
    FileUtil.mkdirs(dir_)
    train_index_out = os.path.join(dir_, "train_index.txt")
    test_index_out = os.path.join(dir_, "test_index.txt")
    map(lambda x: np.savetxt(x[0], x[1], fmt="%d"),
        zip((train_index_out, test_index_out), max_index[argmax]))
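# A hedged usage sketch (clf is assumed to be an instance of this Bayes
# wrapper, with fit_data/get_classificator/predict available as above):
#
#     clf.cross_validation(train_datas, class_label, score="f1")
#
# The loop keeps drawing fresh 4-fold splits until some fold reaches a
# score of 0.6 or 200 rounds pass, then refits on the best split found
# and saves its train/test indices under TEXT_OUT.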
def get_incr_classificator(self, incr_datas, incr_class_label, test_datas,
                           test_class_label, method="first"):
    """
    Process the increment set for the incremental Bayes classifier.
    :param incr_datas: [{"emotion-1-type": value, "sentence": {}},...]
        (emotion-1-type and sentence are optional)
    :param incr_class_label:
    :param test_datas:
    :param test_class_label:
    :return:
    """
    def func(x, y):
        block.append(fit_incr_datas[x[3] + 1:y[3], :])
        label_block.append(incr_class_label[x[3] + 1:y[3]])
        block0.append(fit_incr_datas[y[3]:y[3] + 1, :])
        return y

    def handle(clf, method):
        if method == "zero":
            return handle_zero(clf)
        elif method == "first":
            return handle_first(clf)
        elif method == "second":
            return handle_second(clf)
        elif method == "third":
            return handle_third(clf)
        elif method == "four":
            return handle_four(clf)
        elif method == "five":
            return handle_five(clf)
        else:
            pass

    def handle_zero(clf):
        """
        Find the samples that the current classifier already predicts correctly.
        :param clf:
        :return:
        """
        incr_pre_label = clf.predict(fit_incr_datas)
        # indices of the correct predictions
        true_index = (incr_class_label == incr_pre_label).nonzero()

        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)

        res = []
        for i0 in true_index[0]:
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = incr_pre_label[i0]
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = \
                clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            res.append((loss0, text0, c_pred0, i0))

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return res

    def handle_first(clf):
        # the original way of computing the classification loss
        # classification loss; the minimum is chosen
        loss = 9999
        # the text in the increment set preferred for updating the classifier parameters
        text = None
        # the predicted label of that text
        c_pred = None
        # the index of that text
        index = 0
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0:i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]

            if c_true0 == c_pred0:
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = \
                    clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_my_zero_one_loss(test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_second(clf):
        # an alternative way of computing the classification loss
        # predict_true = handle(clf, "zero")
        # if predict_true:
        #     return predict_true

        # classification loss; the minimum is chosen
        loss = 9999
        # the text in the increment set preferred for updating the classifier parameters
        text = None
        # the predicted label of that text
        c_pred = None
        # the index of that text
        index = 0
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        for i0 in range(fit_incr_datas.shape[0]):
            c_true0 = incr_class_label[i0:i0 + 1][0]
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]

            if c_true0 == c_pred0:
                loss = 0
                text = text0
                c_pred = c_pred0
                index = i0
                break
            else:
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = \
                    clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                if loss0 < loss:
                    loss = loss0
                    text = text0
                    c_pred = c_pred0
                    index = i0

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        return [(loss, text, c_pred, index)]

    def handle_third(clf):
        # todo: how to choose a proper threshold
        def get_fit(e0):
            # find a suitable threshold
            return 20
            # while len((r >= e0).nonzero()[0]) == 0:
            #     e0 = int(e0 / 2)
            # return e0

        global e
        # class-support computation
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
        # support
        r = np.divide(max_proba, second_max_proba)
        # threshold
        e = get_fit(e)
        # select
        select_indices = (r >= e).nonzero()
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice,
                 max_proba[indice][0]) for indice in select_indices[0]]

    def handle_third_another(clf):
        # class-support computation
        proba = clf.predict_proba(fit_incr_datas)
        label = clf.predict(fit_incr_datas)
        max_proba = np.max(proba, axis=1).reshape(-1, 1)
        leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
        # support
        r = np.divide(max_proba, leave_proba)
        # threshold
        e = 5
        # select
        select_indices = (r >= e).nonzero()
        return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice,
                 max_proba[indice][0]) for indice in select_indices[0]]

    def handle_four(clf):
        # my own idea
        # store the test results
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true

        f_res = []
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)

        for i0 in range(fit_incr_datas.shape[0]):
            text0 = fit_incr_datas.getrow(i0)
            c_pred0 = clf.predict(text0)[0]
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = \
                clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # take the predicted class into account: a sample may have a
            # high max probability both before and after the update, yet
            # the two predicted classes may differ
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1
                                 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)

            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    def handle_five(clf):
        """
        Combine class support with the no-significant-difference test.
        :param clf:
        :return:
        """
        predict_true = handle(clf, "zero")
        if predict_true:
            return predict_true

        fit_for_class_support = handle(clf, "third")
        print "The result of class-support: %d samples" % len(fit_for_class_support)

        # fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
        # print "The result of class-support: %d samples" % len(fit_for_class_support)

        # my own idea
        # store the test results
        f_res = []
        origin_class_log_prob_ = clf.bayes.class_log_prior_
        origin_feature_log_prob_ = clf.bayes.feature_log_prob_
        origin_proba = clf.predict_max_proba(test_datas)
        origin_label = clf.predict(test_datas)

        for i0 in range(len(fit_for_class_support)):
            text0 = fit_for_class_support[i0][1]
            c_pred0 = fit_for_class_support[i0][2]
            clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = \
                clf.bayes.update(c_pred0, text0, copy=True)
            test_proba = clf.predict_max_proba(test_datas)
            label = clf.predict(test_datas)
            # take the predicted class into account (see handle_four)
            smooth = np.asarray([1 if origin_label[j] == label[j] else -1
                                 for j in range(len(origin_label))])
            np.multiply(test_proba, smooth, test_proba)

            f_test0 = pair_test(origin_proba, test_proba)
            if f_test0:
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
            else:
                loss0 = -1
            f_res.append((loss0, text0, c_pred0, i0, f_test0))

            clf.bayes.class_log_prior_ = origin_class_log_prob_
            clf.bayes.feature_log_prob_ = origin_feature_log_prob_
        res = filter(lambda x: x[4], f_res)
        return [(r[0], r[1], r[2], r[3]) for r in res]

    method_options = ("first", "second", "third", "four", "five")
    if method not in method_options:
        raise ValueError("method has to be one of " + str(method_options))

    print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
    # write/read the Bayes parameters
    dir_ = os.path.join(TEXT_OUT, "bayes_args")
    FileUtil.mkdirs(dir_)
    suffix = ".blp"
    class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
    class_log_prob_out = os.path.join(dir_, "class_log_prob_" + method + suffix)
    feature_count_out = os.path.join(dir_, "feature_count_" + method + suffix)
    feature_log_prob_out = os.path.join(dir_, "feature_log_prob_" + method + suffix)
    out = (class_count_out, class_log_prob_out,
           feature_count_out, feature_log_prob_out)

    if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
        if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
            raise ValueError("please call get_classificator() to build the classifier first")
        fit_incr_datas = self.fit_data(incr_datas)
        incr_class_label = np.asanyarray(incr_class_label)
        # data that needs to be appended to key_words.txt
        add_to_key_words = []

        i = 0
        while fit_incr_datas.nnz > 0:
            print
            print "Begin Increment Classification_%d: %s" % (
                i, time.strftime('%Y-%m-%d %H:%M:%S'))

            need_to_update = handle(self, method)
            # if nothing can be updated, the remaining increment set does
            # not suit the current classifier, so it is discarded
            # the increment set shrinks with every update
            block = []
            label_block = []
            # the training set grows with every update
            block0 = []
            if need_to_update:
                # sort by loss, ascending
                accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                for data in accord_to_loss:
                    self.bayes.update(data[2], data[1])
                # sort by index
                accord_to_index = sorted(need_to_update, key=lambda x: x[3])

                # index = [index0[3] for index0 in accord_to_index]
                # [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
                # raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]

                block0.append(test_datas)
                reduce(func, accord_to_index, (0.0, "", "", -1))
                block.append(fit_incr_datas[accord_to_index[-1][3] + 1:, :])
                label_block.append(incr_class_label[accord_to_index[-1][3] + 1:])
                test_datas = sp.vstack(block0)
                print "This round updates %d samples" % len(need_to_update)
            else:
                block.append(fit_incr_datas[0:0, :])
                label_block.append(incr_class_label[0:0])
                print "Finally leaving %d samples that need not be added to the train set" % fit_incr_datas.shape[0]

            fit_incr_datas = sp.vstack(block)
            incr_class_label = np.concatenate(label_block)
            i += 1

        bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                      self.bayes.feature_count_, self.bayes.feature_log_prob_)
        # save to file
        map(lambda x: bp.pack_ndarray_file(x[0], x[1]), zip(bayes_args, out))
        # append
        # path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
        # FileUtil.write(path, add_to_key_words, "a")
    else:
        # speed up
        self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
        self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
        self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
        self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])
        # self.bayes.class_count_ = np.loadtxt(out[0])
        # self.bayes.class_log_prior_ = np.loadtxt(out[1])
        # self.bayes.feature_count_ = np.loadtxt(out[2])
        # self.bayes.feature_log_prob_ = np.loadtxt(out[3])

    print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
    print
    return self
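# A hedged usage sketch (hypothetical data; `method` picks one of the
# handle_* selection strategies defined above):
#
#     clf = clf.get_incr_classificator(incr_datas, incr_class_label,
#                                      test_datas, test_class_label,
#                                      method="five")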