def init_bayes_model(category_tree: Category, documents_size: int, vocab_size: int):
    """Initialize the parameters used by the Bayes model.

    :param category_tree: root node of the category tree
    :param documents_size: number of documents
    :param vocab_size: number of distinct words
    :return: tuple of
        P(C)   -> (category_size,)
        P(C|D) -> (category_size, documents_size)
        P(W|C) -> (vocab_size, category_size)
    """
    categories = category_tree.get_category_list()
    n_categories = len(categories)

    # P(C): category prior probabilities.
    p_c = np.zeros(n_categories)
    # P(C|D): built as (documents, categories) first, transposed at the end.
    p_c_d = np.zeros([documents_size, n_categories])

    # Seed P(C) and P(C|D) from the pre-labelled documents.
    logging.info("参数初始化")
    for idx, cat in tqdm(enumerate(categories)):
        docs = category_tree.find_category(cat.split("/")).get_documents()
        for doc_index in docs:
            p_c_d[doc_index, idx] = 1.0
        # Laplace (add-one) smoothing over the labelled-document counts.
        p_c[idx] = (1.0 + len(docs)) / (n_categories + documents_size)

    # Transpose so later matrix products line up: (categories, documents).
    p_c_d = p_c_d.T

    # P(W|C): left at zero here; estimated during training.
    p_w_c = np.zeros([vocab_size, len(categories)])

    logging.info("预标注比例: {}/{}".format(
        int(p_c_d.sum()), documents_size))
    return p_c, p_c_d, p_w_c
def hierarchical_shrinkage_init(category_tree: Category, document_vectors):
    """Initialize the parameters for the shrinkage step.

    Shrinkage exploits the category hierarchy to mitigate feature sparsity:
    word probabilities of ancestor categories are weighted and accumulated
    onto their descendants, e.g.

        1/|V|(λ4) <- ROOT(λ3) <- News(λ2) <- Intl. News(λ1) <- Economic News(λ0)

    :param category_tree: root node of the category tree
    :param document_vectors: document-by-word frequency matrix
    :return: tuple of
        λ          -> (category_size, max_depth + 2)
        β          -> (documents_size, category_size, max_depth + 2)
        P^{α}(W|C) -> (vocab_size, category_size, max_depth + 2)
    """
    logging.info("初始化shrinkage参数")
    max_depth = Category.get_max_depth(category_tree)
    categories = category_tree.get_category_list()
    n_categories = len(categories)
    n_lambdas = max_depth + 2

    # λ: each row gets a uniform weight 1/(depth + 2) spread over the
    # node's own ancestor chain (first `depth` slots) plus the ROOT and
    # uniform-1/|V| components, which live in the last two columns.
    lambdas = np.zeros([n_categories, n_lambdas])
    for row, path in enumerate(categories):
        node = category_tree.find_category(path.split("/"))
        depth = node.get_depth()
        weight = 1.0 / (depth + 2)
        lambdas[row, :depth] = weight
        lambdas[row, max_depth:] = weight

    # β and P^{α}(W|C): allocated zeroed; filled in later by the EM loop.
    documents_size, vocab_size = document_vectors.shape
    betas = np.zeros([documents_size, n_categories, n_lambdas])
    p_w_c_alpha = np.zeros([vocab_size, n_categories, n_lambdas])
    return lambdas, betas, p_w_c_alpha