print(topic)

doc_topics = btn_ok(topics)
doc_topics = sorted(doc_topics, key=lambda d: d['topic'])
print('\n\ndocument\n\n')
for doc in doc_topics:
    print(doc)
# n_topic = 3 because topics = ... (line 52)


if __name__ == '__main__':
    # import json
    # data = DataAccess().get_documents()
    # json.dump(data, open('data.json', 'w', encoding='utf-8'), ensure_ascii=False)
    data = DataAccess().get_documents()[:10]

    # vn_segment_word = ['a_b a_b b_a b_a', 'd_c c_d c_d d_c', 'b_a a_b b_a', 'c_d c_d d_c d_c c_d']
    vn_segment_word = [d['content'] for d in data]

    tv = TfidfVectorizer(max_df=1.0, min_df=0, max_features=5000)
    X = tv.fit_transform(vn_segment_word)

    # n_topics was renamed to n_components in scikit-learn; passing both raises an error.
    lda_model = LatentDirichletAllocation(n_components=10, max_iter=500,
                                          learning_method='batch', n_jobs=-1)
    lda_output = lda_model.fit_transform(X)
    print(lda_output)
    # n_topics is not a valid LatentDirichletAllocation parameter (and a list of
    # strings is not a topic count), so print the current parameters instead:
    print(lda_model.get_params())
    print(lda_model.transform(X[:5]))
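# --- Hedged sketch: inspecting the topics of the fitted model ---------------
# The block above fits an LDA model but only prints the raw document-topic
# matrix. A minimal sketch, assuming the fitted lda_model and the tv vectorizer
# from the __main__ block above, showing how the top-weighted terms of each
# topic can be listed from lda_model.components_. The helper name and
# n_top_words value are illustrative only.
import numpy as np


def print_top_words(lda_model, feature_names, n_top_words=10):
    """Print the n_top_words highest-weighted terms of every topic."""
    for topic_idx, topic in enumerate(lda_model.components_):
        top_ids = np.argsort(topic)[::-1][:n_top_words]
        print("Topic #%d: %s" % (topic_idx,
                                 " ".join(feature_names[i] for i in top_ids)))

# Usage (assuming tv and lda_model from the block above):
# print_top_words(lda_model, tv.get_feature_names_out(), n_top_words=10)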
def LDA(train_size, random_state):
    """
    Classification pipeline with LDA preprocessing.

    Inputs:
    - train_size (int): number of training samples.
    - random_state (int): seed for random number generators.

    Output: (None)
    """
    subset = 'subset_%s' % train_size
    input_dir = INPUT_DIR / subset
    model_dir = OUTPUT_DIR / subset
    createDirs(model_dir)

    X_train, X_test, y_train, y_test = loadClean(input_dir)
    X_train_sub, X_val, y_train_sub, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=random_state)
    scaler = StandardScaler()

    best_params = []
    best_k, best_auc, best_acc = None, 0, 0
    for k in K:
        model_name = "lda_%s.joblib" % k
        try:
            lda = load(model_dir / model_name)
            # logger.info("\t\tk = %s, fitted LDA model loaded." % k)
        except FileNotFoundError:
            lda = LatentDirichletAllocation(n_components=k, doc_topic_prior=50 / k,
                                            topic_word_prior=0.01, n_jobs=-1,
                                            random_state=random_state)
            lda.fit(X_train)
            dump(lda, model_dir / model_name)
        X_train_ = scaler.fit_transform(lda.transform(X_train_sub))
        X_val_ = scaler.transform(lda.transform(X_val))
        clf_val = LogisticRegressionVal(X_train_, y_train_sub, X_val_, y_val,
                                        k, random_state=random_state)
        best_k, best_auc, best_acc, best_params = clf_val.tune(
            best_k, best_auc, best_acc, best_params)

    clf, file_name, header = clf_val.bestClassifier(best_params)
    lda.set_params(**{'n_components': best_k, 'doc_topic_prior': 50 / best_k})
    preprocess = make_pipeline(lda, scaler)
    tr_time, tr_metrics, test_time, test_metrics = evaluate(
        preprocess, clf, X_train, y_train, X_test, y_test)
    writeResults(file_name, header, 'lda', train_size, best_k, best_params,
                 tr_time, tr_metrics, test_time, test_metrics)
    logger.info(("\tFor training size = %s, best number of topics k = %s, "
                 "best parameter grid: %s (train AUC: {:.3f}, train acc: {:.3f}; "
                 "test AUC: {:.3f}, test acc: {:.3f})")
                .format(tr_metrics[0], tr_metrics[1],
                        test_metrics[0], test_metrics[1])
                % (train_size, best_k, best_params))
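# --- Hedged sketch: the same preprocessing expressed as one Pipeline --------
# The function above fits LDA, scales the topic proportions, and tunes a
# logistic regression by hand. As a point of comparison only, the sketch below
# wires the same three stages into a single scikit-learn Pipeline; the
# hyperparameter values are illustrative, not the ones selected by
# LogisticRegressionVal.
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


def build_lda_clf(k=20, random_state=0):
    """Return an (unfitted) LDA -> scaler -> logistic-regression pipeline."""
    return make_pipeline(
        LatentDirichletAllocation(n_components=k,
                                  doc_topic_prior=50 / k,
                                  topic_word_prior=0.01,
                                  n_jobs=-1,
                                  random_state=random_state),
        StandardScaler(),
        LogisticRegression(max_iter=1000, random_state=random_state),
    )

# Usage (assuming X_train/y_train and X_test/y_test as loaded by loadClean above):
# clf = build_lda_clf(k=20).fit(X_train, y_train)
# print(clf.score(X_test, y_test))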
timestamp = time.time()
print("Loading stopword list...")
load_stopwords(swlist)
print("Stopword list loaded in", time.time() - timestamp, "s")

timestamp = time.time()
print("Tokenizing...")
lemmatizer = WordNetLemmatizer()
load_corpus(expected_tags, lemmatizer, swlist, corpus)
print("Tokenization took", time.time() - timestamp, "s")

tf_vectorizer = CountVectorizer(stop_words="english", lowercase=False)
word_freq = tf_vectorizer.fit_transform(corpus)
# note: in scikit-learn >= 1.2 this method is get_feature_names_out()
tf_feature_names = tf_vectorizer.get_feature_names()
print("Total number of terms in the corpus:", len(tf_feature_names))

lda = LatentDirichletAllocation(max_iter=50, doc_topic_prior=0.5,
                                topic_word_prior=0.1, learning_method="batch",
                                random_state=0)
for n_topics in [5, 10, 20]:
    lda.set_params(n_components=n_topics)
    params = lda.get_params(False)
    print("\nLDA model parameters:")
    for key, value in params.items():
        print(key, "<-", value)

    timestamp = time.time()
    print("Training LDA (n_components = " + str(n_topics) + ")...")
    lda.fit(word_freq)
    print("LDA (n_components = " + str(n_topics) + ") trained in:",
          time.time() - timestamp, "s")

    print("Writing results to ../../output_python/topic" + str(n_topics) +
          "/topic-top" + str(n_top_words) + "keywords.txt")
    save_top_topciwords(lda, tf_feature_names, n_top_words)

print("\nFinished at:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
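# --- Hedged sketch: a possible implementation of save_top_topciwords --------
# save_top_topciwords is called above but not defined in this section. A
# plausible stand-in, writing the n_top_words strongest terms of every topic to
# the path the script prints, could look like the following. The directory
# layout ("../../output_python/topic<k>/...") mirrors the print statement above
# and is otherwise an assumption; the function name here is hypothetical.
import os


def save_top_topicwords_sketch(lda, feature_names, n_top_words):
    """Write the top-n keywords of every topic to a text file (illustrative)."""
    out_dir = "../../output_python/topic%d" % lda.n_components
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, "topic-top%dkeywords.txt" % n_top_words)
    with open(out_path, "w", encoding="utf-8") as fout:
        for topic_idx, topic in enumerate(lda.components_):
            top_ids = topic.argsort()[:-n_top_words - 1:-1]
            words = " ".join(feature_names[i] for i in top_ids)
            fout.write("Topic #%d: %s\n" % (topic_idx, words))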
class LDA(GenericModel):
    def __init__(self, **kwargs):
        self._corpus_matrix = None
        self._query_vector = None
        self.vectorizer = None
        self.lda_model = LatentDirichletAllocation(n_jobs=-1)

        super().__init__()

        self.similarity_measure = None
        self.set_basic_params(**kwargs)
        self.set_vectorizer(**kwargs)
        self.set_lda_model(**kwargs)

    def set_name(self, name):
        super().set_name(name)

    def set_model_gen_name(self, gen_name):
        super().set_model_gen_name(gen_name)

    def set_basic_params(self, **kwargs):
        self.set_name('LDA' if LDA_Model_Hyperp.NAME.value not in kwargs.keys()
                      else kwargs[LDA_Model_Hyperp.NAME.value])
        self.set_model_gen_name('lda')
        self.set_similarity_measure(
            sm.SimilarityMeasure.COSINE
            if LDA_Model_Hyperp.SIMILARITY_MEASURE.value not in kwargs.keys()
            else kwargs[LDA_Model_Hyperp.SIMILARITY_MEASURE.value])

    def set_similarity_measure(self, sim_measure):
        self.similarity_measure = sim_measure

    def set_vectorizer(self, **kwargs):
        self.vectorizer = TfidfVectorizer(
            stop_words='english', use_idf=True, smooth_idf=True
        ) if LDA_Model_Hyperp.VECTORIZER.value not in kwargs.keys() \
            else kwargs[LDA_Model_Hyperp.VECTORIZER.value]
        vec_params = {key.split('__')[2]: value
                      for key, value in kwargs.items()
                      if '__vectorizer__' in key}
        self.vectorizer.set_params(**vec_params)

    def set_lda_model(self, **kwargs):
        lda_model_params = {key.split('__')[2]: value
                            for key, value in kwargs.items()
                            if '__lda_model__' in key}
        self.lda_model.set_params(**lda_model_params)

    def recover_links(self, corpus, query, test_cases_names, bug_reports_names):
        self._corpus_matrix = self.vectorizer.fit_transform(corpus)
        self._query_vector = self.vectorizer.transform(query)

        self.out_1 = self.lda_model.fit_transform(self._corpus_matrix)
        self.out_2 = self.lda_model.transform(self._query_vector)

        metric = self.similarity_measure
        if metric == sm.SimilarityMeasure.COSINE:
            self._sim_matrix = pairwise.cosine_similarity(X=self.out_1, Y=self.out_2)
        elif metric == sm.SimilarityMeasure.JSD:
            self._sim_matrix = pairwise_distances(X=self.out_1, Y=self.out_2,
                                                  metric=SimilarityMeasure.jsd)
        elif metric == sm.SimilarityMeasure.EUCLIDIAN_DISTANCE:
            self._sim_matrix = pairwise_distances(X=self.out_1, Y=self.out_2,
                                                  metric='euclidean')

        # self._sim_matrix = super().normalize_sim_matrix(self._sim_matrix)
        self._sim_matrix = pd.DataFrame(data=self._sim_matrix,
                                        index=test_cases_names,
                                        columns=bug_reports_names)
        self._record_docs_feats(corpus, query, test_cases_names, bug_reports_names)

    def _record_docs_feats(self, corpus, query, test_cases_names, bug_reports_names):
        self.mrw_tcs = self._recover_mrw_list(test_cases_names, corpus)
        self.mrw_brs = self._recover_mrw_list(bug_reports_names, query)
        self.dl_tcs = self._recover_dl_list(test_cases_names, corpus)
        self.dl_brs = self._recover_dl_list(bug_reports_names, query)

        index = list(test_cases_names) + list(bug_reports_names)
        self.docs_feats_df = pd.DataFrame(index=index, columns=['mrw', 'dl'])

        for tc_name, mrw in self.mrw_tcs:
            self.docs_feats_df.at[tc_name, 'mrw'] = mrw
        for tc_name, dl in self.dl_tcs:
            self.docs_feats_df.at[tc_name, 'dl'] = dl
        for br_name, mrw in self.mrw_brs:
            self.docs_feats_df.at[br_name, 'mrw'] = mrw
        for br_name, dl in self.dl_brs:
            self.docs_feats_df.at[br_name, 'dl'] = dl

    def _recover_dl_list(self, artf_names, artf_descs):
        tokenizer = PorterStemmerBased_Tokenizer()
        dl_list = []
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            dl_list.append((artf_name, len(tokenizer(artf_desc))))
        return dl_list

    def _recover_mrw_list(self, artf_names, artf_descs):
        N_REL_WORDS = 6
        mrw_list = []
        # list of tuples (artf_name, mrw_list={})
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            X = self.vectorizer.transform([artf_desc])
            df1 = pd.DataFrame(X.T.toarray())
            df1['token'] = self.vectorizer.get_feature_names()
            df1.sort_values(by=0, ascending=False, inplace=True)
            mrw = list(df1.iloc[0:N_REL_WORDS, 1].values)
            mrw_list.append((artf_name, mrw))
        return mrw_list

    def model_setup(self):
        return {
            "Setup": [
                {"Name": self.get_name()},
                {"Similarity Measure and Minimum Threshold":
                     self.get_sim_measure_min_threshold()},
                {"Top Value": self.get_top_value()},
                {"LDA Model": self.lda_model.get_params()},
                {"Vectorizer": self.vectorizer.get_params()},
                {"Vectorizer Type": type(self.vectorizer)},
            ]
        }

    def get_name(self):
        return super().get_name()

    def get_model_gen_name(self):
        return super().get_model_gen_name()

    def get_similarity_measure(self):
        return self.similarity_measure

    def get_sim_matrix(self):
        return super().get_sim_matrix()

    def get_tokenizer_type(self):
        return type(self.tokenizer)

    def save_sim_matrix(self):
        super().save_sim_matrix()

    def get_query_vector(self):
        return self._query_vector

    def get_corpus_matrix(self):
        return self._corpus_matrix

    def get_vectorizer_type(self):
        return type(self.vectorizer)

    def print_topics(self):
        feature_names = self.vectorizer.get_feature_names()
        n_top_words = 10
        for topic_idx, topic in enumerate(self.lda_model.components_):
            message = "Topic #%d: " % topic_idx
            message += " ".join([feature_names[i]
                                 for i in topic.argsort()[:-n_top_words - 1:-1]])
            print(message)
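# --- Hedged sketch: a Jensen-Shannon metric for topic distributions ---------
# recover_links above passes SimilarityMeasure.jsd as a custom metric to
# pairwise_distances, but that helper is not shown in this section. A minimal
# stand-in based on scipy (which returns the Jensen-Shannon *distance*, the
# square root of the divergence) could look like this; the name jsd_metric is
# hypothetical.
import numpy as np
from scipy.spatial.distance import jensenshannon
from sklearn.metrics import pairwise_distances


def jsd_metric(p, q):
    """Jensen-Shannon distance between two topic distributions."""
    return jensenshannon(np.asarray(p, dtype=float), np.asarray(q, dtype=float))

# Usage (assuming out_1 and out_2 are document-topic matrices as in recover_links):
# sim_matrix = pairwise_distances(X=out_1, Y=out_2, metric=jsd_metric)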
class SegmentationEngine(BaseEstimator):
    """
    Implementation of a segmentation engine used for segmenting documents.
    Based on the Latent Dirichlet Allocation model and the TopicTiling algorithm.

    :param vectorizer: CountVectorizer used for transforming and cleaning input data.
    :param lda: Latent Dirichlet Allocation model.
    :param tt: Topic Tiling class.
    :param n_topics: Number of topics parameter of LDA.
    :param max_iter: Maximum number of iterations parameter of LDA.
    :param a: Document-topic prior parameter of LDA.
    :param b: Topic-word prior parameter of LDA.
    :param m: Multiplier parameter of Topic Tiling.
    :param random_state: Random state.
    """

    def __init__(self, n_topics=10, max_iter=None, a=None, b=None, m=None,
                 random_state=None, lda_learning_method="batch", opt=seg_poem_opt):
        """
        Initializes the estimator.
        """
        self.n_topics = n_topics
        self.max_iter = max_iter
        self.a = a
        self.b = b
        self.m = m
        self.random_state = random_state
        self.vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                          tokenizer=tokenize,
                                          stop_words=CHINESE_STOP_WORDS)
        # n_topics was renamed to n_components in scikit-learn.
        self.lda = LatentDirichletAllocation(n_components=n_topics,
                                             max_iter=max_iter,
                                             doc_topic_prior=a,
                                             topic_word_prior=b,
                                             random_state=random_state,
                                             learning_method=lda_learning_method)
        self.tt = TopicTiling(m=m)
        self.opt = opt

    def fit(self, documents, input_type='sentence'):
        """
        Trains the segmentation engine.

        :param documents: List (iterable) of documents (class Document).
        :param input_type: Determines the basic input unit. Possible values are
            'segment', 'document' and 'sentence'. By default 'sentence' is used.
        """
        t0 = time()
        train_data = self.parse_data(documents, input_type)
        X = self.vectorizer.fit_transform(train_data)
        self.lda.fit(X)
        print('Fitted in %0.2f seconds' % (time() - t0))

    def pickle_lda(self, path):
        if not os.path.exists(path):
            os.mkdir(path)
        pickle.dump(self.vectorizer, open(os.path.join(path, 'vectorizer.pkl'), "wb"))
        pickle.dump(self.lda, open(os.path.join(path, "lda.pkl"), "wb"))

    def get_pickled_lda(self, path):
        self.vectorizer = pickle.load(open(os.path.join(path, "vectorizer.pkl"), "rb"))
        self.lda = pickle.load(open(os.path.join(path, "lda.pkl"), "rb"))

    def predict(self, documents):
        """
        Calculates segment boundaries for documents.

        :param documents: List (iterable) of documents (class Document).
        :return: List of boundaries for each document.
""" # TODO check if fit has been called estimated_boundaries = [] for document in documents: sentence_vectors = [self.lda.transform(self.vectorizer.transform([sentence])) for sentence in document.sentences] '''sentence 是用空格间隔的 string type''' each_sentence_len = [len(''.join(sentence.split())) for sentence in document.sentences] boundaries, depth_scores = self.tt.fit(sentence_vectors) estimated_boundaries.append((boundaries, depth_scores, each_sentence_len)) Res = self.infer_further(estimated_boundaries) return Res def infer_further(self, estimated_boundaries): Res = [] def find_dp(start, end, each_sentence_len, depth_scores): ''' 在开区间(start,end)上寻找,使得分割点满足要求的,分割点,并且最好是分割在depth_score最大的点上 算法,从depth_socre最大的开始增加,直到满足要求 ''' start_i = start + 1 end_i = end - 1 if start_i > end_i: return [] seg_able_point = [] for ele in depth_scores: if start_i<=ele[1]<=end_i: seg_able_point.append(ele) if len(seg_able_point)==0: return [] seg_able_point.sort(key = lambda t:t[0],reverse=True) for seg_num in range(1,len(seg_able_point)+1): Flag = True pre_bound = start-1 for seg_point in seg_able_point[:seg_num]: if not (self.opt['min_seg_length'] <= sum(each_sentence_len[pre_bound+1:seg_point[1]]) <= self.opt['max_seg_length']): Flag = False break pre_bound = seg_point[1] if not (self.opt['min_seg_length'] <= sum(each_sentence_len[pre_bound + 1:end]) <= self.opt['max_seg_length']): Flag = False if Flag: return [ele[1] for ele in seg_able_point[:seg_num]] return [] for (boundaries, depth_scores, each_sentence_len) in estimated_boundaries: print("boundaries = ",boundaries) print("depth_scores = ",depth_scores) print("each_sentence_len = ",each_sentence_len) ''' sent = 秋季 随想 sent = 秋天 多愁善感 夏季 华丽 落幕 中 悄然 而临 黄色 枯叶 空中 划出 一道 思索 弧线 悠然 沉寂 真 可谓 一叶知秋 相比 秋 洗刷 蔚蓝 高空 更 喜欢 渲染 火红 枫林 停车 坐爱 枫林晚 霜叶 红于 二月 花 枫林 秋 私语 述说 沧桑 生命 故事 秋雨 缠绵悱恻 世界 织 一条 精致 雨帘 可谓 大珠小珠落玉盘 清脆 声是 动听 催眠曲 秋 春 傲慢 夏 奔放 冬 冷酷 秋是 知性 感性 时常 思考 生命 意义 思索 生命 厚度 青春期 生命 索取 美丽 智慧 面对 挫折 只能 望天 兴叹 sent = 纯真 童年 真挚 告白 不屑 笑容 留恋 踏入 青春期 面对 生命 变化 感到 无所适从 秋天 新 环境 新 老师 同学 新 学习 生活 生命 美丽 雾 置身其中 分辨 不清 雨 遮住 眼 虚无 寻找 奇迹 也许 生命 无尽 失望 中 找寻 一丝 希望 犹如 空谷回音 听 不到 真切 回答 听见 无助 呐喊 城市 灯塔 中喊出 愿望 总有一天 听见 城市 回复 城市 幸福 漂流瓶 命运 中 搁浅 找到 爱 sent = 面对 生活 放荡不羁 谨小慎微 时常 听到 只缘身在此山中 无力 哀叹 青春 迷茫 生活 残酷 喜欢 沉浸 空想 中 海市蜃楼 美丽 是因为 神秘 世外桃源 令人 向往 是因为 束之高阁 清高 颐指气使 朋友 不解 中 幡然醒悟 清高 修身 处世 做 一支 出淤泥而不染 濯 清涟 妖 花朵 意志 前提 一朵 莲花 根 支撑 鲜艳 外表 只能 山花 烂漫 时 丛中 笑 sent = 秋是 忧郁 多愁善感 依稀记得 红衣 女子 悲痛 余 葬花 故事 糟糕 成绩 中 潸然泪下 一只 蛹 尚未 摆脱 束缚 蓝色 蛹 等待 金色 碟 幻化 想 蛹 厚重 外壳 保护 安然 沉睡 受 不到 外界 风雨 侵袭 父母 温暖 双臂 做 美好 梦 蓝色 蛹 内心 充满 挣脱 外壳 悸动 外壳 破碎 声中 破茧 成蝶 金色 翅膀 金色 阳光 熠熠生辉 诧异 生 出手 奔跑 追逐 金色 蝶 金色 蝶 回头 渐渐 消失 眼帘 哭 泣不成声 是因为 孤独 是因为 懦弱 体会 温暖 港湾 停留 太久 世界 奋斗 拥有 生活 岁月蹉跎 黄了 树叶 绿 芭蕉 风雨兼程 中 刻骨 伤害 中 加深 生命 智慧 美丽 厚度 sent = 擦干 眼泪 站 脚步 踉踉跄跄 一份 坚定 起书 主动 地去 汲取 生命 营养 一改 自命清高 常态 融入 朋友 主动 交谈 参加 活动 锻炼 面对 失败 挫折 依然 笑颜 如花 秋 忧伤 生命 传承 坠入 堕落 悬崖 众目睽睽 中 高姿态 面对 城市 星空 喊 出 时 早已 答案 答案 千磨 万击 坚劲 任尔 东西南北 风 一叶知秋 管中窥豹 努力 奋斗 中 生命 真谛 缓缓的 展现 面前 喜欢 深沉 热爱 生命 boundaries = [1] depth_scores = [(0.4447948396505025, 1), (0.05610152935946028, 3)] each_sentence_len = [4, 171, 137, 129, 204, 142] ''' pre_bound_start = -1 pre_bound = -1 res = [] boundaries.append(len(each_sentence_len)-1) Flag=True for indx, bound in enumerate( boundaries): if self.opt['min_seg_length'] <= sum(each_sentence_len[pre_bound+1:bound+1]) <= self.opt['max_seg_length']: res.append(bound) pre_bound_start = pre_bound + 1 pre_bound = bound elif sum(each_sentence_len[pre_bound+1:bound+1]) > self.opt['max_seg_length']: temp = find_dp(pre_bound+1,bound,each_sentence_len,depth_scores) #在pre_bound+1 ,bound之间寻找一个满足长度要求的分割点,再次分隔 if temp: res.append(pre_bound+1) 
                        res.extend(temp)
                        res.append(bound)
                        pre_bound_start = pre_bound + 1
                        pre_bound = bound
                    else:
                        # No valid split exists for this span, so give up on this piece.
                        Flag = False
                        break
                elif sum(each_sentence_len[pre_bound + 1:bound + 1]) < self.opt['min_seg_length']:
                    # If a segment is too short, it can only be merged into the
                    # previous segment or prepended to the next one.
                    # Mind the special cases where the current segment is the
                    # first or the last one.

                    # Merge with the previous segment.
                    if indx != 0:
                        if len(res) >= 2:
                            if pre_bound_start != -1:
                                last_seg_start = pre_bound_start + 1
                            else:
                                last_seg_start = res[-2]
                                print("This branch should not be reached under normal conditions.")
                                assert 1 == 0
                        else:
                            last_seg_start = 0
                        if sum(each_sentence_len[last_seg_start:bound + 1]) <= self.opt['max_seg_length']:
                            if len(res) > 0:
                                res.pop(-1)
                            else:
                                res.append(0)
                                assert bound != 0
                            res.append(bound)
                            pre_bound_start = pre_bound + 1
                            pre_bound = bound
                            continue

                    # In all remaining cases, merge with the next segment.
                    if indx != len(boundaries) - 1:
                        # pre_bound stays unchanged; simply move on to the next bound.
                        continue
                    else:
                        # This is the last bound, so no valid merge is possible.
                        Flag = False
                        break
            if Flag:
                res.pop(0)
                Res.append(res)
            else:
                Res.append([])
        return Res

    def score(self, X, method='pk', k=None):
        """
        Calculates segmentation score with the Pk or WindowDiff measure.

        :param X: List (iterable) of documents (class Document).
        :param method: String which indicates which evaluation method should be
            used. Possible evaluation methods are the Pk measure ('pk') and the
            WindowDiff method ('wd'). By default the Pk measure is used.
        :return float: Evaluation score (the method actually returns 1 - pk or
            1 - wd, because standard scikit-learn grid search treats higher
            values as better, while the opposite holds for pk and wd).
        """
        if method == 'wd':
            scorer = windowdiff
        else:
            scorer = pk
        scores = np.empty(0)
        estimated_boundaries = self.predict(X)
        for i, document in enumerate(X):
            ref_doc = doc_to_seg_string(len(document.sentences), document.boundaries)
            estimated_doc = doc_to_seg_string(len(document.sentences),
                                              estimated_boundaries[i])
            # calculate k
            if k is None:
                k = int(round(len(ref_doc) / (ref_doc.count('1') * 2.)))
            scores = np.append(scores, scorer(ref_doc, estimated_doc, k))
        return 1 - scores.mean()

    def set_params(self, **params):
        """
        Sets the value of parameters.

        :param params: Dictionary of parameters to be set.
        """
        super(SegmentationEngine, self).set_params(**params)
        # refresh parameters (n_topics maps to scikit-learn's n_components)
        self.lda.set_params(n_components=self.n_topics,
                            max_iter=self.max_iter,
                            doc_topic_prior=self.a,
                            topic_word_prior=self.b,
                            random_state=self.random_state)
        self.tt.set_m(self.m)
        return self

    def parse_data(self, documents, input_type='sentence'):
        """
        Transforms a list of documents into a list of segments.

        :param documents: List of documents (class Document).
        :param input_type: Determines the basic input unit. Possible values are
            'segment', 'document' and 'sentence'. By default 'sentence' is used.
        :return list: List of segments.
        """
        train_data = []
        for document in documents:
            if input_type == 'segment':
                train_data.extend(document.to_segments())
            elif input_type == 'sentence':
                train_data.extend(document.sentences)
            elif input_type == 'document':
                train_data.append(document.to_text())
            else:
                raise ValueError('Invalid input_type parameter!')
        return train_data
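# --- Hedged sketch: the boundary-string evaluation used by score() ----------
# score() above compares a reference and an estimated "0/1" boundary string
# with the pk and windowdiff measures. Assuming those names come from
# nltk.metrics.segmentation (doc_to_seg_string itself is project code and is
# not shown in this section), the evaluation boils down to the following.
from nltk.metrics.segmentation import pk, windowdiff

ref = "0100010"   # gold segmentation: boundaries after sentences 1 and 5
hyp = "0100100"   # predicted segmentation
k = int(round(len(ref) / (ref.count('1') * 2.)))  # same window size as score()

print("pk         =", pk(ref, hyp, k=k))
print("windowdiff =", windowdiff(ref, hyp, k))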
class SegmentationEngine(BaseEstimator):
    """
    Implementation of a segmentation engine used for segmenting documents.
    Based on the Latent Dirichlet Allocation model and the TopicTiling algorithm.

    :param vectorizer: CountVectorizer used for transforming and cleaning input data.
    :param lda: Latent Dirichlet Allocation model.
    :param tt: Topic Tiling class.
    :param n_topics: Number of topics parameter of LDA.
    :param max_iter: Maximum number of iterations parameter of LDA.
    :param a: Document-topic prior parameter of LDA.
    :param b: Topic-word prior parameter of LDA.
    :param m: Multiplier parameter of Topic Tiling.
    :param random_state: Random state.
    """

    def __init__(self, n_topics=10, max_iter=None, a=None, b=None, m=None,
                 random_state=None, lda_learning_method="batch"):
        """
        Initializes the estimator.
        """
        self.n_topics = n_topics
        self.max_iter = max_iter
        self.a = a
        self.b = b
        self.m = m
        self.random_state = random_state
        self.vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                          tokenizer=tokenize,
                                          stop_words=CHINESE_STOP_WORDS)
        # n_topics was renamed to n_components in scikit-learn.
        self.lda = LatentDirichletAllocation(n_components=n_topics,
                                             max_iter=max_iter,
                                             doc_topic_prior=a,
                                             topic_word_prior=b,
                                             random_state=random_state,
                                             learning_method=lda_learning_method)
        self.tt = TopicTiling(m=m)

    def fit(self, documents, input_type='sentence'):
        """
        Trains the segmentation engine.

        :param documents: List (iterable) of documents (class Document).
        :param input_type: Determines the basic input unit. Possible values are
            'segment', 'document' and 'sentence'. By default 'sentence' is used.
        """
        t0 = time()
        train_data = self.parse_data(documents, input_type)
        X = self.vectorizer.fit_transform(train_data)
        self.lda.fit(X)
        print('Fitted in %0.2f seconds' % (time() - t0))

    def pickle_lda(self, path):
        if not os.path.exists(path):
            os.mkdir(path)
        pickle.dump(self.vectorizer, open(os.path.join(path, 'vectorizer.pkl'), "wb"))
        pickle.dump(self.lda, open(os.path.join(path, "lda.pkl"), "wb"))

    def get_pickled_lda(self, path):
        self.vectorizer = pickle.load(open(os.path.join(path, "vectorizer.pkl"), "rb"))
        self.lda = pickle.load(open(os.path.join(path, "lda.pkl"), "rb"))

    def predict(self, documents):
        """
        Calculates segment boundaries for documents.

        :param documents: List (iterable) of documents (class Document).
        :return: List of boundaries for each document.
        """
        # TODO check if fit has been called
        estimated_boundaries = []
        for document in documents:
            sentence_vectors = [
                self.lda.transform(self.vectorizer.transform([sentence]))
                for sentence in document.sentences
            ]
            estimated_boundaries.append(self.tt.fit(sentence_vectors))
        return estimated_boundaries

    def score(self, X, method='pk', k=None):
        """
        Calculates segmentation score with the Pk or WindowDiff measure.

        :param X: List (iterable) of documents (class Document).
        :param method: String which indicates which evaluation method should be
            used. Possible evaluation methods are the Pk measure ('pk') and the
            WindowDiff method ('wd'). By default the Pk measure is used.
        :return float: Evaluation score (the method actually returns 1 - pk or
            1 - wd, because standard scikit-learn grid search treats higher
            values as better, while the opposite holds for pk and wd).
""" if method == 'wd': scorer = windowdiff else: scorer = pk scores = np.empty(0) estimated_boundaries = self.predict(X) for i, document in enumerate(X): ref_doc = doc_to_seg_string(len(document.sentences), document.boundaries) estimated_doc = doc_to_seg_string(len(document.sentences), estimated_boundaries[i]) # calculate k if k is None: k = int(round(len(ref_doc) / (ref_doc.count('1') * 2.))) scores = np.append(scores, scorer(ref_doc, estimated_doc, k)) return 1 - scores.mean() def set_params(self, **params): """ Sets value of parameters. :param params: Dictionary of parameters to be set. """ super(SegmentationEngine, self).set_params(**params) # refresh parameters self.lda.set_params(n_topics=self.n_topics, max_iter=self.max_iter, doc_topic_prior=self.a, topic_word_prior=self.b, random_state=self.random_state) self.tt.set_m(self.m) return self def parse_data(self, documents, input_type='sentence'): """ Transforms list of documents into list of segments. :param documents: List of documents (class Document) :input_type: Determines basic input unit.Possible values are 'segment', 'document', 'sentence'. By default we use 'segment'. :return list: List of segments. """ train_data = [] for document in documents: if input_type == 'segment': train_data.extend(document.to_segments()) elif input_type == 'sentence': train_data.extend(document.sentences) elif input_type == 'document': train_data.append(document.to_text()) else: raise ValueError('Invalid input_type parameter!') return train_data