Example #1
        print(topic)

    doc_topics = btn_ok(topics)
    doc_topics = sorted(doc_topics, key=lambda d: d['topic'])
    print('\n\ndocument\n\n')
    for doc in doc_topics:
        print(doc)


# n_topic = 3 because of topics = ... at line 52
if __name__ == '__main__':
    # import json
    # data = DataAccess().get_documents()
    # json.dump(data,open('data.json','w',encoding='utf-8'),ensure_ascii=False)

    data = DataAccess().get_documents()[:10]

    # vn_segment_word = ['a_b a_b b_a b_a', 'd_c c_d c_d d_c', 'b_a a_b b_a', 'c_d c_d d_c d_c c_d']
    vn_segment_word = [d['content'] for d in data]
    tv = TfidfVectorizer(max_df=1.0, min_df=1, max_features=5000)  # min_df as an integer count must be >= 1
    X = tv.fit_transform(vn_segment_word)
    # n_topics was renamed to n_components in scikit-learn; only n_components is accepted now
    lda_model = LatentDirichletAllocation(n_components=2,
                                          max_iter=500,
                                          learning_method='batch',
                                          n_jobs=-1)
    lda_output = lda_model.fit_transform(X)
    print(lda_output)
    print(lda_model.get_params())
    print(lda_model.transform(X[:5]))
def LDA(train_size, random_state):
    """
    Classification pipeline with LDA preprocessing.

    Inputs:
        - train_size (int): number of training samples.
        - random_state (int): seed for random number generators.

    Output:
        (None)

    A usage sketch follows the function.
    """
    subset = 'subset_%s' % train_size

    input_dir = INPUT_DIR / subset
    model_dir = OUTPUT_DIR / subset
    createDirs(model_dir)

    X_train, X_test, y_train, y_test = loadClean(input_dir)
    X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train,
                                                              test_size=0.2,
                                                              random_state=random_state)
    scaler = StandardScaler()

    best_params = []
    best_k, best_auc, best_acc = None, 0, 0

    for k in K:
        model_name = "lda_%s.joblib" % k
        try:
            lda = load(model_dir / model_name)
            # logger.info("\t\tk = %s, fitted LDA model loaded." % k)
        except FileNotFoundError:
            lda = LatentDirichletAllocation(n_components=k,
                                            doc_topic_prior=50 / k,
                                            topic_word_prior=0.01,
                                            n_jobs=-1,
                                            random_state=random_state)
            lda.fit(X_train)
            dump(lda, model_dir / model_name)

        X_train_ = scaler.fit_transform(lda.transform(X_train_sub))
        X_val_ = scaler.transform(lda.transform(X_val))

        clf_val = LogisticRegressionVal(X_train_, y_train_sub, X_val_, y_val,
                                        k, random_state=random_state)
        best_k, best_auc, best_acc, best_params = clf_val.tune(best_k, best_auc,
                                                               best_acc,
                                                               best_params)

    clf, file_name, header = clf_val.bestClassifier(best_params)
    lda.set_params(**{'n_components': best_k,
                      'doc_topic_prior': 50 / best_k})
    preprocess = make_pipeline(lda, scaler)
    tr_time, tr_metrics, test_time, test_metrics = evaluate(preprocess, clf,
                                                            X_train, y_train,
                                                            X_test, y_test)

    writeResults(file_name, header, 'lda',
                 train_size, best_k, best_params,
                 tr_time, tr_metrics, test_time, test_metrics)

    logger.info(("\tFor training size = %s, best number of topics k = %s "
                 "best parameter grid: %s (train AUC: {:.3f}, train acc: {:.3f};"
                 " test AUC: {:.3f}, test acc: {:.3f})").
                format(tr_metrics[0], tr_metrics[1],
                       test_metrics[0], test_metrics[1])
                % (train_size, best_k, best_params))
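
# Usage sketch (not from the original source): drive the LDA() pipeline above for a
# few training-set sizes. It assumes the module-level constants (K, INPUT_DIR,
# OUTPUT_DIR) and helpers (loadClean, LogisticRegressionVal, evaluate, writeResults,
# createDirs, logger) are defined elsewhere in the project; the sizes and the seed
# below are illustrative placeholders.
if __name__ == '__main__':
    for train_size in (500, 1000, 2000):
        LDA(train_size, random_state=42)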
Example #3
    timestamp = time.time()
    print("加载停词表...")
    load_stopwords(swlist)
    print("加载停词表耗时:", time.time() - timestamp, "s")
    timestamp = time.time()
    print("分词...")
    lemmatizer = WordNetLemmatizer()
    load_corpus(expected_tags, lemmatizer, swlist, corpus)
    print("分词耗时:", time.time() - timestamp, "s")
    tf_vectorizer = CountVectorizer(stop_words="english", lowercase=False)
    word_freq = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()
    print("语料库总词数:", len(tf_feature_names))
    lda = LatentDirichletAllocation(max_iter=50, doc_topic_prior=0.5, \
        topic_word_prior=0.1, learning_method="batch", random_state=0)
    for n_topics in [5, 10, 20]:
        lda.set_params(n_components=n_topics)
        params = lda.get_params(False)
        print("\nLDA模型参数:")
        for key, value in params.items():
            print(key, "<-", value)
        timestamp = time.time()
        print("LDA(" + "n_components = " + str(n_topics) + ")训练...")
        lda.fit(word_freq)
        print("LDA(" + "n_components = " + str(n_topics) + ")训练耗时:",
              time.time() - timestamp, "s")
        print("输出结果到" + "../../output_python/topic" + str(n_topics) +
              "/topic-top" + str(n_top_words) + "keywords.txt")
        save_top_topciwords(lda, tf_feature_names, n_top_words)
    print("\n结束时间:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
class LDA(GenericModel):
    def __init__(self, **kwargs):
        self._corpus_matrix = None
        self._query_vector = None

        self.vectorizer = None
        self.lda_model = LatentDirichletAllocation(n_jobs=-1)

        super().__init__()

        self.similarity_measure = None
        self.set_basic_params(**kwargs)

        self.set_vectorizer(**kwargs)
        self.set_lda_model(**kwargs)

    def set_name(self, name):
        super().set_name(name)

    def set_model_gen_name(self, gen_name):
        super().set_model_gen_name(gen_name)

    def set_basic_params(self, **kwargs):
        self.set_name(kwargs.get(LDA_Model_Hyperp.NAME.value, 'LDA'))
        self.set_model_gen_name('lda')
        self.set_similarity_measure(
            kwargs.get(LDA_Model_Hyperp.SIMILARITY_MEASURE.value,
                       sm.SimilarityMeasure.COSINE))

    def set_similarity_measure(self, sim_measure):
        self.similarity_measure = sim_measure

    def set_vectorizer(self, **kwargs):
        self.vectorizer = kwargs.get(
            LDA_Model_Hyperp.VECTORIZER.value,
            TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True))
        vec_params = {
            key.split('__')[2]: value
            for key, value in kwargs.items() if '__vectorizer__' in key
        }
        self.vectorizer.set_params(**vec_params)

    def set_lda_model(self, **kwargs):
        lda_model_params = {
            key.split('__')[2]: value
            for key, value in kwargs.items() if '__lda_model__' in key
        }
        self.lda_model.set_params(**lda_model_params)

    def recover_links(self, corpus, query, test_cases_names,
                      bug_reports_names):
        self._corpus_matrix = self.vectorizer.fit_transform(corpus)
        self._query_vector = self.vectorizer.transform(query)

        self.out_1 = self.lda_model.fit_transform(self._corpus_matrix)
        self.out_2 = self.lda_model.transform(self._query_vector)

        metric = self.similarity_measure
        if metric == sm.SimilarityMeasure.COSINE:
            self._sim_matrix = pairwise.cosine_similarity(X=self.out_1,
                                                          Y=self.out_2)
        elif metric == sm.SimilarityMeasure.JSD:
            self._sim_matrix = pairwise_distances(X=self.out_1,
                                                  Y=self.out_2,
                                                  metric=sm.SimilarityMeasure.jsd)
        elif metric == sm.SimilarityMeasure.EUCLIDIAN_DISTANCE:
            self._sim_matrix = pairwise_distances(X=self.out_1,
                                                  Y=self.out_2,
                                                  metric='euclidean')

        #self._sim_matrix =  super().normalize_sim_matrix(self._sim_matrix)
        self._sim_matrix = pd.DataFrame(data=self._sim_matrix,
                                        index=test_cases_names,
                                        columns=bug_reports_names)

        self._record_docs_feats(corpus, query, test_cases_names,
                                bug_reports_names)

    def _record_docs_feats(self, corpus, query, test_cases_names,
                           bug_reports_names):
        self.mrw_tcs = self._recover_mrw_list(test_cases_names, corpus)
        self.mrw_brs = self._recover_mrw_list(bug_reports_names, query)

        self.dl_tcs = self._recover_dl_list(test_cases_names, corpus)
        self.dl_brs = self._recover_dl_list(bug_reports_names, query)

        index = list(test_cases_names) + list(bug_reports_names)
        self.docs_feats_df = pd.DataFrame(index=index, columns=['mrw', 'dl'])

        for tc_name, mrw in self.mrw_tcs:
            self.docs_feats_df.at[tc_name, 'mrw'] = mrw

        for tc_name, dl in self.dl_tcs:
            self.docs_feats_df.at[tc_name, 'dl'] = dl

        for br_name, mrw in self.mrw_brs:
            self.docs_feats_df.at[br_name, 'mrw'] = mrw

        for br_name, dl in self.dl_brs:
            self.docs_feats_df.at[br_name, 'dl'] = dl

    def _recover_dl_list(self, artf_names, artf_descs):
        tokenizer = PorterStemmerBased_Tokenizer()
        dl_list = []
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            dl_list.append((artf_name, len(tokenizer(artf_desc))))
        return dl_list

    def _recover_mrw_list(self, artf_names, artf_descs):
        N_REL_WORDS = 6
        mrw_list = []  # list of tuples (artf_name, mrw_list={})

        for artf_name, artf_desc in zip(artf_names, artf_descs):
            X = self.vectorizer.transform([artf_desc])
            df1 = pd.DataFrame(X.T.toarray())
            df1['token'] = self.vectorizer.get_feature_names()
            df1.sort_values(by=0, ascending=False, inplace=True)
            mrw = list(df1.iloc[0:N_REL_WORDS, 1].values)
            mrw_list.append((artf_name, mrw))

        return mrw_list

    def model_setup(self):
        return {
            "Setup": [{
                "Name": self.get_name()
            }, {
                "Similarity Measure and Minimum Threshold":
                self.get_sim_measure_min_threshold()
            }, {
                "Top Value": self.get_top_value()
            }, {
                "LDA Model": self.lda_model.get_params()
            }, {
                "Vectorizer": self.vectorizer.get_params()
            }, {
                "Vectorizer Type": type(self.vectorizer)
            }]
        }

    def get_name(self):
        return super().get_name()

    def get_model_gen_name(self):
        return super().get_model_gen_name()

    def get_similarity_measure(self):
        return self.similarity_measure

    def get_sim_matrix(self):
        return super().get_sim_matrix()

    def get_tokenizer_type(self):
        return type(self.tokenizer)

    def save_sim_matrix(self):
        super().save_sim_matrix()

    def get_query_vector(self):
        return self._query_vector

    def get_corpus_matrix(self):
        return self._corpus_matrix

    def get_vectorizer_type(self):
        return type(self.vectorizer)

    def print_topics(self):
        feature_names = self.vectorizer.get_feature_names()
        n_top_words = 10

        for topic_idx, topic in enumerate(self.lda_model.components_):
            message = "Topic #%d: " % topic_idx
            message += " ".join([
                feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ])
            print(message)
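
# Usage sketch (hypothetical, not from the original source): recover trace links
# between test cases and bug reports with the LDA model above. The tiny corpora and
# names below are toy placeholders; default hyperparameters are used, and GenericModel
# is assumed to expose the similarity matrix via get_sim_matrix().
test_cases = ["login page accepts valid credentials",
              "cart total updates after item removal"]
bug_reports = ["crash when removing the last cart item"]
tc_names = ['TC_1', 'TC_2']
br_names = ['BR_1']

model = LDA()
model.recover_links(corpus=test_cases, query=bug_reports,
                    test_cases_names=tc_names, bug_reports_names=br_names)
print(model.get_sim_matrix())   # similarity DataFrame: test cases x bug reports
model.print_topics()            # top 10 words of each fitted LDA topic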
Example #5
class SegmentationEngine(BaseEstimator):
    """
    Implementation of segmentation engine used for segmenting documents.
    Based on Latent Dirichlet Allocation model and Topic Tiling algorithm.

    :param vectorizer: CountVectorizer class used for transforming and cleaning input data.
    :param lda: Latent Dirichlet Allocation model.
    :param tt: Topic Tiling class.
    :param n_topics: Number of topics parameter of LDA.
    :param max_iter: Maximum number of iterations parameter of LDA.
    :param a: Document-topic prior parameter of LDA.
    :param b: Topic-word prior parameter of LDA.
    :param m: Multiplier parameter of Topic Tiling.
    :param random_state: Random state.

    A usage sketch follows the class.
    """

    def __init__(self, n_topics=10, max_iter=None, a=None, b=None, m=None,
                 random_state=None, lda_learning_method="batch", opt=seg_poem_opt):
        """
        Initializes estimator.
        """
        self.n_topics = n_topics
        self.max_iter = max_iter
        self.a = a
        self.b = b
        self.m = m
        self.random_state = random_state

        self.vectorizer = CountVectorizer(max_df=0.95, min_df=2, tokenizer=tokenize,
                                          stop_words=CHINESE_STOP_WORDS)
        # scikit-learn's LDA calls the number of topics n_components
        self.lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,
                                             doc_topic_prior=a, topic_word_prior=b,
                                             random_state=random_state,
                                             learning_method=lda_learning_method)
        self.tt = TopicTiling(m=m)
        self.opt = opt

    def fit(self, documents, input_type='sentence'):
        """
        Trains segmentation engine.

        :param documents: List (iterable) of documents (class Document).
        :param input_type: Determines the basic input unit. Possible values are 'segment', 'document', 'sentence'.
                           By default, 'sentence' is used.
        """
        t0 = time()

        train_data = self.parse_data(documents, input_type)
        X = self.vectorizer.fit_transform(train_data)
        self.lda.fit(X)
        print('Fitted in %0.2f seconds' % (time() - t0))

    def pickle_lda(self, path):
        if not os.path.exists(path):
            os.mkdir(path)
        pickle.dump(self.vectorizer,
                    open(os.path.join(path, 'vectorizer.pkl'), "wb"))
        pickle.dump(self.lda, open(os.path.join(path, "lda.pkl"), "wb"))

    def get_pickled_lda(self, path):
        self.vectorizer = pickle.load(
            open(os.path.join(path, "vectorizer.pkl"), "rb"))
        self.lda = pickle.load(open(os.path.join(path, "lda.pkl"), "rb"))

    def predict(self, documents):
        """
        Calculates segment boundaries for documents.
        :param documents: List (iterable) of documents (class Document).
        :return: List of boundaries for each document.
        """
        # TODO check if fit has been called
        estimated_boundaries = []
        for document in documents:
            sentence_vectors = [self.lda.transform(self.vectorizer.transform([sentence]))
                                for sentence in document.sentences]
            # each sentence is a whitespace-separated string
            each_sentence_len = [len(''.join(sentence.split()))
                                 for sentence in document.sentences]
            boundaries, depth_scores = self.tt.fit(sentence_vectors)
            estimated_boundaries.append((boundaries, depth_scores, each_sentence_len))
        Res = self.infer_further(estimated_boundaries)
        return Res

    def infer_further(self, estimated_boundaries):
        Res = []
        def find_dp(start, end, each_sentence_len, depth_scores):
            '''
            Search the open interval (start, end) for split points that satisfy the
            length constraints, preferably splitting at the points with the largest
            depth_score. Algorithm: add split points in decreasing order of
            depth_score until the constraints are met.
            '''
            start_i = start + 1
            end_i = end - 1
            if start_i > end_i:
                return []
            seg_able_point = []
            for ele in depth_scores:
                if start_i <= ele[1] <= end_i:
                    seg_able_point.append(ele)
            if len(seg_able_point) == 0:
                return []

            seg_able_point.sort(key=lambda t: t[0], reverse=True)

            for seg_num in range(1, len(seg_able_point) + 1):
                Flag = True
                pre_bound = start - 1
                for seg_point in seg_able_point[:seg_num]:
                    if not (self.opt['min_seg_length'] <=
                            sum(each_sentence_len[pre_bound + 1:seg_point[1]]) <=
                            self.opt['max_seg_length']):
                        Flag = False
                        break
                    pre_bound = seg_point[1]
                if not (self.opt['min_seg_length'] <=
                        sum(each_sentence_len[pre_bound + 1:end]) <=
                        self.opt['max_seg_length']):
                    Flag = False
                if Flag:
                    return [ele[1] for ele in seg_able_point[:seg_num]]
            return []

        for (boundaries, depth_scores, each_sentence_len) in estimated_boundaries:
            print("boundaries = ",boundaries)
            print("depth_scores = ",depth_scores)
            print("each_sentence_len = ",each_sentence_len)
            '''
            sent =  秋季 随想
            sent =  秋天 多愁善感 夏季 华丽 落幕 中 悄然 而临 黄色 枯叶 空中 划出 一道 思索 弧线 悠然 沉寂 真 可谓 一叶知秋 相比 秋 洗刷 蔚蓝 高空 更 喜欢 渲染 火红 枫林 停车 坐爱 枫林晚 霜叶 红于 二月 花 枫林 秋 私语 述说 沧桑 生命 故事 秋雨 缠绵悱恻 世界 织 一条 精致 雨帘 可谓 大珠小珠落玉盘 清脆 声是 动听 催眠曲 秋 春 傲慢 夏 奔放 冬 冷酷 秋是 知性 感性 时常 思考 生命 意义 思索 生命 厚度 青春期 生命 索取 美丽 智慧 面对 挫折 只能 望天 兴叹
            sent =  纯真 童年 真挚 告白 不屑 笑容 留恋 踏入 青春期 面对 生命 变化 感到 无所适从 秋天 新 环境 新 老师 同学 新 学习 生活 生命 美丽 雾 置身其中 分辨 不清 雨 遮住 眼 虚无 寻找 奇迹 也许 生命 无尽 失望 中 找寻 一丝 希望 犹如 空谷回音 听 不到 真切 回答 听见 无助 呐喊 城市 灯塔 中喊出 愿望 总有一天 听见 城市 回复 城市 幸福 漂流瓶 命运 中 搁浅 找到 爱
            sent =  面对 生活 放荡不羁 谨小慎微 时常 听到 只缘身在此山中 无力 哀叹 青春 迷茫 生活 残酷 喜欢 沉浸 空想 中 海市蜃楼 美丽 是因为 神秘 世外桃源 令人 向往 是因为 束之高阁 清高 颐指气使 朋友 不解 中 幡然醒悟 清高 修身 处世 做 一支 出淤泥而不染 濯 清涟 妖 花朵 意志 前提 一朵 莲花 根 支撑 鲜艳 外表 只能 山花 烂漫 时 丛中 笑
            sent =  秋是 忧郁 多愁善感 依稀记得 红衣 女子 悲痛 余 葬花 故事 糟糕 成绩 中 潸然泪下 一只 蛹 尚未 摆脱 束缚 蓝色 蛹 等待 金色 碟 幻化 想 蛹 厚重 外壳 保护 安然 沉睡 受 不到 外界 风雨 侵袭 父母 温暖 双臂 做 美好 梦 蓝色 蛹 内心 充满 挣脱 外壳 悸动 外壳 破碎 声中 破茧 成蝶 金色 翅膀 金色 阳光 熠熠生辉 诧异 生 出手 奔跑 追逐 金色 蝶 金色 蝶 回头 渐渐 消失 眼帘 哭 泣不成声 是因为 孤独 是因为 懦弱 体会 温暖 港湾 停留 太久 世界 奋斗 拥有 生活 岁月蹉跎 黄了 树叶 绿 芭蕉 风雨兼程 中 刻骨 伤害 中 加深 生命 智慧 美丽 厚度
            sent =  擦干 眼泪 站 脚步 踉踉跄跄 一份 坚定 起书 主动 地去 汲取 生命 营养 一改 自命清高 常态 融入 朋友 主动 交谈 参加 活动 锻炼 面对 失败 挫折 依然 笑颜 如花 秋 忧伤 生命 传承 坠入 堕落 悬崖 众目睽睽 中 高姿态 面对 城市 星空 喊 出 时 早已 答案 答案 千磨 万击 坚劲 任尔 东西南北 风 一叶知秋 管中窥豹 努力 奋斗 中 生命 真谛 缓缓的 展现 面前 喜欢 深沉 热爱 生命
            boundaries =  [1]
            depth_scores =  [(0.4447948396505025, 1), (0.05610152935946028, 3)]
            each_sentence_len =  [4, 171, 137, 129, 204, 142]
            '''
            pre_bound_start = -1
            pre_bound = -1
            res = []
            boundaries.append(len(each_sentence_len) - 1)
            Flag = True
            for indx, bound in enumerate(boundaries):
                if (self.opt['min_seg_length'] <=
                        sum(each_sentence_len[pre_bound + 1:bound + 1]) <=
                        self.opt['max_seg_length']):
                    res.append(bound)
                    pre_bound_start = pre_bound + 1
                    pre_bound = bound
                elif sum(each_sentence_len[pre_bound + 1:bound + 1]) > self.opt['max_seg_length']:
                    # search between pre_bound+1 and bound for split points that
                    # satisfy the length constraints, then split again
                    temp = find_dp(pre_bound + 1, bound, each_sentence_len, depth_scores)
                    if temp:
                        res.append(pre_bound + 1)
                        res.extend(temp)
                        res.append(bound)
                        pre_bound_start = pre_bound + 1
                        pre_bound = bound
                    else:
                        # no valid split exists in this span, so give up on this piece
                        Flag = False
                        break
                elif sum(each_sentence_len[pre_bound + 1:bound + 1]) < self.opt['min_seg_length']:
                    # If a segment is too short, it can only be appended to the
                    # previous segment or prepended to the next one.
                    # Note the special cases where the current seg is the first
                    # or the last one.
                    '''
                    Merge with the previous seg
                    '''
                    if indx != 0:
                        if len(res) >= 2:
                            if pre_bound_start != -1:
                                last_seg_start = pre_bound_start + 1
                            else:
                                last_seg_start = res[-2]
                                print("this branch should not be reached in normal cases")
                                assert False
                        else:
                            last_seg_start = 0
                        if sum(each_sentence_len[last_seg_start:bound + 1]) <= self.opt['max_seg_length']:
                            if len(res) > 0:
                                res.pop(-1)
                            else:
                                res.append(0)
                                assert bound != 0
                            res.append(bound)
                            pre_bound_start = pre_bound + 1
                            pre_bound = bound
                            continue
                    '''
                    All remaining cases are merged with the next seg
                    '''
                    if indx != len(boundaries) - 1:
                        # pre_bound stays unchanged; move straight on to the next bound
                        continue
                    else:
                        # This is the last bound, so no valid segmentation can be assembled.
                        Flag = False
                        break
            if Flag:
                res.pop(0)
                Res.append(res)
            else:
                Res.append([])
        return Res


    def score(self, X, method='pk', k=None):
        """
        Calculates segmentation score with the Pk or WindowDiff measure.

        :param X: List (iterable) of documents (class Document).
        :param method: String which indicates which evaluation method should be used.
                       Possible evaluation methods are Pk measure ('pk') or WindowDiff method ('wd').
                       By default the Pk measure is used.
        :param k: Window size for the measure; if None, it is derived from the reference segmentation.
        :return float: Evaluation score (the method actually returns 1 - pk or 1 - wd because standard
                       scikit-learn grid search treats higher values as better, while the opposite is
                       the case with pk and wd).
        """
        if method == 'wd':
            scorer = windowdiff
        else:
            scorer = pk

        scores = np.empty(0)
        estimated_boundaries = self.predict(X)
        for i, document in enumerate(X):
            ref_doc = doc_to_seg_string(len(document.sentences), document.boundaries)
            estimated_doc = doc_to_seg_string(len(document.sentences), estimated_boundaries[i])
            # calculate k
            if k is None:
                k = int(round(len(ref_doc) / (ref_doc.count('1') * 2.)))
            scores = np.append(scores, scorer(ref_doc, estimated_doc, k))
        return 1 - scores.mean()

    def set_params(self, **params):
        """
        Sets value of parameters.

        :param params: Dictionary of parameters to be set.
        """
        super(SegmentationEngine, self).set_params(**params)

        # refresh parameters
        self.lda.set_params(n_components=self.n_topics, max_iter=self.max_iter, doc_topic_prior=self.a,
                            topic_word_prior=self.b, random_state=self.random_state)
        self.tt.set_m(self.m)
        return self

    def parse_data(self, documents, input_type='sentence'):
        """
        Transforms list of documents into list of segments.
        :param documents: List of documents (class Document).
        :param input_type: Determines the basic input unit. Possible values are 'segment', 'document', 'sentence'.
                           By default, 'sentence' is used.
        :return list: List of segments.
        """
        train_data = []
        for document in documents:
            if input_type == 'segment':
                train_data.extend(document.to_segments())
            elif input_type == 'sentence':
                train_data.extend(document.sentences)
            elif input_type == 'document':
                train_data.append(document.to_text())
            else:
                raise ValueError('Invalid input_type parameter!')
        return train_data
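
# Usage sketch (hypothetical, not from the original source): fit the engine above on a
# list of Document objects, cache the fitted vectorizer/LDA, and segment new documents.
# `train_docs` and `test_docs` are assumed to come from the project's data-loading code,
# and the hyperparameter values below are placeholders, not tuned settings.
engine = SegmentationEngine(n_topics=20, max_iter=50, a=0.1, b=0.01, m=0.5,
                            random_state=0)
engine.fit(train_docs, input_type='sentence')
engine.pickle_lda('./lda_cache')           # writes vectorizer.pkl and lda.pkl
segment_boundaries = engine.predict(test_docs)
print(segment_boundaries)                  # one (possibly empty) boundary list per document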
class SegmentationEngine(BaseEstimator):
    """
    Implementation of segmentation engine used for segmenting documents.
    Based on Latent Dirichlet Allocation model and Topic Tiling algorithm.

    :param vectorizer: CountVectorizer class used for transforming and cleaning input data.
    :param lda: Latent Dirichlet Allocation model.
    :param tt: Topic Tiling class.
    :param n_topics: Number of topics parameter of LDA.
    :param max_iter: Maximum number of iterations parameter of LDA.
    :param a: Document-topic prior parameter of LDA.
    :param b: Topic-word prior parameter of LDA.
    :param m: Multiplier parameter of Topic Tiling.
    :param random_state: Random state.

    A grid-search usage sketch follows the class.
    """
    def __init__(self,
                 n_topics=10,
                 max_iter=None,
                 a=None,
                 b=None,
                 m=None,
                 random_state=None,
                 lda_learning_method="batch"):
        """
        Initializes estimator.
        """
        self.n_topics = n_topics
        self.max_iter = max_iter
        self.a = a
        self.b = b
        self.m = m
        self.random_state = random_state

        self.vectorizer = CountVectorizer(max_df=0.95,
                                          min_df=2,
                                          tokenizer=tokenize,
                                          stop_words=CHINESE_STOP_WORDS)
        self.lda = LatentDirichletAllocation(
            n_components=n_topics,  # scikit-learn's LDA calls the number of topics n_components
            max_iter=max_iter,
            doc_topic_prior=a,
            topic_word_prior=b,
            random_state=random_state,
            learning_method=lda_learning_method)
        self.tt = TopicTiling(m=m)

    def fit(self, documents, input_type='sentence'):
        """
        Trains segmentation engine.

        :param documents: List (iterable) of documents (class Document).
        :param input_type: Determines the basic input unit. Possible values are 'segment', 'document', 'sentence'.
                           By default, 'sentence' is used.
        """
        t0 = time()

        train_data = self.parse_data(documents, input_type)
        X = self.vectorizer.fit_transform(train_data)
        self.lda.fit(X)
        print('Fitted in %0.2f seconds' % (time() - t0))

    def pickle_lda(self, path):
        if not os.path.exists(path):
            os.mkdir(path)
        pickle.dump(self.vectorizer,
                    open(os.path.join(path, 'vectorizer.pkl'), "wb"))
        pickle.dump(self.lda, open(os.path.join(path, "lda.pkl"), "wb"))

    def get_pickled_lda(self, path):
        self.vectorizer = pickle.load(
            open(os.path.join(path, "vectorizer.pkl"), "rb"))
        self.lda = pickle.load(open(os.path.join(path, "lda.pkl"), "rb"))

    def predict(self, documents):
        """
        Calculates segment boundaries for documents.

        :param documents: List (iterable) of documents (class Document).
        :return: List of boundaries for each document.
        """
        # TODO check if fit has been called
        estimated_boundaries = []
        for document in documents:
            sentence_vectors = [
                self.lda.transform(self.vectorizer.transform([sentence]))
                for sentence in document.sentences
            ]
            estimated_boundaries.append(self.tt.fit(sentence_vectors))
        return estimated_boundaries

    def score(self, X, method='pk', k=None):
        """
        Calculates segmentation score with Pk or WindowDiff measure.

        :param X: List (iterable) of documents (class Document).
        :param method: String which indicates which evaluation method should be used.
                       Possible evaluation methods are Pk measure ('pk') or WindowDiff method ('wd').
                       By default Pk measure is used.
        :param k: Window size for the measure; if None, it is derived from the reference segmentation.
        :return float: Evaluation score (the method actually returns 1 - pk or 1 - wd because standard
                       scikit-learn grid search treats higher values as better, while the opposite is
                       the case with pk and wd).
        """
        if method == 'wd':
            scorer = windowdiff
        else:
            scorer = pk

        scores = np.empty(0)
        estimated_boundaries = self.predict(X)
        for i, document in enumerate(X):
            ref_doc = doc_to_seg_string(len(document.sentences),
                                        document.boundaries)
            estimated_doc = doc_to_seg_string(len(document.sentences),
                                              estimated_boundaries[i])
            # calculate k
            if k is None:
                k = int(round(len(ref_doc) / (ref_doc.count('1') * 2.)))
            scores = np.append(scores, scorer(ref_doc, estimated_doc, k))
        return 1 - scores.mean()

    def set_params(self, **params):
        """
        Sets value of parameters.

        :param params: Dictionary of parameters to be set.
        """
        super(SegmentationEngine, self).set_params(**params)

        # refresh parameters
        self.lda.set_params(n_components=self.n_topics,
                            max_iter=self.max_iter,
                            doc_topic_prior=self.a,
                            topic_word_prior=self.b,
                            random_state=self.random_state)
        self.tt.set_m(self.m)
        return self

    def parse_data(self, documents, input_type='sentence'):
        """
        Transforms list of documents into list of segments.
        :param documents: List of documents (class Document).
        :param input_type: Determines the basic input unit. Possible values are 'segment', 'document', 'sentence'.
                           By default, 'sentence' is used.
        :return list: List of segments.
        """
        train_data = []
        for document in documents:
            if input_type == 'segment':
                train_data.extend(document.to_segments())
            elif input_type == 'sentence':
                train_data.extend(document.sentences)
            elif input_type == 'document':
                train_data.append(document.to_text())
            else:
                raise ValueError('Invalid input_type parameter!')
        return train_data
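
# Usage sketch (hypothetical, not from the original source): because score() returns
# 1 - pk / 1 - wd, higher scores are better, so the engine above can be tuned with
# scikit-learn's grid search. `documents` is assumed to be a list of Document objects
# from the surrounding project, and the parameter grid values are placeholders.
from sklearn.model_selection import GridSearchCV

param_grid = {'n_topics': [10, 20], 'm': [0.5, 1.0], 'max_iter': [20]}
search = GridSearchCV(SegmentationEngine(a=0.1, b=0.01, random_state=0),
                      param_grid, cv=3)
search.fit(documents)   # no labels: reference boundaries live on the Document objects
print(search.best_params_, search.best_score_)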