Exemplo n.º 1
0
    def step3_cal():
        """计算各簇的特征词、代表文本、去重, 更新簇的大小、增幅信息
        """
        print '[%s] ' % ts2datetime(int(time.time(
        ))), 'event ', eventid, ' %s start step3' % ts2datetime(timestamp)

        inputs = []
        subevents = event.getSubEvents()
        for subevent in subevents:
            subeventid = subevent["_id"]
            inputs.extend(event.getSubeventInfos(subeventid))

        for r in inputs:
            r["title"] = r["title"].encode("utf-8")
            r["content"] = r["content168"].encode("utf-8")
            r["label"] = r["subeventid"]

        # 计算各簇的存量特征词
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            feature = Feature(label)
            feature.upsert_newest(fwords)

        # 计算文本权重
        for r in inputs:
            weight = text_weight_cal(r, cluster_feature[r['label']])
            news = News(r["_id"], event.id)
            news.update_news_weight(weight)

        # 文本去重
        items_dict = {}
        for r in inputs:
            try:
                items_dict[r["label"]].append(r)
            except KeyError:
                items_dict[r["label"]] = [r]

        for label, items in items_dict.iteritems():
            results = duplicate(items)
            for r in results:
                news = News(r["_id"], event.id)
                news.update_news_duplicate(r["duplicate"], r["same_from"])

            # 更新簇的大小、增幅信息
            before_size = event.get_subevent_size(label)
            event.update_subevent_size(label, len(items))
            event.update_subevent_addsize(label, len(items) - before_size)

        if initializing:
            # 更新事件状态由initializing变为active
            event.activate()

        print '[%s] ' % ts2datetime(int(time.time(
        ))), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)
Exemplo n.º 2
0
    def step3_cal():
        """计算各簇的特征词、代表文本、去重, 更新簇的大小、增幅信息
        """
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step3' % ts2datetime(timestamp)

        inputs = []
        subevents = event.getSubEvents()
        for subevent in subevents:
            subeventid = subevent["_id"]
            inputs.extend(event.getSubeventInfos(subeventid))

        for r in inputs:
            r["title"] = r["title"].encode("utf-8")
            r["content"] = r["content168"].encode("utf-8")
            r["label"] = r["subeventid"]

        # 计算各簇的存量特征词
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            feature = Feature(label)
            feature.upsert_newest(fwords)

        # 计算文本权重
        for r in inputs:
            weight = text_weight_cal(r, cluster_feature[r['label']])
            news = News(r["_id"], event.id)
            news.update_news_weight(weight)

        # 文本去重
        items_dict = {}
        for r in inputs:
            try:
                items_dict[r["label"]].append(r)
            except KeyError:
                items_dict[r["label"]] = [r]

        for label, items in items_dict.iteritems():
            results = duplicate(items)
            for r in results:
                news = News(r["_id"], event.id)
                news.update_news_duplicate(r["duplicate"], r["same_from"])

            # 更新簇的大小、增幅信息
            before_size = event.get_subevent_size(label)
            event.update_subevent_size(label, len(items))
            event.update_subevent_addsize(label, len(items) - before_size)

        if initializing:
            # 更新事件状态由initializing变为active
            event.activate()

        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)