Example #1
def load_subject_weibo_data():
    """加载客观微博数据
    """
    # topicname = u'外滩踩踏-微博'
    # topicname = u'呼格案-微博'
    # topicname = u'复旦投毒案-微博'
    topicname = u'APEC-微博'

    em = EventManager()
    topicid = em.getEventIDByName(topicname)
    event = Event(topicid)

    # f = open('caitai.jl')
    # f = open('huge.jl')
    # f = open('fudan.jl')
    f = open('apec.jl')
    for line in f:
        item = json.loads(line.strip())
        item['text'] = item['text'].encode('utf-8')
        item = subob_classifier(item)
        if not item['subject']:  # keep only posts classified as objective
            news = subject_weibo2news(item)
            event.saveItem(news)

    f.close()
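# A minimal, hypothetical sketch of the JSON Lines ('.jl') input the loader
# above assumes: one JSON object per line with at least a 'text' field.
# Field names other than 'text' are illustrative, not taken from the original code.
import json

sample_items = [
    {'text': u'sample weibo text one', 'mid': '1001'},
    {'text': u'sample weibo text two', 'mid': '1002'},
]

with open('sample.jl', 'w') as out:
    for item in sample_items:
        out.write(json.dumps(item) + '\n')

with open('sample.jl') as f:
    for line in f:
        print(json.loads(line.strip())['text'])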
Example #2
def eventriver():
    """event river数据
    """
    topic_name = request.args.get('query', default_topic)  # topic name
    sort = request.args.get('sort',
                            'tfidf')  # weight, addweight, created_at, tfidf
    end_ts = request.args.get('ts', None)
    during = request.args.get('during', None)

    if end_ts:
        end_ts = int(end_ts)

    start_ts = None  # avoid a NameError below when 'during' is not given
    if during:
        during = int(during)
        start_ts = end_ts - during

    topicid = em.getEventIDByName(topic_name)
    event = Event(topicid)
    subeventlist, dates, total_weight = event.getEventRiverData(start_ts,
                                                                end_ts,
                                                                sort=sort)

    return json.dumps({
        "dates": dates,
        "name": topic_name,
        "type": "eventRiver",
        "weight": total_weight,
        "eventList": subeventlist
    })
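# Hypothetical client call for the view above, assuming it is exposed at
# /eventriver (the route decorator is not shown in the snippet). The accepted
# query parameters come straight from the code: query, sort, ts, during.
import requests

resp = requests.get('http://localhost:5000/eventriver', params={
    'query': u'APEC-微博',      # topic name
    'sort': 'tfidf',            # one of: weight, addweight, created_at, tfidf
    'ts': 1418572800,           # end timestamp in seconds (illustrative value)
    'during': 6 * 3600,         # window length in seconds; start_ts = ts - during
})
data = resp.json()              # {"dates": ..., "name": ..., "type": "eventRiver", ...}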
Example #3
def one_topic_clear(topicname_start):
    topicname, start_datetime = topicname_start
    em = EventManager()
    topicid = em.getEventIDByName(topicname)
    start_ts = datetime2ts(start_datetime)

    event = Event(topicid)
    event.initialize(start_ts)
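# The single-tuple argument suggests one_topic_clear is meant to be mapped over
# a list of (topicname, start_datetime) pairs, e.g. with a worker pool.
# A minimal, hypothetical driver; the topic list reuses values that appear
# elsewhere in these examples.
from multiprocessing import Pool

topics = [
    (u'APEC-微博', '2014-12-15 00:00:00'),
    (u'外滩踩踏-微博', '2015-01-02 00:00:00'),
]

if __name__ == '__main__':
    pool = Pool(processes=2)
    pool.map(one_topic_clear, topics)
    pool.close()
    pool.join()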
Example #4
def initializeNewsTopic():
    """初始化新闻话题
    """
    em = EventManager()

    topicname = u'外滩踩踏'
    start_datetime = "2015-01-02 00:00:00"
    topicid = em.getEventIDByName(topicname)
    start_ts = datetime2ts(start_datetime)

    event = Event(topicid)
    event.initialize(start_ts)
Example #5
def createEvent(name, begin, end, location, description, leader, img):
    # Generate a random 6-character code and hash it to use as the event's QR code value.
    size = 6
    chars = string.ascii_uppercase + string.digits
    unHashed = ''.join(random.choice(chars) for _ in range(size))
    qr_code = pbkdf2_sha256.hash(unHashed)

    # Reject the request if any required field is empty.
    if (name == '' or
            begin == '' or
            end == '' or
            location == '' or
            description == '' or
            leader == '' or
            img == ''):
        return 400

    event = Event(
        name=name,
        begin=begin,
        end=end,
        location=location,
        desc=description,
        leader=leader,
        cancel=0,
        img=img,
        qr_code=qr_code,
        created=datetime.datetime.now(),
        link=None
    )
    return Persister.persist_object(event)
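# Standalone sketch of the qr_code value generated above: a random 6-character
# code hashed with passlib's pbkdf2_sha256. The verify() call only illustrates
# how such a hash would later be checked; the original snippet does not do that.
import random
import string
from passlib.hash import pbkdf2_sha256

def make_qr_code(size=6):
    chars = string.ascii_uppercase + string.digits
    un_hashed = ''.join(random.choice(chars) for _ in range(size))
    return un_hashed, pbkdf2_sha256.hash(un_hashed)

code, hashed = make_qr_code()
assert pbkdf2_sha256.verify(code, hashed)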
Example #6
def __make_database():
    engine = create_engine(f'sqlite:///{DATABASE_NAME}', echo=False)
    metadata = MetaData()
    users_table = Table(USER_TABLE_NAME, metadata,
                        *User.get_all_columns_with_attributes())
    events_table = Table(EVENT_TABLE_NAME, metadata,
                         *Event.get_all_columns_with_attributes())
    metadata.create_all(engine)
    __fill_database()
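# Self-contained sketch of the same SQLAlchemy Core pattern: Table objects
# register themselves on the MetaData instance, so metadata.create_all() builds
# every declared table (which is why users_table/events_table need not be used
# again). The Column lists below are illustrative stand-ins for
# User.get_all_columns_with_attributes() / Event.get_all_columns_with_attributes().
from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine

engine = create_engine('sqlite:///:memory:', echo=False)
metadata = MetaData()

users = Table('users', metadata,
              Column('id', Integer, primary_key=True),
              Column('name', String, nullable=False))
events = Table('events', metadata,
               Column('id', Integer, primary_key=True),
               Column('title', String),
               Column('owner_id', Integer))

metadata.create_all(engine)  # issues CREATE TABLE for both tables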
Example #7
def handle_error(eventid):
    """
    """
    event = Event(eventid)
    last_modify = event.getLastmodify()

    event.setLastmodify(last_modify - 3600)
    event.setModifysuccess(True)
Example #8
def initializeWeiboTopic():
    """初始化weibo话题
    """
    em = EventManager()

    # topicname = u'外滩踩踏-微博'
    # start_datetime = "2015-01-02 00:00:00"
    # topicname = u'呼格案-微博'
    # start_datetime = "2014-12-14 00:00:00"
    # topicname = u'复旦投毒案-微博'
    # start_datetime = "2014-12-15 00:00:00"
    topicname = u'APEC-微博'
    start_datetime = "2014-12-15 00:00:00"

    topicid = em.getEventIDByName(topicname)
    start_ts = datetime2ts(start_datetime)

    event = Event(topicid)
    event.initialize(start_ts)
Example #9
def addEvent(self, title, description, project, beginDate, endDate):
    """Create an Event for an existing project; return False if the project does not exist."""
    if persister.checkProjectExists(project):
        currentDate = datetime.datetime.now()
        eventObject = Event(title=title,
                            description=description,
                            project=project,
                            beginDate=beginDate,
                            endDate=endDate,
                            createdAt=currentDate)
        return persister.storeObject(eventObject)
    return False
Example #10
def one_topic_merge(eventid_initializing):
    """合并簇
       input:
           eventid_initializing: (eventid, initializing)
               eventid: 话题ID
               initializing: 是否做初始聚类
    """
    eventid, initializing = eventid_initializing

    # Instantiate the topic by its ID
    event = Event(eventid)

    timestamp = event.getLastmodify() + 3600  # current timestamp (int); defaults to the last-modified time + 3600
    now_hour = int(time.strftime('%H', time.localtime(timestamp)))

    subevents = event.getSubEvents()
    subevent_fwords = dict()
    for subevent in subevents:
        subeventid = subevent["_id"]
        feature = Feature(subeventid)

        # Get the newest feature words of each sub-event
        fwords = feature.get_newest()
        subevent_fwords[subeventid] = fwords

    subeventids_sort_timestamp = event.get_sorted_subeventids()

    cids, mids = merge_subevents(subevent_fwords,
                                 subeventids_sort_timestamp,
                                 top_tfidf_para=10,
                                 top_percent=0.3)

    for res_id, mer_id in mids:
        # Move the texts under mer_id into the res_id cluster and remove the mer_id cluster; each cluster's feature words, text weights and deduplication are then recomputed
        temp_infos = event.get_subevent_infos(mer_id)

        for r in temp_infos:
            news = News(r["_id"], event.id)
            news.update_news_subeventid(res_id)

        event.remove_subevents([mer_id])
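# Toy illustration of the merge bookkeeping above, with a plain dict standing in
# for the News/Event store: for every (res_id, mer_id) pair returned by
# merge_subevents, documents labelled mer_id are relabelled res_id and the
# mer_id cluster disappears. The data here is hypothetical.
doc_labels = {'d1': 'c1', 'd2': 'c2', 'd3': 'c2', 'd4': 'c3'}
mids = [('c1', 'c2')]  # merge cluster c2 into cluster c1

for res_id, mer_id in mids:
    for doc_id, label in doc_labels.items():
        if label == mer_id:
            doc_labels[doc_id] = res_id

print(doc_labels)  # d2 and d3 are now labelled c1; cluster c2 is gone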
Example #11
def one_topic_calculation(eventid_initializing):
    """多步计算
       input:
           eventid_initializing: (eventid, initializing)
               eventid: 话题ID
               initializing: 是否做初始聚类
    """
    eventid, initializing = eventid_initializing

    # Instantiate the topic by its ID
    event = Event(eventid)

    timestamp = event.getLastmodify() + 3600  # current timestamp (int); defaults to the last-modified time + 3600
    now_hour = int(time.strftime('%H', time.localtime(timestamp)))

    def step1_cal():
        """第一步计算,获取子事件特征词,新文本与特征词匹配分类
        """
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step1' % ts2datetime(timestamp)

        if initializing:
            # If the topic needs initial clustering, fetch the texts from before the topic's start time
            results = event.getInitialInfos()
        else:
            # If the topic has finished initial clustering, fetch its texts from the latest hour
            results = event.getInfos(timestamp - 3600, timestamp)

            if now_hour == 0:
                # When not initializing, at hour 0 always re-match the texts under the clusters (other than the "other" cluster) created that day (0:00 <= t < 24:00), and delete those clusters
                temp_subeventids = event.getTodayCreatSubeventIds()
                temp_infos = event.getTodayCreatSubeventInfos()
                event.remove_subevents(temp_subeventids)
                results.extend(temp_infos)

        print eventid, ' before classify: ', len(results)

        # Get the sub-events
        subevents = event.getSubEvents()
        labels_list = []
        feature_words_list = []
        for subevent in subevents:
            subeventid = subevent["_id"]
            feature = Feature(subeventid)

            # Get the newest feature words of each sub-event
            fwords = feature.get_newest()
            feature_words_list.append(fwords)
            labels_list.append(subeventid)

        for r in results:
            text = (r['title'] + r['content168']).encode('utf-8')
            feature_words_inputs = []
            for fwords in feature_words_list:
                wcdict = dict()
                for w, c in fwords.iteritems():
                    if isinstance(w, unicode):
                        w = w.encode('utf-8')
                    wcdict[w] = c
                feature_words_inputs.append(wcdict)

            # Match the text against each sub-event's feature words to get the text's cluster label
            label = subevent_classifier(text, labels_list, feature_words_inputs)

            if label == "other":
                label = event.getOtherSubEventID()

            news = News(r["_id"], event.id)
            news.update_news_subeventid(label)

        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step1' % ts2datetime(timestamp)

    def step2_cal():
        """第二步计算,判断其他类是否需要分裂,若需要,则对其他类进行文本聚类,并做聚类评价
        """
        # Number of top high-frequency words used in cluster evaluation
        TOPK_FREQ_WORD = 50
        # Minimum cluster size kept in cluster evaluation
        LEAST_SIZE = 8

        # Decide whether the "other" cluster needs to split
        ifsplit = event.check_ifsplit(initializing)
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' split ', ifsplit, ' %s start step2' % ts2datetime(timestamp)

        if ifsplit:
            inputs, kmeans_cluster_num, reserve_num = event.getOtherSubEventInfos(initializing)
            print eventid, ' after classify before split: ', len(inputs), kmeans_cluster_num, reserve_num
            if len(inputs) > 2:
                items = []
                for r in inputs:
                    r["title"] = r["title"].encode("utf-8")
                    r["content"] = r["content168"].encode("utf-8")
                    items.append(r)

                # k-means clustering
                kmeans_results = kmeans(items, k=kmeans_cluster_num)

                # Cluster evaluation
                if initializing or now_hour == 0:
                    min_tfidf = event.get_min_tfidf()
                    final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE, min_tfidf=min_tfidf)
                else:
                    # For the hourly clustering, do not compare against the minimum tfidf of existing clusters
                    final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE)

                # Update the news cluster labels and the sub-event table
                for label, items in final_cluster_results.iteritems():
                    if label == "other":
                        label = event.getOtherSubEventID()

                    event.save_subevent(label, timestamp)

                    if label != event.getOtherSubEventID():
                        # Update this cluster's tfidf
                        event.update_subevent_tfidf(label, tfidf_dict[label])

                    for r in items:
                        news = News(r["_id"], event.id)
                        news.update_news_subeventid(label)
            else:
                print 'inputs less than 2, kmeans aborted'

        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step2' % ts2datetime(timestamp)

    def step3_cal():
        """计算各簇的特征词、代表文本、去重, 更新簇的大小、增幅信息
        """
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step3' % ts2datetime(timestamp)

        inputs = []
        subevents = event.getSubEvents()
        for subevent in subevents:
            subeventid = subevent["_id"]
            inputs.extend(event.getSubeventInfos(subeventid))

        for r in inputs:
            r["title"] = r["title"].encode("utf-8")
            r["content"] = r["content168"].encode("utf-8")
            r["label"] = r["subeventid"]

        # Compute each cluster's cumulative feature words
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            feature = Feature(label)
            feature.upsert_newest(fwords)

        # Compute text weights
        for r in inputs:
            weight = text_weight_cal(r, cluster_feature[r['label']])
            news = News(r["_id"], event.id)
            news.update_news_weight(weight)

        # Deduplicate texts
        items_dict = {}
        for r in inputs:
            try:
                items_dict[r["label"]].append(r)
            except KeyError:
                items_dict[r["label"]] = [r]

        for label, items in items_dict.iteritems():
            results = duplicate(items)
            for r in results:
                news = News(r["_id"], event.id)
                news.update_news_duplicate(r["duplicate"], r["same_from"])

            # Update the cluster's size and growth
            before_size = event.get_subevent_size(label)
            event.update_subevent_size(label, len(items))
            event.update_subevent_addsize(label, len(items) - before_size)

        if initializing:
            # Switch the event status from initializing to active
            event.activate()

        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)

    def step4_cal():
        """ 24 点时merge已有的簇
        """
        if not initializing and now_hour == 0:
            print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step4' % ts2datetime(timestamp)

            subevents = event.getSubEvents()
            subevent_fwords = dict()
            for subevent in subevents:
                subeventid = subevent["_id"]
                feature = Feature(subeventid)

                # Get the newest feature words of each sub-event
                fwords = feature.get_newest()
                subevent_fwords[subeventid] = fwords

            subeventids_sort_timestamp = event.get_sorted_subeventids()

            cids, mids = merge_subevents(subevent_fwords, subeventids_sort_timestamp, top_tfidf_para=10, top_percent=0.3)

            for res_id, mer_id in mids:
                # Move the texts under mer_id into the res_id cluster and remove the mer_id cluster
                temp_infos = event.get_subevent_infos(mer_id)

                for r in temp_infos:
                    news = News(r["_id"], event.id)
                    news.update_news_subeventid(res_id)

                event.remove_subevents([mer_id])

            # Recompute each cluster's feature words, recompute text weights, and deduplicate
            if len(mids):
                step3_cal()

            print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step4' % ts2datetime(timestamp)

    # First check whether the event's most recent modification succeeded
    success = event.checkLastModify()
    if success:
        """
        step1_cal()
        step2_cal()
        step3_cal()
        step4_cal()
        """
        try:
            # Run the multi-step computation
            step1_cal()
            step2_cal()
            step3_cal()
            step4_cal()
            event.setLastmodify(timestamp)  # update the event's last_modify
            event.setModifysuccess(True)  # set the event's modify_success to True
        except Exception, e:
            # On an error during computation, update last_modify and set modify_success to False
            print '[Error]: ', e
            event.setLastmodify(timestamp)
            event.setModifysuccess(False)
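# Hypothetical driver for the pipeline above: one_topic_calculation takes an
# (eventid, initializing) tuple, with initializing=True only for a topic's
# first run, so several topics can simply be looped over (or pool.map-ed).
# The topic name reuses a value that appears elsewhere in these examples.
if __name__ == '__main__':
    em = EventManager()
    jobs = [
        (em.getEventIDByName(u'APEC-微博'), False),  # False: hourly incremental run
    ]
    for job in jobs:
        one_topic_calculation(job)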
Example #12
def test_subevent_classifier():
    labels_list = []
    feature_words_inputs = []
    subevents = event.getSubEvents()
    for subevent in subevents:
        subeventid = subevent["_id"]
        if subeventid != "575612b6-a26f-4df9-a2de-01c85cae56a2":
            labels_list.append(subeventid)
            feature = Feature(subeventid)
            feature_words = feature.get_newest()
            new_feature_words = dict()
            for k, v in feature_words.iteritems():
                new_feature_words[k.encode('utf-8')] = v
            feature_words_inputs.append(new_feature_words)

    news_id = "http://news.xinhuanet.com/comments/2014-11/03/c_1113084515.htm"
    news = News(news_id, event.id)
    ns = news.get_news_info()
    text = ns['title'].encode('utf-8') + ns['content168'].encode('utf-8')
    label = subevent_classifier(text, labels_list, feature_words_inputs)

    print label

if __name__ == '__main__':
    from bson.objectid import ObjectId
    eventid = ObjectId("54916b0d955230e752f2a94e")
    event = Event(eventid)

    test_subevent_classifier()