Example #1
 def action_edititem(self, user, timestamp, itemid, meta):
     from item.models import Item
     i = Item.get(itemid)
     if not i:
         logger.debug('ignore edititem for unknown item %s %s', timestamp, itemid)
         return True
     if i.timestamp > timestamp:
         logger.debug('ignore edititem change %s %s %s', timestamp, itemid, meta)
         return True
     primary = None
     if 'primaryid' in meta:
         primary = meta['primaryid']
         key = primary[0]
     else:
         keys = [k for k in meta if k in Item.id_keys]
         if keys:
             key = keys[0]
             primary = [key, meta[key]]
     if primary:
         if not meta[key] and i.meta.get('primaryid', [''])[0] == key:
             logger.debug('remove id mapping %s %s', i.id, primary)
             i.update_primaryid(*primary, scrape=False)
             i.modified = ts2datetime(timestamp)
         elif meta[key] and i.meta.get('primaryid') != primary:
             logger.debug('edit mapping %s %s', i.id, primary)
             i.update_primaryid(*primary, scrape=False)
             i.modified = ts2datetime(timestamp)
     else:
         i.update_meta(meta)
         i.modified = ts2datetime(timestamp)
     i.save()
     user.clear_smart_list_cache()
     return True
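
All of the examples on this page call ts2datetime, and several also call ts2date. Those helpers come from the respective projects and are not shown here. As a point of reference only, here is a minimal sketch of what they presumably do, assuming they wrap datetime.fromtimestamp; some of the projects may instead return a datetime object rather than a formatted string.

import time
from datetime import datetime

def ts2datetime(ts):
    # Hypothetical helper: Unix timestamp (seconds) -> 'YYYY-MM-DD HH:MM:SS' string.
    return datetime.fromtimestamp(int(ts)).strftime('%Y-%m-%d %H:%M:%S')

def ts2date(ts):
    # Hypothetical helper: Unix timestamp (seconds) -> 'YYYY-MM-DD' string.
    return datetime.fromtimestamp(int(ts)).strftime('%Y-%m-%d')

print ts2datetime(int(time.time()))  # e.g. 2014-05-01 12:34:56
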
Example #2
    def step4_cal():
        """ 24 点时merge已有的簇
        """
        if not initializing and now_hour == 0:
            print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step4' % ts2datetime(timestamp)

            subevents = event.getSubEvents()
            subevent_fwords = dict()
            for subevent in subevents:
                subeventid = subevent["_id"]
                feature = Feature(subeventid)

                # Get the newest feature words of each sub-event
                fwords = feature.get_newest()
                subevent_fwords[subeventid] = fwords

            subeventids_sort_timestamp = event.get_sorted_subeventids()

            cids, mids = merge_subevents(subevent_fwords, subeventids_sort_timestamp, top_tfidf_para=10, top_percent=0.3)

            for res_id, mer_id in mids:
                # Move the texts under mer_id into the res_id cluster, then remove the mer_id cluster
                temp_infos = event.get_subevent_infos(mer_id)

                for r in temp_infos:
                    news = News(r["_id"], event.id)
                    news.update_news_subeventid(res_id)

                event.remove_subevents([mer_id])

            # Recompute each cluster's feature words, recompute text weights, and deduplicate
            if len(mids):
                step3_cal()

            print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step4' % ts2datetime(timestamp)
Example #3
    def step1_cal():
        """第一步计算,获取子事件特征词,新文本与特征词匹配分类
        """
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step1' % ts2datetime(timestamp)

        if initializing:
            # If the topic still needs initial clustering, fetch the texts from before the topic start time
            results = event.getInitialInfos()
        else:
            # If initial clustering is already done, fetch the topic's texts from the latest hour
            results = event.getInfos(timestamp - 3600, timestamp)

            if now_hour == 0:
                # When not initializing, at hour 24 re-match the texts of the clusters (other than the "other" cluster) created during the current day (hour >= 0 and < 24), and delete those clusters
                temp_subeventids = event.getTodayCreatSubeventIds()
                temp_infos = event.getTodayCreatSubeventInfos()
                event.remove_subevents(temp_subeventids)
                results.extend(temp_infos)

        print eventid, ' before classify: ', len(results)

        # Get the sub-events
        subevents = event.getSubEvents()
        labels_list = []
        feature_words_list = []
        for subevent in subevents:
            subeventid = subevent["_id"]
            feature = Feature(subeventid)

            # Get the newest feature words of each sub-event
            fwords = feature.get_newest()
            feature_words_list.append(fwords)
            labels_list.append(subeventid)

        for r in results:
            text = (r['title'] + r['content168']).encode('utf-8')
            feature_words_inputs = []
            for fwords in feature_words_list:
                wcdict = dict()
                for w, c in fwords.iteritems():
                    if isinstance(w, unicode):
                        w = w.encode('utf-8')
                    wcdict[w] = c
                feature_words_inputs.append(wcdict)

            # Match the single text against each sub-event's feature words to obtain its cluster label
            label = subevent_classifier(text, labels_list, feature_words_inputs)

            if label == "other":
                label = event.getOtherSubEventID()

            news = News(r["_id"], event.id)
            news.update_news_subeventid(label)

        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step1' % ts2datetime(timestamp)
Example #4
    def step3_cal():
        """计算各簇的特征词、代表文本、去重, 更新簇的大小、增幅信息
        """
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step3' % ts2datetime(timestamp)

        inputs = []
        subevents = event.getSubEvents()
        for subevent in subevents:
            subeventid = subevent["_id"]
            inputs.extend(event.getSubeventInfos(subeventid))

        for r in inputs:
            r["title"] = r["title"].encode("utf-8")
            r["content"] = r["content168"].encode("utf-8")
            r["label"] = r["subeventid"]

        # Compute the accumulated feature words of each cluster
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            feature = Feature(label)
            feature.upsert_newest(fwords)

        # Compute the weight of each text
        for r in inputs:
            weight = text_weight_cal(r, cluster_feature[r['label']])
            news = News(r["_id"], event.id)
            news.update_news_weight(weight)

        # Deduplicate texts
        items_dict = {}
        for r in inputs:
            try:
                items_dict[r["label"]].append(r)
            except KeyError:
                items_dict[r["label"]] = [r]

        for label, items in items_dict.iteritems():
            results = duplicate(items)
            for r in results:
                news = News(r["_id"], event.id)
                news.update_news_duplicate(r["duplicate"], r["same_from"])

            # Update cluster size and growth information
            before_size = event.get_subevent_size(label)
            event.update_subevent_size(label, len(items))
            event.update_subevent_addsize(label, len(items) - before_size)

        if initializing:
            # Switch the event status from initializing to active
            event.activate()

        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)
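
The loop above that builds items_dict with a try/except KeyError is the classic pre-defaultdict grouping idiom. The same grouping can be written with the standard library's collections.defaultdict (a sketch, not the project's code):

from collections import defaultdict

# Group the input records by their cluster label.
items_dict = defaultdict(list)
for r in inputs:
    items_dict[r["label"]].append(r)
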
Example #5
    def step2_cal():
        """第二步计算,判断其他类是否需要分裂,若需要,则对其他类进行文本聚类,并做聚类评价
        """
        # Number of top-frequency words used in cluster evaluation
        TOPK_FREQ_WORD = 50
        # Minimum cluster size accepted in cluster evaluation
        LEAST_SIZE = 8

        # Decide whether the "other" cluster needs to be split
        ifsplit = event.check_ifsplit(initializing)
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' split ', ifsplit, ' %s start step2' % ts2datetime(timestamp)

        if ifsplit:
            inputs, kmeans_cluster_num, reserve_num = event.getOtherSubEventInfos(initializing)
            print eventid, ' after classify before split: ', len(inputs), kmeans_cluster_num, reserve_num
            if len(inputs) > 2:
                items = []
                for r in inputs:
                    r["title"] = r["title"].encode("utf-8")
                    r["content"] = r["content168"].encode("utf-8")
                    items.append(r)

                # k-means clustering
                kmeans_results = kmeans(items, k=kmeans_cluster_num)

                # Cluster evaluation
                if initializing or now_hour == 0:
                    min_tfidf = event.get_min_tfidf()
                    final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE, min_tfidf=min_tfidf)
                else:
                    # For the hourly clustering, do not compare against the minimum tfidf of existing clusters
                    final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE)

                # Update the news cluster labels and the sub-event table
                for label, items in final_cluster_results.iteritems():
                    if label == "other":
                        label = event.getOtherSubEventID()

                    event.save_subevent(label, timestamp)

                    if label != event.getOtherSubEventID():
                        # 更新每类的tfidf
                        event.update_subevent_tfidf(label, tfidf_dict[label])

                    for r in items:
                        news = News(r["_id"], event.id)
                        news.update_news_subeventid(label)
            else:
                print 'inputs less than 2, kmeans aborted'

        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step2' % ts2datetime(timestamp)
Example #6
def subject_weibo2news(item):
    news = dict()
    for field in NULL_FIELDS:
        news[field] = None

    for k, v in item.iteritems():
        if k == 'timestamp':
            news['timestamp'] = v
            news['date'] = ts2date(v)
            news['datetime'] = ts2datetime(v)
        if k == '_id':
            news['_id'] = v
            news['id'] = v
        if k == 'reposts_count':
            news['replies'] = v
        if k == 'comments_count':
            news['same_news_num'] = v
        if k == 'name':
            news['news_author'] = v
            news['user_name'] = v
        if k == 'user':
            news['user_id'] = v
        if k == 'text':
            text = v
            news['title'] = '【' + re.search(r'【(.*?)】', str(text)).group(1) + '】'
            news['content168'] = text.replace(news['title'], '')
        if k == 'weibourl':
            news['showurl'] = v

    return news
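
For context, here is a hypothetical round trip through the title extraction used above. Only the 【…】 regex comes from the code itself; the sample text, and the use of a unicode pattern instead of str(text), are assumptions made so the snippet runs on its own:

# -*- coding: utf-8 -*-
import re

text = u'【Example headline】 body of the weibo post'

# Pull the bracketed headline out as the title; the remainder becomes content168.
title = u'【' + re.search(u'【(.*?)】', text).group(1) + u'】'
content168 = text.replace(title, u'')

print title       # 【Example headline】
print content168  #  body of the weibo post
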
Example #7
def object_weibo2comment(item):
    comment = dict()
    for field in NULL_FIELDS:
        comment[field] = None
    comment['news_id'] = 'weibo'
    for k, v in item.iteritems():
        if k == 'timestamp':
            comment['timestamp'] = v
            comment['date'] = ts2date(v)
            comment['datetime'] = ts2datetime(v)
        if k == '_id':
            comment['_id'] = v
            comment['id'] = v
        if k == 'reposts_count':
            comment[k] = v
        if k == 'comments_count':
            comment[k] = v
        if k == 'attitudes_count':
            comment[k] = v
        if k == 'name':
            comment['user_name'] = v
        if k == 'weibourl':
            comment['comment_source'] = v
        if k == 'text':
            text = v
            comment['content168'] = text

    return comment
Example #8
 def action_additem(self, user, timestamp, itemid, info):
     from item.models import Item
     i = Item.get(itemid)
     if i:
         if user not in i.users:
             i.add_user(user)
             i.update()
     else:
         i = Item.get_or_create(itemid, info)
         i.modified = ts2datetime(timestamp)
         if user not in i.users:
             i.add_user(user)
         i.update()
     user.clear_smart_list_cache()
     return True
Example #9
def transform(item):
    result = dict()
    result['reposts_count'] = item['reposts_count']
    result['user_comment_url'] = item['weibourl']
    result['comment_source'] = item['weibourl']
    result['first_in'] = None
    result['last_modify'] = None
    result['timestamp'] = item['timestamp']
    result['content168'] = item['text']
    result['datetime'] = ts2datetime(item['timestamp'])
    result['news_id'] = 'weibo'
    result['attitudes_count'] = item['attitudes_count']
    result['news_content'] = None
    result['comments_count'] = item['comments_count']
    result['location'] = item['geo']
    result['date'] = ts2date(item['timestamp'])
    result['_id'] = item['_id']
    result['id'] = item['_id']
    result['user_name'] = item['name']
    return result
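
transform assumes every field is present on the incoming item and raises KeyError otherwise. A slightly more defensive variant of the same mapping (a sketch, not the project's code) falls back to None for missing keys:

def transform_safe(item):
    # Same field mapping as transform(), but missing keys become None instead
    # of raising KeyError; ts2datetime/ts2date as in the other examples.
    ts = item.get('timestamp')
    return {
        'reposts_count': item.get('reposts_count'),
        'user_comment_url': item.get('weibourl'),
        'comment_source': item.get('weibourl'),
        'first_in': None,
        'last_modify': None,
        'timestamp': ts,
        'content168': item.get('text'),
        'datetime': ts2datetime(ts) if ts else None,
        'news_id': 'weibo',
        'attitudes_count': item.get('attitudes_count'),
        'news_content': None,
        'comments_count': item.get('comments_count'),
        'location': item.get('geo'),
        'date': ts2date(ts) if ts else None,
        '_id': item.get('_id'),
        'id': item.get('_id'),
        'user_name': item.get('name'),
    }
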
Example #10
        "source_website": "zhengyi_search",
    }

    count = mongo.boatcol.find(query_dict).count()
    print "zhengyi news %s count: " % sort_field, count
    results = mongo.boatcol.find(query_dict).sort(sort_field, pymongo.DESCENDING)

    results = [r for r in results]

    for r in results:
        r['title'] = r['title'].encode('utf-8')
        r['content'] = r['summary'].encode('utf-8')

    fw = csv.writer(open(result_path + 'zhengyi_news_sort_%s_%s_%s.csv' % (sort_field, START_DATETIME, END_DATETIME), 'wb'), delimiter='^')
    fw.writerow(ZHENGYI_NEWS_KEYS)
    for r in results:
        csvrow = []
        for key in ZHENGYI_NEWS_KEYS:
            csvrow.append(_encode_utf8(r[key]))

        fw.writerow(csvrow)


if __name__ == "__main__":
    print "recommend begins..."
    sheqi_rec(sort_field='hot')
    sheqi_rec(sort_field='timestamp')
    sheqi_rec(sort_field='sensi')
    zhengyi_rec(sort_field='timestamp')
    print "[%s] recommend ends..." % ts2datetime(int(time.time()))
Example #11
    flag = '1'
    test(inputs, flag)
    label = choose_ad(flag)

    return label

def _encode_utf8(us):
    if isinstance(us, unicode):
        us = us.encode('utf-8')

    if not us:
        us = ''

    return us

print "[%s] rubbish filter begins..." % ts2datetime(int(time.time()))

mongo = _default_mongo()

module_keywords = get_module_keywords()

for bankuai, lanmu, source, source_en, keywords_file in module_keywords:
    query_dict = {
        "timestamp": {
            "$gte": START_TS,
            "$lt": END_TS
        },
        "keywords_hit": True
    }

    if source_en == "weibo_api_search_spider":
Example #12
    return us

def get_keywords(file_name):
    f = open('../source/' + file_name)
    hit_keywords = []
    for line in f:
        ks = line.strip().strip('(').strip(')').split(' | ')
        ks_list = [word.split(' ') for word in ks]
        for k in ks_list:
            hit_keywords.append(k)

    f.close()

    return hit_keywords

print "[%s] post filter begins..." % ts2datetime(int(time.time()))

mongo = _default_mongo()

module_keywords = get_module_keywords()

for bankuai, lanmu, source, source_en, keywords_file in module_keywords:
    query_dict = {
        "timestamp": {
            "$gte": START_TS,
            "$lt": END_TS
        }
    }

    hit_keywords = get_keywords(keywords_file)
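
For reference, this is what get_keywords above does with a single line of a keyword file; the example line is an assumption, since only the strip and split calls reveal the expected format:

line = '(apple pie | banana split)\n'
ks = line.strip().strip('(').strip(')').split(' | ')
print [word.split(' ') for word in ks]  # [['apple', 'pie'], ['banana', 'split']]
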