def action_edititem(self, user, timestamp, itemid, meta):
    from item.models import Item
    i = Item.get(itemid)
    if not i:
        logger.debug('ignore edititem for unknown item %s %s', timestamp, itemid)
        return True
    if i.timestamp > timestamp:
        logger.debug('ignore edititem change %s %s %s', timestamp, itemid, meta)
        return True
    # Resolve the primary id: either passed explicitly as 'primaryid',
    # or derived from the first known id key present in meta.
    primary = None
    key = None
    if 'primaryid' in meta:
        primary = meta['primaryid']
        key = primary[0]
    else:
        keys = [k for k in meta if k in Item.id_keys]
        if keys:
            key = keys[0]
            primary = [key, meta[key]]
    if primary:
        # meta.get(key) guards against meta lacking the key itself
        if not meta.get(key) and i.meta.get('primaryid', [''])[0] == key:
            # empty value for the current primary key: drop the id mapping
            logger.debug('remove id mapping %s %s', i.id, primary)
            i.update_primaryid(*primary, scrape=False)
            i.modified = ts2datetime(timestamp)
        elif meta.get(key) and i.meta.get('primaryid') != primary:
            logger.debug('edit mapping %s %s', i.id, primary)
            i.update_primaryid(*primary, scrape=False)
            i.modified = ts2datetime(timestamp)
    else:
        # no id involved: plain metadata edit
        i.update_meta(meta)
        i.modified = ts2datetime(timestamp)
    i.save()
    user.clear_smart_list_cache()
    return True

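# The primary-id branching above is easy to misread. Below is a small pure
# sketch of the same decision, testable without the Item model; classify_edit
# and every name in it are illustrative assumptions, not part of the module.
def classify_edit(meta, current_primary, id_keys):
    """Return 'remove', 'edit', 'meta' or 'noop' for an edititem payload."""
    if 'primaryid' in meta:
        primary = meta['primaryid']
    else:
        keys = [k for k in meta if k in id_keys]
        primary = [keys[0], meta[keys[0]]] if keys else None
    if not primary:
        return 'meta'    # no id involved: plain metadata update
    key = primary[0]
    if not meta.get(key) and current_primary[:1] == [key]:
        return 'remove'  # empty value for the current primary key: drop mapping
    if meta.get(key) and current_primary != primary:
        return 'edit'    # different id: switch the mapping
    return 'noop'

assert classify_edit({'title': 'x'}, [''], ['isbn']) == 'meta'
assert classify_edit({'isbn': '12345'}, [''], ['isbn']) == 'edit'
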
def step4_cal():
    """Step 4: at midnight (hour 24), merge the existing clusters.
    """
    if not initializing and now_hour == 0:
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step4' % ts2datetime(timestamp)
        subevents = event.getSubEvents()
        subevent_fwords = dict()
        for subevent in subevents:
            subeventid = subevent["_id"]
            feature = Feature(subeventid)
            # latest feature words of each subevent
            fwords = feature.get_newest()
            subevent_fwords[subeventid] = fwords
        subeventids_sort_timestamp = event.get_sorted_subeventids()
        cids, mids = merge_subevents(subevent_fwords, subeventids_sort_timestamp,
                                     top_tfidf_para=10, top_percent=0.3)
        for res_id, mer_id in mids:
            # move the texts under mer_id into the res_id cluster, then remove the mer_id cluster
            temp_infos = event.get_subevent_infos(mer_id)
            for r in temp_infos:
                news = News(r["_id"], event.id)
                news.update_news_subeventid(res_id)
            event.remove_subevents([mer_id])
        if len(mids):
            # recompute each cluster's feature words, text weights and dedup info
            step3_cal()
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step4' % ts2datetime(timestamp)

def step1_cal():
    """Step 1: fetch subevent feature words and classify new texts by matching against them.
    """
    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step1' % ts2datetime(timestamp)
    if initializing:
        # the topic still needs initial clustering: fetch the texts from before the topic start time
        results = event.getInitialInfos()
    else:
        # initial clustering is done: fetch the topic's latest hour of texts
        results = event.getInfos(timestamp - 3600, timestamp)
    if now_hour == 0:
        # at hour 24 (when not initializing), re-match the texts under the clusters
        # created today (0 <= hour < 24, excluding the "other" cluster) and drop those clusters
        temp_subeventids = event.getTodayCreatSubeventIds()
        temp_infos = event.getTodayCreatSubeventInfos()
        event.remove_subevents(temp_subeventids)
        results.extend(temp_infos)
    print eventid, ' before classify: ', len(results)
    # fetch the subevents
    subevents = event.getSubEvents()
    labels_list = []
    feature_words_list = []
    for subevent in subevents:
        subeventid = subevent["_id"]
        feature = Feature(subeventid)
        # latest feature words of each subevent
        fwords = feature.get_newest()
        feature_words_list.append(fwords)
        labels_list.append(subeventid)
    # encode each subevent's feature words once; they do not change per text
    feature_words_inputs = []
    for fwords in feature_words_list:
        wcdict = dict()
        for w, c in fwords.iteritems():
            if isinstance(w, unicode):
                w = w.encode('utf-8')
            wcdict[w] = c
        feature_words_inputs.append(wcdict)
    for r in results:
        text = (r['title'] + r['content168']).encode('utf-8')
        # match each text against every subevent's feature words to get its cluster label
        label = subevent_classifier(text, labels_list, feature_words_inputs)
        if label == "other":
            label = event.getOtherSubEventID()
        news = News(r["_id"], event.id)
        news.update_news_subeventid(label)
    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step1' % ts2datetime(timestamp)

def step3_cal():
    """Step 3: compute each cluster's feature words and representative texts,
    deduplicate, and update cluster size / growth info.
    """
    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step3' % ts2datetime(timestamp)
    inputs = []
    subevents = event.getSubEvents()
    for subevent in subevents:
        subeventid = subevent["_id"]
        inputs.extend(event.getSubeventInfos(subeventid))
    for r in inputs:
        r["title"] = r["title"].encode("utf-8")
        r["content"] = r["content168"].encode("utf-8")
        r["label"] = r["subeventid"]
    # compute the cumulative feature words of each cluster
    cluster_feature = extract_feature(inputs)
    for label, fwords in cluster_feature.iteritems():
        feature = Feature(label)
        feature.upsert_newest(fwords)
    # compute text weights
    for r in inputs:
        weight = text_weight_cal(r, cluster_feature[r['label']])
        news = News(r["_id"], event.id)
        news.update_news_weight(weight)
    # deduplicate texts per cluster
    items_dict = {}
    for r in inputs:
        try:
            items_dict[r["label"]].append(r)
        except KeyError:
            items_dict[r["label"]] = [r]
    for label, items in items_dict.iteritems():
        results = duplicate(items)
        for r in results:
            news = News(r["_id"], event.id)
            news.update_news_duplicate(r["duplicate"], r["same_from"])
        # update cluster size and growth info
        before_size = event.get_subevent_size(label)
        event.update_subevent_size(label, len(items))
        event.update_subevent_addsize(label, len(items) - before_size)
    if initializing:
        # flip the event state from initializing to active
        event.activate()
    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)

def step2_cal():
    """Step 2: decide whether the "other" cluster should split; if so, run text
    clustering on it and evaluate the resulting clusters.
    """
    # number of top frequency words used in cluster evaluation
    TOPK_FREQ_WORD = 50
    # minimum cluster size accepted in cluster evaluation
    LEAST_SIZE = 8
    # should the "other" cluster split?
    ifsplit = event.check_ifsplit(initializing)
    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' split ', ifsplit, ' %s start step2' % ts2datetime(timestamp)
    if ifsplit:
        inputs, kmeans_cluster_num, reserve_num = event.getOtherSubEventInfos(initializing)
        print eventid, ' after classify before split: ', len(inputs), kmeans_cluster_num, reserve_num
        if len(inputs) > 2:
            items = []
            for r in inputs:
                r["title"] = r["title"].encode("utf-8")
                r["content"] = r["content168"].encode("utf-8")
                items.append(r)
            # kmeans clustering
            kmeans_results = kmeans(items, k=kmeans_cluster_num)
            # cluster evaluation
            if initializing or now_hour == 0:
                min_tfidf = event.get_min_tfidf()
                final_cluster_results, tfidf_dict = cluster_evaluation(
                    kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD,
                    least_size=LEAST_SIZE, min_tfidf=min_tfidf)
            else:
                # hourly runs skip the comparison against the existing clusters' minimum tfidf
                final_cluster_results, tfidf_dict = cluster_evaluation(
                    kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD,
                    least_size=LEAST_SIZE)
            # update news cluster labels and the subevent table
            for label, items in final_cluster_results.iteritems():
                if label == "other":
                    label = event.getOtherSubEventID()
                event.save_subevent(label, timestamp)
                if label != event.getOtherSubEventID():
                    # update each cluster's tfidf
                    event.update_subevent_tfidf(label, tfidf_dict[label])
                for r in items:
                    news = News(r["_id"], event.id)
                    news.update_news_subeventid(label)
        else:
            print 'inputs less than 2, kmeans aborted'
    print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step2' % ts2datetime(timestamp)

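# A minimal driver sketch for the four steps above. step1_cal..step4_cal read
# module-level globals (event, eventid, timestamp, initializing, now_hour), so
# a runner only refreshes those and calls the steps in order once per hour.
# run_pipeline_once and its details are illustrative assumptions, not code
# from the original module; event/eventid/initializing are set up elsewhere.
import time

def run_pipeline_once(ts):
    global timestamp, now_hour
    timestamp = ts
    # hour of day of this run; now_hour == 0 triggers the daily re-matching
    # in step1 and the cluster merge in step4
    now_hour = time.localtime(ts).tm_hour
    step1_cal()   # classify new texts against existing subevent feature words
    step2_cal()   # split the "other" cluster with kmeans when it grows too big
    step3_cal()   # refresh feature words, text weights, dedup and cluster sizes
    step4_cal()   # at hour 0, merge clusters whose feature words overlap
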
def subject_weibo2news(item):
    import re
    news = dict()
    for field in NULL_FIELDS:
        news[field] = None
    for k, v in item.iteritems():
        if k == 'timestamp':
            news['timestamp'] = v
            news['date'] = ts2date(v)
            news['datetime'] = ts2datetime(v)
        if k == '_id':
            news['_id'] = v
            news['id'] = v
        if k == 'reposts_count':
            news['replies'] = v
        if k == 'comments_count':
            news['same_news_num'] = v
        if k == 'name':
            news['news_author'] = v
            news['user_name'] = v
        if k == 'user':
            news['user_id'] = v
        if k == 'text':
            text = v
            if isinstance(text, str):
                text = text.decode('utf-8')
            # the title is the 【...】 segment of the weibo text; match on the
            # unicode text directly (str(text) breaks on non-ascii) and guard
            # against texts without such a segment
            m = re.search(u'【(.*?)】', text)
            if m:
                news['title'] = u'【%s】' % m.group(1)
                news['content168'] = text.replace(news['title'], u'')
            else:
                news['title'] = u''
                news['content168'] = text
        if k == 'weibourl':
            news['showurl'] = v
    return news

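# A quick, self-contained check of the title extraction above (the sample
# text is invented; assumes the usual # -*- coding: utf-8 -*- declaration):
import re

sample = u'【快讯】某地今日发生某事'
m = re.search(u'【(.*?)】', sample)
print m.group(1) if m else 'no title'   # -> 快讯
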
def object_weibo2comment(item):
    comment = dict()
    for field in NULL_FIELDS:
        comment[field] = None
    comment['news_id'] = 'weibo'
    for k, v in item.iteritems():
        if k == 'timestamp':
            comment['timestamp'] = v
            comment['date'] = ts2date(v)
            comment['datetime'] = ts2datetime(v)
        if k == '_id':
            comment['_id'] = v
            comment['id'] = v
        if k == 'reposts_count':
            comment[k] = v
        if k == 'comments_count':
            comment[k] = v
        if k == 'attitudes_count':
            comment[k] = v
        if k == 'name':
            comment['user_name'] = v
        if k == 'weibourl':
            comment['comment_source'] = v
        if k == 'text':
            comment['content168'] = v
    return comment

def action_additem(self, user, timestamp, itemid, info):
    from item.models import Item
    i = Item.get(itemid)
    if i:
        if user not in i.users:
            i.add_user(user)
            i.update()
    else:
        i = Item.get_or_create(itemid, info)
        i.modified = ts2datetime(timestamp)
        if user not in i.users:
            i.add_user(user)
        i.update()
    user.clear_smart_list_cache()
    return True

def transform(item):
    result = dict()
    result['reposts_count'] = item['reposts_count']
    result['user_comment_url'] = item['weibourl']
    result['comment_source'] = item['weibourl']
    result['first_in'] = None
    result['last_modify'] = None
    result['timestamp'] = item['timestamp']
    result['content168'] = item['text']
    result['datetime'] = ts2datetime(item['timestamp'])
    result['news_id'] = 'weibo'
    result['attitudes_count'] = item['attitudes_count']
    result['news_content'] = None
    result['comments_count'] = item['comments_count']
    result['location'] = item['geo']
    result['date'] = ts2date(item['timestamp'])
    result['_id'] = item['_id']
    result['id'] = item['_id']
    result['user_name'] = item['name']
    return result

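# Example use of transform() on a minimal weibo record (all field values are
# invented; ts2date/ts2datetime are the module's own helpers):
item = {
    '_id': 'mid123', 'name': 'some_user', 'text': u'一条微博',
    'weibourl': 'http://weibo.com/123', 'timestamp': 1400000000,
    'reposts_count': 3, 'comments_count': 1, 'attitudes_count': 0,
    'geo': None,
}
comment = transform(item)
print comment['id'], comment['news_id'], comment['user_name']   # mid123 weibo some_user
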
"source_website": "zhengyi_search", } count = mongo.boatcol.find(query_dict).count() print "zhengyi news %s count: " % sort_field, count results = mongo.boatcol.find(query_dict).sort(sort_field, pymongo.DESCENDING) results = [r for r in results] for r in results: r['title'] = r['title'].encode('utf-8') r['content'] = r['summary'].encode('utf-8') fw = csv.writer(open(result_path + 'zhengyi_news_sort_%s_%s_%s.csv' % (sort_field, START_DATETIME, END_DATETIME), 'wb'), delimiter='^') fw.writerow(ZHENGYI_NEWS_KEYS) for r in results: csvrow = [] for key in ZHENGYI_NEWS_KEYS: csvrow.append(_encode_utf8(r[key])) fw.writerow(csvrow) if __name__ == "__main__": print "recommend begins..." sheqi_rec(sort_field='hot') sheqi_rec(sort_field='timestamp') sheqi_rec(sort_field='sensi') zhengyi_rec(sort_field='timestamp') print "[%s] recommend ends..." % ts2datetime(int(time.time()))
    flag = '1'
    test(inputs, flag)
    label = choose_ad(flag)
    return label


def _encode_utf8(us):
    if isinstance(us, unicode):
        us = us.encode('utf-8')
    if not us:
        us = ''
    return us


print "[%s] rubbish filter begins..." % ts2datetime(int(time.time()))
mongo = _default_mongo()
module_keywords = get_module_keywords()
for bankuai, lanmu, source, source_en, keywords_file in module_keywords:
    query_dict = {
        "timestamp": {
            "$gte": START_TS,
            "$lt": END_TS
        },
        "keywords_hit": True
    }
    if source_en == "weibo_api_search_spider":
    return us


def get_keywords(file_name):
    f = open('../source/' + file_name)
    hit_keywords = []
    for line in f:
        # each line looks like "(w1 w2 | w3 w4)": groups separated by " | ",
        # words within a group separated by spaces
        ks = line.strip().strip('(').strip(')').split(' | ')
        ks_list = [word.split(' ') for word in ks]
        for k in ks_list:
            hit_keywords.append(k)
    f.close()
    return hit_keywords


print "[%s] post filter begins..." % ts2datetime(int(time.time()))
mongo = _default_mongo()
module_keywords = get_module_keywords()
for bankuai, lanmu, source, source_en, keywords_file in module_keywords:
    query_dict = {
        "timestamp": {
            "$gte": START_TS,
            "$lt": END_TS
        }
    }
    hit_keywords = get_keywords(keywords_file)
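# A quick check of the keyword-line parsing in get_keywords (the sample line
# is invented; real lines come from the files under ../source/):
line = '(深圳 砍人 | 广州 爆炸)'
groups = [w.split(' ') for w in line.strip().strip('(').strip(')').split(' | ')]
for g in groups:
    print ' + '.join(g)    # prints "深圳 + 砍人" then "广州 + 爆炸"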